In [1]:
import numpy as np
import pandas as pd

## 1. DataFrame的基本元素Series
#### Series类似于np.array[]，是DataFrame的基本组成单位

In [4]:
from pandas import Series, DataFrame

In [5]:
data = {
    'Country': ['China', 'India', 'Brazil'],
    'Capital': ['Beijing', 'New Delhi', 'Brasilia'],
    'Population': ['1432732201', '1303171635', '207847528']
}

In [12]:

s1 = Series(data['Country'])
s2 = Series(data['Capital'])
s3 = Series(data['Population'])

In [14]:
df = DataFrame(data)
df.head()

Unnamed: 0,Country,Capital,Population
0,China,Beijing,1432732201
1,India,New Delhi,1303171635
2,Brazil,Brasilia,207847528


In [15]:
#遍历行
for row in df.iterrows():
    print(row[0], row[1])
    break

0 Country            China
Capital          Beijing
Population    1432732201
Name: 0, dtype: object


In [18]:
#通过Series创建DataFrame，是按行来显示，需要通过转置后才能按列显示
df_new = DataFrame([s1,s2,s3], index=['Country','Capital', 'Population'])
df_new

Unnamed: 0,0,1,2
Country,China,India,Brazil
Capital,Beijing,New Delhi,Brasilia
Population,1432732201,1303171635,207847528


In [19]:
df_new.T

Unnamed: 0,Country,Capital,Population
0,China,Beijing,1432732201
1,India,New Delhi,1303171635
2,Brazil,Brasilia,207847528


## 2. pandas 常用数据结构 DataFrame

In [20]:
import webbrowser

In [21]:
link = 'http://pandas.pydata.org/pandas-docs/version/0.20/io.html'
webbrowser.open(link)

True

### 创建一个简单的DataFrame

In [63]:
# create a dataframe
df1 = DataFrame({"城市":["北京","上海","广州"], 
                 "人口":[1000,2000,1500]}, index=['A','B','C'])
#增加一列数据
df1['GDP'] = Series([1000,2000,1500], index=['A','B','C'])
df1

Unnamed: 0,城市,人口,GDP
A,北京,1000,1000
B,上海,2000,2000
C,广州,1500,1500


### 用剪贴板中的数据来创建DataFrame

In [28]:
df1 = pd.read_clipboard()
df1

Unnamed: 0,Format,Type,Data,Description,Reader,Writer
0,text,CSV,read_csv,to_csv,,
1,text,JSON,read_json,to_json,,
2,text,HTML,read_html,to_html,,
3,text,Local,clipboard,read_clipboard,to_clipboard,


### 将DataFrame数据输出成csv文件

In [39]:
df1.to_csv('df1.csv', index=False)
df1.to_json()
df1.to_html('df1.html')
df1.to_excel('df1.xlsx')
#从文件中加载数据
pd.read_csv('df1.csv')
pd.read_excel('df1.xlsx')

Unnamed: 0.1,Unnamed: 0,Format,Type,Data,Description,Reader,Writer
0,0,text,CSV,read_csv,to_csv,,
1,1,text,JSON,read_json,to_json,,
2,2,text,HTML,read_html,to_html,,
3,3,text,Local,clipboard,read_clipboard,to_clipboard,


## 3. Series和DataFrame的索引

### reindex Series

In [41]:
s1 = Series([1,2,3,4], index=['A','B','C','D'])
s1

A    1
B    2
C    3
D    4
dtype: int64

In [42]:
#如果索引对应无数据，则用10填充
s1.reindex(index=['A','B','C','D','E'], fill_value=10)

A     1
B     2
C     3
D     4
E    10
dtype: int64

In [44]:
#重新调整索引后，使用ffill函数来填充
s2 = Series(['A','B','C'], index=[1,5,10])
s2.reindex(index=range(15),method='ffill')

0     NaN
1       A
2       A
3       A
4       A
5       B
6       B
7       B
8       B
9       B
10      C
11      C
12      C
13      C
14      C
dtype: object

In [51]:
#减少索引后，其他索引将会被删除
s1.reindex(index=['A','B'])

A    1
B    2
dtype: int64

In [52]:
#删除索引也可以用drop方法
s1.drop('A')

B    2
C    3
D    4
dtype: int64

### reindex dataframe

In [46]:
df1 = DataFrame(np.random.rand(25).reshape([5,5]), index=['A','B','D','E','F'],  columns=['c1','c2','c3','c4','c5'])
df1

Unnamed: 0,c1,c2,c3,c4,c5
A,0.835658,0.950824,0.625893,0.606894,0.289259
B,0.332767,0.944092,0.005445,0.324085,0.242119
D,0.516839,0.871585,0.855273,0.262905,0.859788
E,0.166639,0.494014,0.100407,0.618591,0.512618
F,0.431318,0.036715,0.237629,0.970702,0.600384


In [47]:
#重新调整索引后，如果增加了新索引，对应行被填充为NaN
df1.reindex(index=['A','B','C','D','E','F'])

Unnamed: 0,c1,c2,c3,c4,c5
A,0.835658,0.950824,0.625893,0.606894,0.289259
B,0.332767,0.944092,0.005445,0.324085,0.242119
C,,,,,
D,0.516839,0.871585,0.855273,0.262905,0.859788
E,0.166639,0.494014,0.100407,0.618591,0.512618
F,0.431318,0.036715,0.237629,0.970702,0.600384


In [48]:
#重新调整标签后，如果增加了新标签，对应列被填充为NaN
df1.reindex(columns=['c1','c2','c3','c4','c5','c6'])

Unnamed: 0,c1,c2,c3,c4,c5,c6
A,0.835658,0.950824,0.625893,0.606894,0.289259,
B,0.332767,0.944092,0.005445,0.324085,0.242119,
D,0.516839,0.871585,0.855273,0.262905,0.859788,
E,0.166639,0.494014,0.100407,0.618591,0.512618,
F,0.431318,0.036715,0.237629,0.970702,0.600384,


In [50]:
#减少索引后，其他索引将会被删除
df1.reindex(index=['A','B'])

Unnamed: 0,c1,c2,c3,c4,c5
A,0.835658,0.950824,0.625893,0.606894,0.289259
B,0.332767,0.944092,0.005445,0.324085,0.242119


In [53]:
#删除索引也可以用drop方法，当axis=0时删除行，axis=1时删除列
df1.drop('c1', axis=1)

Unnamed: 0,c2,c3,c4,c5
A,0.950824,0.625893,0.606894,0.289259
B,0.944092,0.005445,0.324085,0.242119
D,0.871585,0.855273,0.262905,0.859788
E,0.494014,0.100407,0.618591,0.512618
F,0.036715,0.237629,0.970702,0.600384


## 4. 切片（iloc、loc）

In [82]:
movies = pd.read_csv("movie_metadata.csv")
movies.head(2)

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0


In [100]:
sub_movies = movies[['color','director_name','country']]
sub_movies.head(1)

Unnamed: 0,color,director_name,country
0,Color,James Cameron,USA


In [117]:
sub_movies_slice = sub_movies.iloc[10:200,:3]
sub_movies_slice.head(2)

Unnamed: 0,color,director_name,country
10,Color,Zack Snyder,USA
11,Color,Bryan Singer,USA


In [111]:
#loc中第二个参数是标签名称，可以是标签范围
#X.loc[startRow:endRow,‘标签1’:'标签N']
sub_movies_slice.loc[5:12,'director_name':]

Unnamed: 0,director_name,country
10,Zack Snyder,USA
11,Bryan Singer,USA
12,Marc Forster,UK
