In [3]:
from pandas import DataFrame,Series
import pandas as pd

### Series
类似一维数组，由数据和索引组成，索引可省略

In [3]:
# No index is supplied, so pandas generates a default integer index.
obj = Series(data=['Allen', 'Bill', 'Didi', 'Cash'])
obj

0    Allen
1     Bill
2     Didi
3     Cash
dtype: object

In [4]:
# Pair each value with an explicit label instead of the default integer index.
obj2 = Series(
    data=['Allen', 'Bill', 'Didi', 'Cash'],
    index=list('ABDC'),
)
obj2

A    Allen
B     Bill
D     Didi
C     Cash
dtype: object

In [5]:
# Look up a single value by its index label.
obj2['A']

'Allen'

In [6]:
# The indexer may be a single label or a list of labels; either way exactly
# one object goes inside the brackets. A list of labels returns a sub-Series.
obj2[['A','B']]

A    Allen
B     Bill
dtype: object

In [7]:
# A Series can be viewed as a dict: a mapping from index labels to values,
# so `in` tests whether a label is present in the index.
'A' in obj2

True

In [8]:
# Because a Series behaves like a mapping, it can be built directly from a dict:
# the dict keys become the index labels.
# NOTE(review): the recorded output lists the keys alphabetically; newer pandas
# releases are expected to keep the dict's insertion order instead -- confirm
# against the installed version.
data = dict(id='yikayiyo', sex='♂', age='Ha?')
obj3 = Series(data)
obj3

age         Ha?
id     yikayiyo
sex           ♂
dtype: object

In [9]:
# With an explicit index, values are looked up by label in the dict; labels
# that have no matching key (here 'tel') are filled with NaN.
data = dict(id='yikayiyo', sex='♂', age='Ha?')
obj4 = Series(data, index='id age sex tel'.split())
obj4

id     yikayiyo
age         Ha?
sex           ♂
tel         NaN
dtype: object

In [10]:
# isnull() and notnull() are opposites: each returns a boolean Series
# flagging the missing (respectively present) entries.
# obj4.notnull()
obj4.isnull()

id     False
age    False
sex    False
tel     True
dtype: bool

In [11]:
# Addition aligns the two Series on their index labels. The string values are
# concatenated, and 'tel' (missing from obj3, NaN in obj4) comes out as NaN.
obj3+obj4

age              Ha?Ha?
id     yikayiyoyikayiyo
sex                  ♂♂
tel                 NaN
dtype: object

In [12]:
# Both the Series itself and its index can carry a name; the names appear in
# the printed representation.
obj4.name = 'infos'
obj4.index.name='info'
obj4

info
id     yikayiyo
age         Ha?
sex           ♂
tel         NaN
Name: infos, dtype: object

In [13]:
# The whole index can be replaced by assignment (this mutates `obj` in place).
obj.index=['AA','BB','DD','CC']
obj

AA    Allen
BB     Bill
DD     Didi
CC     Cash
dtype: object

### DataFrame 
表格型的数据结构。
可以通过传入由等长列表或 NumPy 数组组成的字典来创建。

In [14]:
# Build a DataFrame from a dict of equal-length lists: each key becomes a column.
data = dict(
    name=['A', 'B', 'C'],
    age=[29, 27, 18],
    sex=['male', 'female', 'male'],
)
frame = DataFrame(data)
frame

Unnamed: 0,age,name,sex
0,29,A,male
1,27,B,female
2,18,C,male


In [15]:
# `columns` fixes the column order of the resulting DataFrame.
frame = DataFrame(data,columns=['name','age','sex'])
frame

Unnamed: 0,name,age,sex
0,A,29,male
1,B,27,female
2,C,18,male


In [16]:
# Columns requested but absent from the data (here 'tel') are filled with NaN.
# An `index` with more or fewer labels than there are rows raises an error.
frame2 = DataFrame(data,columns=['name','age','sex','tel'],index=['one','two','three'])
frame2

Unnamed: 0,name,age,sex,tel
one,A,29,male,
two,B,27,female,
three,C,18,male,


In [17]:
# Attribute-style access returns the column as a Series.
frame2.sex

one        male
two      female
three      male
Name: sex, dtype: object

In [18]:
# Alternative spelling: dictionary-style access to the same column.
frame2['sex']

one        male
two      female
three      male
Name: sex, dtype: object

In [19]:
# Return a row by its index label.
# `.ix` is no longer recommended; use `.loc` for label-based selection.
frame2.loc['one']

name       A
age       29
sex     male
tel      NaN
Name: one, dtype: object

In [20]:
# Fill the all-NaN 'tel' column: the Series is aligned to frame2 by index label.
val=Series({'one':'0001','two':'0002','three':'0003'})
# Alternative way to build the Series:
# val=Series(data=['001','002','003'],index=['one','two','three'])
# NOTE(review): attribute-style assignment relies on 'tel' already being a
# column; `frame2['tel'] = val` is the safer spelling -- confirm intent.
frame2.tel = val
frame2

Unnamed: 0,name,age,sex,tel
one,A,29,male,1
two,B,27,female,2
three,C,18,male,3


In [21]:
# Create a new boolean column with dictionary-style assignment.
frame2['address']=frame2.sex=='male'
# This spelling does NOT create a column: frame2.address=Series(data=['ss','s','sss'],index=['one','two','three'])
frame2

Unnamed: 0,name,age,sex,tel,address
one,A,29,male,1,True
two,B,27,female,2,False
three,C,18,male,3,True


In [22]:
# Invalid: a column cannot be dropped with attribute-style `del`; it raises
# AttributeError. The error is caught and printed so that the demonstration
# stays visible without aborting a "Restart & Run All".
try:
    del frame2.address
except AttributeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

AttributeError: address

In [23]:
# The 'address' column is still there: the attribute-style delete had no effect.
frame2

Unnamed: 0,name,age,sex,tel,address
one,A,29,male,1,True
two,B,27,female,2,False
three,C,18,male,3,True


In [24]:
# Dictionary-style `del` is the spelling that actually removes the column.
del frame2['address']

In [25]:
# Confirm that 'address' is gone.
frame2

Unnamed: 0,name,age,sex,tel
one,A,29,male,1
two,B,27,female,2
three,C,18,male,3


-----------------------------------------------
传入嵌套字典
外层字典的键作为列，内层字典的键作为行索引

In [27]:
# Nested dict: outer keys become the columns, inner keys become the row index.
pop = dict(
    name=dict(one='A', two='B'),
    age=dict(one='22', two='33'),
    sex=dict(one='male', two='female'),
    tel=dict(one='0001', two='0002'),
)
frame3 = DataFrame(pop)
frame3

Unnamed: 0,age,name,sex,tel
one,22,A,male,1
two,33,B,female,2


In [30]:
# Specify the row index explicitly; a label that is absent from the inner
# dicts (here 'three') yields a row of missing values.
frame4 = DataFrame(pop,index=['one','three','two'])
frame4

Unnamed: 0,age,name,sex,tel
one,22.0,A,male,1.0
three,,,,
two,33.0,B,female,2.0


In [35]:
# The row index and the column index can each be given a name; here the
# columns axis name is set to an empty string.
# frame4.index.name=''
frame4.columns.name=''
frame4

Unnamed: 0,age,name,sex,tel
,,,,
one,22.0,A,male,1.0
three,,,,
two,33.0,B,female,2.0


### 索引对象

In [42]:
# Build a small labelled Series and keep a handle on its Index object.
obj_index = Series(data=range(3), index=list('abc'))
index = obj_index.index
index

Index(['a', 'b', 'c'], dtype='object')

In [43]:
# Individual labels can be read from an Index by position.
index[1]

'b'

In [44]:
# An Index cannot be modified element by element: item assignment raises
# TypeError. The error is caught and printed so that the demonstration stays
# visible without aborting a "Restart & Run All".
try:
    index[1] = 'change'
except TypeError as exc:
    print('{}: {}'.format(type(exc).__name__, exc))

TypeError: Index does not support mutable operations

In [5]:
import numpy as np

# Construct an Index directly from a NumPy integer range.
newindex = pd.Index(data=np.arange(3))
newindex

Int64Index([0, 1, 2], dtype='int64')

In [48]:
# Reuse the prebuilt Index object as the index of a new Series.
obj_index_2 = Series([1.5,2.5,0],index=newindex)
obj_index_2

0    1.5
1    2.5
2    0.0
dtype: float64

### 缺失数据处理


In [46]:
import numpy as np
from pandas import *
randn = np.random.randn

df_hm = DataFrame(randn(8,4),index=['I','II','III','IV','VI','VII','VIII','X'],columns=['A','B','C','D'])
df_hm['E'] = 'Dummy'
df_hm['F'] = df_hm['A'] >0.5
df_hm2 = df_hm.reindex(['I','II','III','IV','V','VI','VII','VIII','IX','X'])
isnull(df_hm2['A'])
# df_hm2['D'].notnull()
df_hm2

Unnamed: 0,A,B,C,D,E,F
I,0.160363,1.028128,1.300255,-0.771766,Dummy,False
II,-0.722873,-0.07896,-0.339029,-0.382753,Dummy,False
III,0.170717,0.734794,-0.361745,2.243551,Dummy,False
IV,-0.844764,0.21167,-1.649014,0.472738,Dummy,False
V,,,,,,
VI,0.196998,0.556493,0.115658,0.418898,Dummy,False
VII,-0.082816,-1.043093,1.37233,-0.896512,Dummy,False
VIII,0.766681,2.721388,-1.034899,2.293946,Dummy,True
IX,,,,,,
X,0.43777,0.450306,-1.634612,1.944781,Dummy,False


In [47]:

# Replace the missing entries of column D with the string 'missing' and write
# the result back into df_hm2 (the column then mixes numbers and strings).
df_hm2['D'] = df_hm2['D'].fillna('missing')
df_hm2

Unnamed: 0,A,B,C,D,E,F
I,0.160363,1.028128,1.300255,-0.771766,Dummy,False
II,-0.722873,-0.07896,-0.339029,-0.382753,Dummy,False
III,0.170717,0.734794,-0.361745,2.24355,Dummy,False
IV,-0.844764,0.21167,-1.649014,0.472738,Dummy,False
V,,,,missing,,
VI,0.196998,0.556493,0.115658,0.418898,Dummy,False
VII,-0.082816,-1.043093,1.37233,-0.896512,Dummy,False
VIII,0.766681,2.721388,-1.034899,2.29395,Dummy,True
IX,,,,missing,,
X,0.43777,0.450306,-1.634612,1.94478,Dummy,False


In [48]:
# Fill every remaining missing value with 0. The result is only displayed,
# not assigned, so df_hm2 itself keeps its NaNs.
df_hm2.fillna(0)

Unnamed: 0,A,B,C,D,E,F
I,0.160363,1.028128,1.300255,-0.771766,Dummy,False
II,-0.722873,-0.07896,-0.339029,-0.382753,Dummy,False
III,0.170717,0.734794,-0.361745,2.24355,Dummy,False
IV,-0.844764,0.21167,-1.649014,0.472738,Dummy,False
V,0.0,0.0,0.0,missing,0,0
VI,0.196998,0.556493,0.115658,0.418898,Dummy,False
VII,-0.082816,-1.043093,1.37233,-0.896512,Dummy,False
VIII,0.766681,2.721388,-1.034899,2.29395,Dummy,True
IX,0.0,0.0,0.0,missing,0,0
X,0.43777,0.450306,-1.634612,1.94478,Dummy,False


In [50]:
# Drop the rows (axis=0) that contain missing values: 'V' and 'IX' disappear
# because their other columns are still NaN even though D was filled.
df_hm2.dropna(axis=0)

Unnamed: 0,A,B,C,D,E,F
I,0.160363,1.028128,1.300255,-0.771766,Dummy,False
II,-0.722873,-0.07896,-0.339029,-0.382753,Dummy,False
III,0.170717,0.734794,-0.361745,2.24355,Dummy,False
IV,-0.844764,0.21167,-1.649014,0.472738,Dummy,False
VI,0.196998,0.556493,0.115658,0.418898,Dummy,False
VII,-0.082816,-1.043093,1.37233,-0.896512,Dummy,False
VIII,0.766681,2.721388,-1.034899,2.29395,Dummy,True
X,0.43777,0.450306,-1.634612,1.94478,Dummy,False


In [45]:
# Work on a copy so df_hm is left untouched.
df_hm3 = df_hm.copy()
# Add a constant datetime column (Timestamp comes from the pandas star import).
df_hm3['Timestamp'] = Timestamp('20170922')
# Blank out columns 'Timestamp' and 'A' for three rows; in the datetime
# column the missing value is rendered as NaT.
df_hm3.loc[['I','III','VIII'],['Timestamp','A']] = np.nan
df_hm3

Unnamed: 0,A,B,C,D,E,F,Timestamp
I,,-0.145166,-0.188829,0.063155,Dummy,False,NaT
II,-1.953258,-0.075553,-1.333696,0.03454,Dummy,False,2017-09-22
III,,-0.61976,0.39195,0.382776,Dummy,True,NaT
IV,0.002028,-0.755443,0.309343,-1.960081,Dummy,False,2017-09-22
VI,0.804434,-1.076166,-0.415865,0.63463,Dummy,True,2017-09-22
VII,0.289184,0.349391,-0.735722,-1.668379,Dummy,False,2017-09-22
VIII,,-1.998716,-0.395886,1.667791,Dummy,False,NaT
X,0.117628,-0.595285,-0.233889,-0.230118,Dummy,False,2017-09-22


In [55]:
# 30 random values; `randn` is the np.random.randn alias defined in an earlier cell.
ts = Series(randn(30))
# count() returns the number of non-missing values. This first call is not
# the last expression of the cell, so its result is not displayed.
ts.count()
# Assigning None to positions 10..29 marks those entries as missing.
ts[10:30] = None
# Only the first 10 values remain non-missing.
ts.count()

10

In [59]:
# Fill the missing values by linear interpolation; afterwards all 30 entries
# are counted as non-missing.
# NOTE(review): the gap here is at the tail of the series, so presumably it is
# filled by carrying the last valid value forward -- confirm in the pandas docs.
ts.interpolate().count()

30