# pandas 入门

## pandas 数据结构介绍

### Series

In [1]:
# Series是一种一维的数组型对象，它包含一组值以及与之对应的数据标签（即索引）。

In [22]:
import pandas as pd

In [23]:
import numpy as np

In [3]:
# Build a Series from a plain list; no index is given, so pandas assigns the
# default integer index starting at 0.
obj = pd.Series([4, -7, -5, 3])

In [4]:
obj

0    4
1   -7
2   -5
3    3
dtype: int64

In [5]:
# 索引在左边，值在右边。通过values和index属性分别获得值和索引

In [6]:
obj.values

array([ 4, -7, -5,  3], dtype=int64)

In [7]:
obj.index 

RangeIndex(start=0, stop=4, step=1)

In [8]:
# 可以创建一个索引序列，用标签标识数据点。

In [9]:
# Same construction, but with explicit string labels instead of the default
# integer index.
obj2 = pd.Series(
    [4, 7, -5, 3],
    index=['a', 'b', 'c', 'd'],
)

In [10]:
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [11]:
# 与NumPy数组相比，从Series中选取数据时可以使用标签进行索引

In [12]:
obj2['a']

4

In [15]:
obj2['d']=6

In [16]:
obj2[['c','a','d']]

c   -5
a    4
d    6
dtype: int64

In [17]:
# ['c','a','d'] 作为索引列表

In [18]:
# 将numpy利用起来

In [19]:
obj2[obj2>0]

a    4
b    7
d    6
dtype: int64

In [20]:
obj2*2 # 元素级的运算

a     8
b    14
c   -10
d    12
dtype: int64

In [25]:
np.exp(obj2)

a      54.598150
b    1096.633158
c       0.006738
d     403.428793
dtype: float64

In [26]:
#  Series 和字典相结合 

In [28]:
'b' in obj2


True

In [29]:
'e' in obj2

False

In [32]:
# A Series can be built directly from an existing dict: the keys become the
# index labels and the values become the data.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)

In [33]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [34]:
# 当把字典传给Series构造函数时，产生的Series的索引就是字典的键，顺序与字典中键的顺序一致（见上面obj3的输出；旧版pandas会对键排序）。
# 也可以将字典键按想要的顺序通过index参数传给构造函数，从而使生成的Series索引顺序符合预期

In [35]:
states=['California','Ohio','Oregon','Texas']

In [36]:
obj4=pd.Series(sdata,index=states)

In [39]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [43]:
#NaN表示缺失数据，pandas中利用 isnull 和notnull来检查数据缺失

In [44]:
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [38]:
sdata

{'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}

In [45]:
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [46]:
# 同时isnull 和notnull 也是Series的实例方法

In [48]:
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [49]:
# 数学操作的自动对齐索引

In [50]:
obj3

Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64

In [51]:
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [52]:
obj3+obj4

California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64

In [53]:
#Series 对象自身和索引都有name属性

In [54]:
obj4.name='population'

In [55]:
obj4.index.name='state'

In [56]:
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

### DataFrame

In [1]:
# 利用等长度列表或Numpy数组的字典来形成DataFrame

In [2]:
# Dict of equal-length lists; each key is used as a column when this dict is
# passed to pd.DataFrame.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}

In [4]:
import pandas as pd
frame=pd.DataFrame(data)

In [5]:
frame

Unnamed: 0,state,year,pop
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [6]:
# 这里的列对应了字典中的键，并且以键的顺序进行了排列。同样，我们也可以指定列的排序

In [7]:
pd.DataFrame(data,columns=['year','state','pop'])  # 索引等操作较多的使用列表

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [8]:
# 最左边的索引列是DataFrame自动分配的，我们也可以通过index参数指定行索引的标签；当columns中传入的列在数据中不存在时，该列就会出现缺失值

In [11]:
# Request an extra column ('debt') that `data` does not contain and give the
# rows explicit labels; the unknown column shows up as missing values.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)

In [12]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [13]:
#利用键来对Dataframe进行列索引

In [14]:
frame['year']

0    2000
1    2001
2    2002
3    2001
4    2002
5    2003
Name: year, dtype: int64

In [15]:
# 行可以通过位置或特殊属性loc进行选取

In [16]:
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

In [17]:
frame2['debt']=16.5  # 原本是值缺失，整列进行赋值。

In [18]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [21]:
import numpy as np
frame2['debt']=np.arange(6.)

In [22]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


In [25]:
# Partial Series labelled with three of frame2's row labels; it is assigned to
# a column next to show index-aligned assignment.
val = pd.Series(
    [-1.2, -1.5, -1.7],
    index=['two', 'four', 'five'],
)

In [26]:
frame2['debt']=val

In [27]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,-1.2
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,-1.5
five,2002,Nevada,2.9,-1.7
six,2003,Nevada,3.2,


In [44]:
val2=pd.Series([-1.2,-1.5,-1.7,2.1,3.1])

In [45]:
frame2['debt']=val2

In [46]:
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [34]:
val2

0   -1.2
1   -1.5
2   -1.7
dtype: float64

In [35]:
val

two    -1.2
four   -1.5
five   -1.7
dtype: float64

In [47]:
# Series的索引默认从0开始，也可以显式设置索引以满足特定的赋值需求。把Series赋给DataFrame的某一列时，值会按索引标签对齐：val的索引是frame2行标签的子集，所以只有'two'、'four'、'five'三行被赋值，其余为NaN；val2使用默认的整数索引，与frame2的行标签没有任何匹配，所以debt列全部为NaN。


In [48]:
# 增加一列，该列是布尔值，判断条件是state列是否为'Ohio'

In [50]:
# Boolean flag column: True where the row's state is 'Ohio'. The parentheses
# make it obvious that the comparison is evaluated before the assignment.
frame2['isOhio'] = (frame2['state'] == 'Ohio')

In [51]:
frame2

Unnamed: 0,year,state,pop,debt,isOhio
one,2000,Ohio,1.5,,True
two,2001,Ohio,1.7,,True
three,2002,Ohio,3.6,,True
four,2001,Nevada,2.4,,False
five,2002,Nevada,2.9,,False
six,2003,Nevada,3.2,,False


In [53]:
# Then remove the newly created column again with the `del` statement.
del frame2['isOhio']

In [54]:
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

*Dataframe中选取的列是数据的视图，不是拷贝，对Series的修改会映射到Dataframe中，如果需要复制，则应当显式地使用Series的copy方法*

In [76]:
# Another common input shape is a dict of dicts (outer key -> inner key -> value).
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}

In [77]:
frame3=pd.DataFrame(pop)

In [78]:
frame3 

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


*将嵌套字典传给DataFrame时，外层字典的键作为列，内层字典的键作为行索引*

In [81]:
#对Dataframe进行转置
frame3.T

Unnamed: 0,2001,2002,2000
Nevada,2.4,2.9,
Ohio,1.7,3.6,1.5


In [82]:
pd.DataFrame(pop,index=[2001,2002,2003])

Unnamed: 0,Nevada,Ohio
2001,2.4,1.7
2002,2.9,3.6
2003,,


In [83]:
# Build a DataFrame input from a dict of Series. Both slices are positional
# (drop the last row / keep the first two rows). frame3's index holds integer
# year labels, so a bare [] slice is easy to misread as label-based; .iloc
# states the positional intent explicitly.
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}

In [84]:
pd.DataFrame(pdata)

Unnamed: 0,Ohio,Nevada
2001,1.7,2.4
2002,3.6,2.9


In [85]:
frame3.index.name='year'

In [86]:
frame3.columns.name='state'

In [87]:
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [88]:
#Dataframe的values 属性将会包含在Dataframe中的数据以二维数组的形式返回
frame3.values

array([[2.4, 1.7],
       [2.9, 3.6],
       [nan, 1.5]])

In [89]:
#如果DataFrame的各列dtype不同，values的dtype会自动选择适合所有列的类型
frame2.values

array([[2000, 'Ohio', 1.5, nan],
       [2001, 'Ohio', 1.7, nan],
       [2002, 'Ohio', 3.6, nan],
       [2001, 'Nevada', 2.4, nan],
       [2002, 'Nevada', 2.9, nan],
       [2003, 'Nevada', 3.2, nan]], dtype=object)

### 索引对象

In [90]:
# Small labelled Series; its index object is pulled out right after this.
obj = pd.Series(range(3), index=['a', 'b', 'c'])

In [91]:
index=obj.index 

In [92]:
index  #此时的index是索引对象，索引对象是不可变的，无法修改索引对象

Index(['a', 'b', 'c'], dtype='object')

In [93]:
index[1:]

Index(['b', 'c'], dtype='object')

In [94]:
#不变性使得在多种数据结构中分享索引对象更为安全。
labels=pd.Index(np.arange(3))

In [95]:
labels

Int64Index([0, 1, 2], dtype='int64')

In [96]:
obj2=pd.Series([1.5,-2.5,0],index=labels)

In [97]:
obj2

0    1.5
1   -2.5
2    0.0
dtype: float64

In [98]:
obj2.index is labels

True

In [99]:
# 索引对象还类似于一个固定大小的集合
frame3

state,Nevada,Ohio
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2001,2.4,1.7
2002,2.9,3.6
2000,,1.5


In [100]:
frame3.columns

Index(['Nevada', 'Ohio'], dtype='object', name='state')

In [101]:
'Ohio' in frame3.columns

True

In [102]:
2003 in frame3.index

False

In [103]:
'Ohio' in frame3.index

False

In [104]:
frame3.index

Int64Index([2001, 2002, 2000], dtype='int64', name='year')

## 基本功能

### 重建索引

In [2]:
import pandas as pd

# Series whose labels are deliberately not in alphabetical order.
obj = pd.Series(
    [4.5, 7.2, -5.3, 3.6],
    index=['d', 'b', 'a', 'c'],
)

In [3]:
obj

d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64

In [5]:
obj2=obj.reindex(['a','b','c','d','e'])  #reindex 用于创建新索引的新对象，若该索引不存在，那么就会引入缺失值。

In [6]:
obj2

a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64

In [7]:
# reindex有一个可选参数method，允许我们在重建索引时使用ffill等方法进行插值；ffill会将前面的值向前填充到缺失的位置。

In [9]:
# String-valued Series labelled 0, 2 and 4, so labels 1 and 3 are absent.
obj3 = pd.Series(
    ['blue', 'purple', 'yellow'],
    index=[0, 2, 4],
)

In [10]:
obj3

0      blue
2    purple
4    yellow
dtype: object

In [11]:
obj3.reindex(range(6),method='ffill')

0      blue
1      blue
2    purple
3    purple
4    yellow
5    yellow
dtype: object

In [12]:
# 在DataFrame中，reindex可以改变行索引、列索引，也可以同时改变二者；当仅传入一个序列时，会对行进行重建索引。

In [14]:
import numpy as np

# 3x3 integer frame with row labels 'a', 'c', 'd' (no 'b') and three state
# columns.
frame = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=['a', 'c', 'd'],
    columns=['Ohio', 'Texas', 'California'],
)

In [15]:
frame

Unnamed: 0,Ohio,Texas,California
a,0,1,2
c,3,4,5
d,6,7,8


In [16]:
frame2=frame.reindex(['a','b','c','d'])

In [17]:
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [18]:
#列可以通过columns关键字进行重建索引

In [19]:
states=['Texas','Utah','California']

In [20]:
frame.reindex(columns=states)

Unnamed: 0,Texas,Utah,California
a,1,,2
c,4,,5
d,7,,8


In [21]:
#P137 提供reindex参数列表

### 轴向上删除条目

In [22]:
obj=pd.Series(np.arange(5.),index=['a','b','c','d','e'])

In [23]:
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64

In [24]:
new_obj=obj.drop('c')  # drop方法返回一个含有指示值或轴向上删除值的新对象，会产生新对象，而不是在源对象上的操作

In [25]:
new_obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [26]:
obj.drop(['d','c'])

a    0.0
b    1.0
e    4.0
dtype: float64

In [27]:
# drop applied to a DataFrame: start from a 4x4 integer frame with labelled
# rows and columns.
data = pd.DataFrame(
    np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)

In [28]:
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [30]:
#调用drop使用标签序列会根据行标签删除值。
data.drop(['Colorado','Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [31]:
# 通过传递axis=1 或者 axis='columns'来从列中删除值
data.drop(['two','four'],axis='columns')# 先要指定一个删除的序列，然后指示要在列索引这些序列进行删除

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [32]:
# With inplace=True, drop does not hand back a new object here: the 'c' label
# is removed from `obj` itself, as the next cell's output shows.
# NOTE(review): this makes the cell non-idempotent; running it a second time
# presumably raises because 'c' is no longer in the index. Confirm before
# re-running.
obj.drop('c',inplace=True)

In [33]:
obj

a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64