# Pandas数据操作

In [1]:
import numpy as np
import pandas as pd

* Series索引

In [2]:
# Build a five-element Series holding 0..4 under the labels 'a'..'e' and preview it.
ser_obj = pd.Series(range(5), index=list('abcde'))
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int32


In [3]:
# Row indexing: by label, then by integer position.
print(ser_obj['a'])
# Use .iloc for positions: plain ser_obj[0] on a string-labelled Series relies on a
# deprecated positional fallback of [] and is a label lookup in newer pandas.
print(ser_obj.iloc[0])

0
0


In [4]:
# Slice indexing.
# An integer slice is positional and excludes the stop position; a label slice
# includes its end label.
print(ser_obj[1:3], ser_obj['b':'d'], sep='\n')

b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32


In [5]:
# Non-contiguous selection: pick several elements at once.
# Positions go through .iloc; a bare list of integers in [] on a string-labelled
# Series depends on the deprecated positional fallback.
print(ser_obj.iloc[[0, 2, 4]])
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32


In [6]:
# Boolean indexing: compare to get a True/False mask, then filter with it.
ser_bool = ser_obj.gt(2)
print(ser_bool, ser_obj[ser_bool], sep='\n')

# The comparison can also be written inline as the selector.
print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32


* DataFrame索引

In [7]:
# Build a 5x4 frame of standard-normal samples with columns 'a'..'d' and preview it.
# numpy comes from the notebook's top import cell rather than being imported here.
df_obj = pd.DataFrame(np.random.randn(5, 4), columns=['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0 -0.595692  0.813699 -0.551327 -0.059703
1  0.339194 -2.335579  0.230472 -0.680213
2 -0.252306  0.212406 -0.979523  0.408522
3  0.216677  0.574524 -0.819607  2.170009
4 -1.099175 -0.665488  0.391421 -0.400642


In [8]:
# Column indexing
print('列索引')
print(df_obj['a'])  # a single label returns a Series
# The columns are labelled 'a'..'d', so integer positions must go through .iloc;
# df_obj[[0]] would be looked up as a label and fail. A list selector keeps the
# result a DataFrame.
print(type(df_obj.iloc[:, [0]]))

# Non-contiguous selection: by label, then by position
print('不连续索引')
print(df_obj[['a', 'c']])
print(df_obj.iloc[:, [1, 3]])

列索引
0   -0.595692
1    0.339194
2   -0.252306
3    0.216677
4   -1.099175
Name: a, dtype: float64
<class 'pandas.core.frame.DataFrame'>
不连续索引
          a         c
0 -0.595692 -0.551327
1  0.339194  0.230472
2 -0.252306 -0.979523
3  0.216677 -0.819607
4 -1.099175  0.391421
          b         d
0  0.813699 -0.059703
1 -2.335579 -0.680213
2  0.212406  0.408522
3  0.574524  2.170009
4 -0.665488 -0.400642


* 三种索引方式

In [9]:
# Label-based indexing with .loc
# Series: [] with a label slice and .loc with the same slice select the same rows.
print(ser_obj['b':'d'], ser_obj.loc['b':'d'], sep='\n')

# DataFrame: a whole column, then the rows labelled 0 through 2 (inclusive) of column 'a'.
print(df_obj['a'], df_obj.loc[0:2, 'a'], sep='\n')

b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32
0   -0.595692
1    0.339194
2   -0.252306
3    0.216677
4   -1.099175
Name: a, dtype: float64
0   -0.595692
1    0.339194
2   -0.252306
Name: a, dtype: float64


In [10]:
# Integer-position indexing with .iloc
# Series: an integer slice in [] selects by position, the same as .iloc.
print(ser_obj[1:3], ser_obj.iloc[1:3], sep='\n')

# DataFrame: positions 0 and 1 of the first column. The stop is excluded, unlike
# df_obj.loc[0:2, 'a'], which also returns the row labelled 2.
print(df_obj.iloc[0:2, 0])

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0   -0.595692
1    0.339194
Name: a, dtype: float64


In [11]:
# Mixed indexing. The .ix indexer (label lookup first, then position) was removed
# in pandas 1.0, so each lookup is written with the explicit indexer it resolved to.
# Series: the index holds strings, so the integer slice is positional.
print(ser_obj.iloc[1:3])
print(ser_obj.loc['b':'c'])

# DataFrame: integer row labels 0..2 (inclusive); the first column picked by position.
print(df_obj.loc[0:2, df_obj.columns[0]])

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0   -0.595692
1    0.339194
2   -0.252306
Name: a, dtype: float64


* 运算与对齐

In [12]:
# Two integer Series of different lengths: s1 is labelled 0..9, s2 is labelled 0..4.
s1 = pd.Series(range(10, 20), index=range(10))
s2 = pd.Series(range(20, 25), index=range(5))

# Show both, separated by a blank line.
print('s1: ', s1, '', 's2: ', s2, sep='\n')

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32


In [13]:
# Series arithmetic aligns on index labels; a label missing from either operand
# yields NaN in the result.
s1.add(s2)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [14]:
# Two frames of ones with different shapes: 2x2 (columns a, b) and 3x3 (columns a, b, c).
# numpy comes from the notebook's top import cell; it is not re-imported here.
df1 = pd.DataFrame(np.ones((2, 2)), columns=['a', 'b'])
df2 = pd.DataFrame(np.ones((3, 3)), columns=['a', 'b', 'c'])

print('df1: ')
print(df1)

print('')
print('df2: ')
print(df2)

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [15]:
# DataFrame arithmetic aligns on both row and column labels; cells that exist in
# only one of the frames come out as NaN.
df1.add(df2)

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [16]:
# Arithmetic with a fill value for unaligned labels.
print(s1, s2, sep='\n')

# Labels present only in s1 (5..9) use -1 in place of the missing s2 value.
s1.add(s2, fill_value=-1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [17]:
# Subtract df2 from df1; a cell missing from df1 is taken as 2.0 before subtracting.
df1.sub(df2, fill_value=2.0)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [18]:
# Filling NaN: first produce a result that contains NaN for the unaligned labels.
s3 = s1.add(s2)
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [19]:
# fillna returns a new Series with every NaN replaced by -1; s3 itself is unchanged.
s3_filled = s3.fillna(value=-1)
print(s3_filled)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [20]:
# Add the two frames; row 2 and column c exist only in df2, so they come out as NaN.
df3 = df1.add(df2)
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [21]:
# Replace every NaN in df3 with 100, modifying df3 itself, then show the result.
df3.fillna(value=100, inplace=True)
print(df3)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


* 函数应用

In [22]:
# NumPy ufuncs work element-wise on a DataFrame and return a DataFrame.
# Standard-normal samples shifted down by 1, shown next to their absolute values.
df = pd.DataFrame(np.random.randn(5, 4) - 1)
print(df, np.abs(df), sep='\n')

          0         1         2         3
0 -2.193022 -2.090432 -2.288651 -0.026022
1 -0.720957 -1.501025 -1.734828 -1.858286
2  0.300216 -3.391127 -0.872570 -0.686669
3 -2.552131 -1.452268 -1.188845 -0.597845
4  2.111044 -1.203676 -1.143487 -0.542755
          0         1         2         3
0  2.193022  2.090432  2.288651  0.026022
1  0.720957  1.501025  1.734828  1.858286
2  0.300216  3.391127  0.872570  0.686669
3  2.552131  1.452268  1.188845  0.597845
4  2.111044  1.203676  1.143487  0.542755


In [23]:
# apply passes each column (the default axis) to the function and collects the results.
print(df.apply(lambda col: col.max(), axis=0))

0    2.111044
1   -1.203676
2   -0.872570
3   -0.026022
dtype: float64


In [24]:
# Choose the axis: here the function receives each row instead of each column.
print(df.apply(lambda row: row.max(), axis='columns'))

0   -0.026022
1   -0.720957
2    0.300216
3   -0.597845
4    2.111044
dtype: float64


In [25]:
# Apply a function to every individual value of the frame.
def f2(x):
    """Format one value as a string with two decimal places."""
    return '%.2f' % x


# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old name
# is deprecated; use whichever this pandas version provides.
elementwise = df.map if hasattr(df, 'map') else df.applymap
print(elementwise(f2))

       0      1      2      3
0  -2.19  -2.09  -2.29  -0.03
1  -0.72  -1.50  -1.73  -1.86
2   0.30  -3.39  -0.87  -0.69
3  -2.55  -1.45  -1.19  -0.60
4   2.11  -1.20  -1.14  -0.54


* 排序

In [26]:
# Values 10..14 under five index labels drawn at random from 0..4, so labels may
# repeat and arrive unsorted.
s4 = pd.Series(range(10, 15), index=np.random.randint(0, 5, size=5))
print(s4)

4    10
3    11
1    12
4    13
4    14
dtype: int32


In [27]:
# Sort by index label in ascending order; returns a new Series and leaves s4 as is.
s4.sort_index(ascending=True)

1    12
3    11
4    10
4    13
4    14
dtype: int32

In [28]:
# A 3x4 frame of standard-normal samples whose row labels (from 0..2) and column
# labels (from 0..3) are drawn at random, so both may repeat and be unsorted.
df4 = pd.DataFrame(
    np.random.randn(3, 4),
    index=np.random.randint(0, 3, size=3),
    columns=np.random.randint(0, 4, size=4),
)
print(df4)

          3         2         2         1
2  0.244068 -1.977220  0.045238 -2.064546
2  0.218196 -0.419284 -0.698839  0.241649
2  0.296747 -0.021311  0.225724 -0.325439


In [29]:
# Sort the columns by label. (df4.sort_index(ascending=False) would instead sort
# the rows by label in descending order.)
df4.sort_index(axis='columns')

Unnamed: 0,1,2,2.1,3
2,-2.064546,-1.97722,0.045238,0.244068
2,0.241649,-0.419284,-0.698839,0.218196
2,-0.325439,-0.021311,0.225724,0.296747


In [30]:
# Sort rows by the values of one column.
# df4's column labels are random draws, so label 1 may be missing or repeated, and
# df4.sort_values(by=1) then raises. Address the key column by position instead:
# the first column labelled 1 if there is one, otherwise the last column.
label_hits = [pos for pos, label in enumerate(df4.columns) if label == 1]
key_pos = label_hits[0] if label_hits else -1
# Row labels can repeat as well, so sort the key under positional row labels and
# reorder the frame by the resulting positions.
sort_key = df4.iloc[:, key_pos].reset_index(drop=True)
df4.iloc[sort_key.sort_values().index.tolist()]

Unnamed: 0,3,2,2.1,1
2,0.244068,-1.97722,0.045238,-2.064546
2,0.296747,-0.021311,0.225724,-0.325439
2,0.218196,-0.419284,-0.698839,0.241649


* 处理缺失数据

In [31]:
# A 4x3 frame for the missing-data examples: one fully populated random row
# followed by three rows that contain NaN.
df_data = pd.DataFrame([
    np.random.randn(3),
    [1., np.nan, np.nan],
    [4., np.nan, np.nan],
    [1., np.nan, 2.],
])
df_data.head(n=5)

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [32]:
# isnull / isna (two names for the same method): True wherever a value is missing.
df_data.isna()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [33]:
# dropna: keep only the rows that contain no NaN.
# (df_data.dropna(axis=1) would drop the columns containing NaN instead.)
df_data.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003


In [34]:
# fillna: return a copy of the frame with every NaN replaced by -100.0.
df_data.fillna(value=-100.0)

Unnamed: 0,0,1,2
0,1.619463,0.548047,-1.027003
1,1.0,-100.0,-100.0
2,4.0,-100.0,-100.0
3,1.0,-100.0,2.0
