# Pandas数据操作

In [1]:
import pandas as pd

* Series索引

In [2]:
ser_obj = pd.Series(range(5), index = ['a', 'b', 'c', 'd', 'e'])
print(ser_obj.head())

a    0
b    1
c    2
d    3
e    4
dtype: int32


In [3]:
# 行索引
print(ser_obj['a'])
print(ser_obj[0])

0
0


In [4]:
# 切片索引
print(ser_obj[1:3])
print(ser_obj['b':'d'])

b    1
c    2
dtype: int32
b    1
c    2
d    3
dtype: int32


In [5]:
# 不连续索引
print(ser_obj[[0, 2, 4]])
print(ser_obj[['a', 'e']])

a    0
c    2
e    4
dtype: int32
a    0
e    4
dtype: int32


In [6]:
# 布尔索引
ser_bool = ser_obj > 2
print(ser_bool)
print(ser_obj[ser_bool])

print(ser_obj[ser_obj > 2])

a    False
b    False
c    False
d     True
e     True
dtype: bool
d    3
e    4
dtype: int32
d    3
e    4
dtype: int32


* DataFrame索引

In [7]:
import numpy as np

df_obj = pd.DataFrame(np.random.randn(5,4), columns = ['a', 'b', 'c', 'd'])
print(df_obj.head())

          a         b         c         d
0  1.319634  1.179927  0.035954 -1.732345
1 -0.782866 -0.912715 -0.086812  0.809214
2  1.922533  0.321756  0.139710 -0.254937
3 -0.850643 -1.375166  1.097990 -0.395013
4 -0.048961 -1.527386 -0.942182  1.077464


In [8]:
# 列索引
print('列索引')
print(df_obj['a']) # 返回Series类型
print(type(df_obj[[0]])) # 返回DataFrame类型

# 不连续索引
print('不连续索引')
print(df_obj[['a','c']])
print(df_obj[[1, 3]])

列索引
0    1.319634
1   -0.782866
2    1.922533
3   -0.850643
4   -0.048961
Name: a, dtype: float64
<class 'pandas.core.frame.DataFrame'>
不连续索引
          a         c
0  1.319634  0.035954
1 -0.782866 -0.086812
2  1.922533  0.139710
3 -0.850643  1.097990
4 -0.048961 -0.942182
          b         d
0  1.179927 -1.732345
1 -0.912715  0.809214
2  0.321756 -0.254937
3 -1.375166 -0.395013
4 -1.527386  1.077464


* 三种索引方式

In [9]:
# 标签索引 loc
# Series
print(ser_obj['b':'d'])
print(ser_obj.loc['b':'d'])

# DataFrame
print(df_obj['a'])
print(df_obj.loc[0:2, 'a'])

b    1
c    2
d    3
dtype: int32
b    1
c    2
d    3
dtype: int32
0    1.319634
1   -0.782866
2    1.922533
3   -0.850643
4   -0.048961
Name: a, dtype: float64
0    1.319634
1   -0.782866
2    1.922533
Name: a, dtype: float64


In [10]:
# 整型位置索引 iloc
print(ser_obj[1:3])
print(ser_obj.iloc[1:3])

# DataFrame
print(df_obj.iloc[0:2, 0]) # 注意和df_obj.loc[0:2, 'a']的区别

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0    1.319634
1   -0.782866
Name: a, dtype: float64


In [11]:
# 混合索引 ix
print(ser_obj.ix[1:3])
print(ser_obj.ix['b':'c'])

# DataFrame
print(df_obj.ix[0:2, 0]) # 先按标签索引尝试操作，然后再按位置索引尝试操作

b    1
c    2
dtype: int32
b    1
c    2
dtype: int32
0    1.319634
1   -0.782866
2    1.922533
Name: a, dtype: float64


* 运算与对齐

In [12]:
s1 = pd.Series(range(10, 20), index = range(10))
s2 = pd.Series(range(20, 25), index = range(5))

print('s1: ' )
print(s1)

print('') 

print('s2: ')
print(s2)

s1: 
0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32

s2: 
0    20
1    21
2    22
3    23
4    24
dtype: int32


In [14]:
# Series 对齐运算
s1 + s2

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64

In [17]:
import numpy as np

df1 = pd.DataFrame(np.ones((2,2)), columns = ['a', 'b'])
df2 = pd.DataFrame(np.ones((3,3)), columns = ['a', 'b', 'c'])

print('df1: ')
print(df1)

print('') 
print('df2: ')
print(df2)

df1: 
     a    b
0  1.0  1.0
1  1.0  1.0

df2: 
     a    b    c
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0


In [18]:
# DataFrame对齐操作
df1 + df2

Unnamed: 0,a,b,c
0,2.0,2.0,
1,2.0,2.0,
2,,,


In [19]:
# 填充未对齐的数据进行运算
print(s1)
print(s2)

s1.add(s2, fill_value = -1)

0    10
1    11
2    12
3    13
4    14
5    15
6    16
7    17
8    18
9    19
dtype: int32
0    20
1    21
2    22
3    23
4    24
dtype: int32


0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    14.0
6    15.0
7    16.0
8    17.0
9    18.0
dtype: float64

In [20]:
df1.sub(df2, fill_value = 2.)

Unnamed: 0,a,b,c
0,0.0,0.0,1.0
1,0.0,0.0,1.0
2,1.0,1.0,1.0


In [21]:
# 填充NaN
s3 = s1 + s2
print(s3)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5     NaN
6     NaN
7     NaN
8     NaN
9     NaN
dtype: float64


In [22]:
s3_filled = s3.fillna(-1)
print(s3_filled)

0    30.0
1    32.0
2    34.0
3    36.0
4    38.0
5    -1.0
6    -1.0
7    -1.0
8    -1.0
9    -1.0
dtype: float64


In [23]:
df3 = df1 + df2
print(df3)

     a    b   c
0  2.0  2.0 NaN
1  2.0  2.0 NaN
2  NaN  NaN NaN


In [24]:
df3.fillna(100, inplace = True)
print(df3)

       a      b      c
0    2.0    2.0  100.0
1    2.0    2.0  100.0
2  100.0  100.0  100.0


* 函数应用

In [25]:
# Numpy ufunc 函数
df = pd.DataFrame(np.random.randn(5,4) - 1)
print(df)

print(np.abs(df))

          0         1         2         3
0 -0.821704 -1.596440 -1.889291 -0.374076
1 -1.283000 -0.490058 -0.598786 -2.525205
2 -0.627565 -0.304207 -0.090379  0.152559
3 -1.468792 -1.100645 -0.413771 -2.213179
4 -0.329328 -1.995626 -0.845139 -2.523275
          0         1         2         3
0  0.821704  1.596440  1.889291  0.374076
1  1.283000  0.490058  0.598786  2.525205
2  0.627565  0.304207  0.090379  0.152559
3  1.468792  1.100645  0.413771  2.213179
4  0.329328  1.995626  0.845139  2.523275


In [27]:
# 使用apply应用行或列数据
#f = lambda x : x.max()
print(df.apply(lambda x : x.max()))#按列来选最大

0   -0.329328
1   -0.304207
2   -0.090379
3    0.152559
dtype: float64


In [28]:
# 指定轴方向
print(df.apply(lambda x : x.max(), axis=1))

0   -0.374076
1   -0.490058
2    0.152559
3   -0.413771
4   -0.329328
dtype: float64


In [29]:
# 使用applymap应用到每个数据
f2 = lambda x : '%.2f' % x
print(df.applymap(f2))

       0      1      2      3
0  -0.82  -1.60  -1.89  -0.37
1  -1.28  -0.49  -0.60  -2.53
2  -0.63  -0.30  -0.09   0.15
3  -1.47  -1.10  -0.41  -2.21
4  -0.33  -2.00  -0.85  -2.52


* 排序

In [30]:
s4 = pd.Series(range(10, 15), index = np.random.randint(5, size=5))
print(s4)

1    10
1    11
0    12
4    13
0    14
dtype: int32


In [31]:
# 索引排序
s4.sort_index()

0    12
0    14
1    10
1    11
4    13
dtype: int32

In [32]:
df4 = pd.DataFrame(np.random.randn(3, 4), 
                   index=np.random.randint(3, size=3),
                   columns=np.random.randint(4, size=4))
print(df4)

          3         3         3         0
2 -0.398480 -1.311104  0.308048  0.640970
1 -1.410596  0.816201  0.626191 -0.882190
1 -2.609838 -0.826512 -0.588171 -1.039597


In [37]:
#df4.sort_index(ascending=False)
df4.sort_index(axis=1)

Unnamed: 0,0,3,3.1,3.2
2,0.64097,-0.39848,-1.311104,0.308048
1,-0.88219,-1.410596,0.816201,0.626191
1,-1.039597,-2.609838,-0.826512,-0.588171


In [38]:
# 按值排序
df4.sort_values(by=1)

KeyError: 1

* 处理缺失数据

In [40]:
df_data = pd.DataFrame([np.random.randn(3), [1., np.nan, np.nan],
                       [4., np.nan, np.nan], [1., np.nan, 2.]])
df_data.head()

Unnamed: 0,0,1,2
0,-1.578358,1.097546,1.036639
1,1.0,,
2,4.0,,
3,1.0,,2.0


In [41]:
# isnull
df_data.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,True
2,False,True,True
3,False,True,False


In [42]:
# dropna
df_data.dropna()
#df_data.dropna(axis=1)

Unnamed: 0,0,1,2
0,-1.578358,1.097546,1.036639


In [None]:
# fillna
df_data.fillna(-100.)