## Pandas 库

Numpy库在处理向量化数值计算时性能优异，

但是在处理更灵活、更复杂的数据时显得力不从心。

例如在为数据添加标签、处理缺失值、分组以及构建数据透视表等方面。

基于Numpy构建的Pandas库，提供了使得数据分析更快、更简单高效的工具。


In [1]:
import pandas as pd 

In [2]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"])

a    1
b    2
c    3
d    4
dtype: object

In [3]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"], dtype=float)

a    1.0
b    2.0
c    3.0
d    4.0
dtype: float64

In [4]:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [5]:
population_dict = {"Beijing": 2154, "Shanghai":2454, "Shenzhen":2017, "Chengdu":2019}
population_dict

{'Beijing': 2154, 'Shanghai': 2454, 'Shenzhen': 2017, 'Chengdu': 2019}

In [6]:
population_ser= pd.Series(population_dict)
population_ser

Beijing     2154
Shanghai    2454
Shenzhen    2017
Chengdu     2019
dtype: int64

In [7]:
pd.DataFrame({"population": population_ser})

Unnamed: 0,population
Beijing,2154
Shanghai,2454
Shenzhen,2017
Chengdu,2019


In [8]:
gdp_dict={"Beijing":465780, "Shanghai":48654, "Chengdu":32189, "Shenzhen":39865}
gdp_dict

{'Beijing': 465780, 'Shanghai': 48654, 'Chengdu': 32189, 'Shenzhen': 39865}

In [9]:
gdp_ser = pd.Series(gdp_dict)
gdp_ser

Beijing     465780
Shanghai     48654
Chengdu      32189
Shenzhen     39865
dtype: int64

In [10]:
df1 = pd.DataFrame({"population":population_ser, "gdp": gdp_ser})
df1 

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [11]:
df2 = pd.DataFrame({"population":population_ser, "gdp": gdp_ser, "country":"China"})
df2

Unnamed: 0,population,gdp,country
Beijing,2154,465780,China
Chengdu,2019,32189,China
Shanghai,2454,48654,China
Shenzhen,2017,39865,China


In [12]:
df1.values

array([[  2154, 465780],
       [  2019,  32189],
       [  2454,  48654],
       [  2017,  39865]], dtype=int64)

In [13]:
df1.index

Index(['Beijing', 'Chengdu', 'Shanghai', 'Shenzhen'], dtype='object')

In [14]:
df1.columns

Index(['population', 'gdp'], dtype='object')

In [15]:
df1.dtypes

population    int64
gdp           int64
dtype: object

In [16]:
df1.size

8

In [17]:
df1.shape

(4, 2)

**获取列**

In [18]:
df1[["population", "gdp"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [19]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

**获取行**

In [20]:
df1.loc['Beijing']

population      2154
gdp           465780
Name: Beijing, dtype: int64

In [21]:
df1.loc[["Beijing", "Chengdu"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189


In [22]:
df1.iloc[[0,1,2,3]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [23]:
df1.loc['Chengdu',"population"]

2019

In [24]:
df1.iloc[1,0]

2019

In [25]:
df1.values[1][0]

2019

In [26]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

In [27]:
df1.gdp['Chengdu']

32189

**切片**

In [28]:
dates = pd.date_range("2021-09-01", periods=7)
dates

DatetimeIndex(['2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04',
               '2021-09-05', '2021-09-06', '2021-09-07'],
              dtype='datetime64[ns]', freq='D')

In [29]:
import numpy as np

df = pd.DataFrame(np.random.randn(7,4), index = dates, columns = ["A", "B", "C", "D"])
df 

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127
2021-09-07,1.314032,0.61258,-1.367673,-1.330548


In [30]:
df['2021-09-01':'2021-09-03']

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695


In [31]:
df.loc["2021-09-01":"2021-09-03",:]

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695


In [32]:
df.iloc[0:3,:]

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695


In [33]:
df.loc[:, "A":"C"]

Unnamed: 0,A,B,C
2021-09-01,1.180321,0.596487,-1.556691
2021-09-02,0.358413,0.097602,-0.25643
2021-09-03,0.823817,-0.479739,0.794905
2021-09-04,0.185277,-0.010888,-0.778074
2021-09-05,-0.115247,0.780321,-0.985932
2021-09-06,-0.671191,-0.908506,-0.767999
2021-09-07,1.314032,0.61258,-1.367673


In [34]:
df.iloc[:,0:3]

Unnamed: 0,A,B,C
2021-09-01,1.180321,0.596487,-1.556691
2021-09-02,0.358413,0.097602,-0.25643
2021-09-03,0.823817,-0.479739,0.794905
2021-09-04,0.185277,-0.010888,-0.778074
2021-09-05,-0.115247,0.780321,-0.985932
2021-09-06,-0.671191,-0.908506,-0.767999
2021-09-07,1.314032,0.61258,-1.367673


In [35]:
df.loc["2021-09-04":"2021-09-07", ["A","D"]]

Unnamed: 0,A,D
2021-09-04,0.185277,-0.450014
2021-09-05,-0.115247,-2.043655
2021-09-06,-0.671191,-1.808127
2021-09-07,1.314032,-1.330548


In [36]:
df

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127
2021-09-07,1.314032,0.61258,-1.367673,-1.330548


In [37]:
df>0

Unnamed: 0,A,B,C,D
2021-09-01,True,True,False,False
2021-09-02,True,True,False,True
2021-09-03,True,False,True,True
2021-09-04,True,False,False,False
2021-09-05,False,True,False,False
2021-09-06,False,False,False,False
2021-09-07,True,True,False,False


In [38]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,,
2021-09-02,0.358413,0.097602,,0.183887
2021-09-03,0.823817,,0.794905,0.868695
2021-09-04,0.185277,,,
2021-09-05,,0.780321,,
2021-09-06,,,,
2021-09-07,1.314032,0.61258,,


In [39]:
df.A > 0

2021-09-01     True
2021-09-02     True
2021-09-03     True
2021-09-04     True
2021-09-05    False
2021-09-06    False
2021-09-07     True
Freq: D, Name: A, dtype: bool

In [40]:
df[df.A > 0]    #常用

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014
2021-09-07,1.314032,0.61258,-1.367673,-1.330548


In [41]:
df2 = df.copy()
df2['E'] = ["one", "two","one", "two","three", "three", "four"]
df2

Unnamed: 0,A,B,C,D,E
2021-09-01,1.180321,0.596487,-1.556691,-1.369338,one
2021-09-02,0.358413,0.097602,-0.25643,0.183887,two
2021-09-03,0.823817,-0.479739,0.794905,0.868695,one
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014,two
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655,three
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127,three
2021-09-07,1.314032,0.61258,-1.367673,-1.330548,four


In [42]:
ind = df2['E'].isin(['two',"three"])
ind 

2021-09-01    False
2021-09-02     True
2021-09-03    False
2021-09-04     True
2021-09-05     True
2021-09-06     True
2021-09-07    False
Freq: D, Name: E, dtype: bool

In [43]:
df2[ind]

Unnamed: 0,A,B,C,D,E
2021-09-02,0.358413,0.097602,-0.25643,0.183887,two
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014,two
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655,three
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127,three


**增加列**

In [44]:
df 

Unnamed: 0,A,B,C,D
2021-09-01,1.180321,0.596487,-1.556691,-1.369338
2021-09-02,0.358413,0.097602,-0.25643,0.183887
2021-09-03,0.823817,-0.479739,0.794905,0.868695
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127
2021-09-07,1.314032,0.61258,-1.367673,-1.330548


In [45]:
ser = pd.Series(np.arange(1,8), index=pd.date_range("2021-09-01", periods=7))
ser

2021-09-01    1
2021-09-02    2
2021-09-03    3
2021-09-04    4
2021-09-05    5
2021-09-06    6
2021-09-07    7
Freq: D, dtype: int32

In [46]:
df['E'] = ser
df 

Unnamed: 0,A,B,C,D,E
2021-09-01,1.180321,0.596487,-1.556691,-1.369338,1
2021-09-02,0.358413,0.097602,-0.25643,0.183887,2
2021-09-03,0.823817,-0.479739,0.794905,0.868695,3
2021-09-04,0.185277,-0.010888,-0.778074,-0.450014,4
2021-09-05,-0.115247,0.780321,-0.985932,-2.043655,5
2021-09-06,-0.671191,-0.908506,-0.767999,-1.808127,6
2021-09-07,1.314032,0.61258,-1.367673,-1.330548,7


**修改列**

In [47]:
df['D'] = np.array([5] * len(df))
df 

Unnamed: 0,A,B,C,D,E
2021-09-01,1.180321,0.596487,-1.556691,5,1
2021-09-02,0.358413,0.097602,-0.25643,5,2
2021-09-03,0.823817,-0.479739,0.794905,5,3
2021-09-04,0.185277,-0.010888,-0.778074,5,4
2021-09-05,-0.115247,0.780321,-0.985932,5,5
2021-09-06,-0.671191,-0.908506,-0.767999,5,6
2021-09-07,1.314032,0.61258,-1.367673,5,7


In [48]:
# Replace the date index with plain positional labels 0..n-1
df2.index = list(range(len(df2)))
df2

Unnamed: 0,A,B,C,D,E
0,1.180321,0.596487,-1.556691,-1.369338,one
1,0.358413,0.097602,-0.25643,0.183887,two
2,0.823817,-0.479739,0.794905,0.868695,one
3,0.185277,-0.010888,-0.778074,-0.450014,two
4,-0.115247,0.780321,-0.985932,-2.043655,three
5,-0.671191,-0.908506,-0.767999,-1.808127,three
6,1.314032,0.61258,-1.367673,-1.330548,four


In [49]:
# Relabel the columns with their positions 0..k-1
df2.columns = list(range(len(df2.columns)))
df2

Unnamed: 0,0,1,2,3,4
0,1.180321,0.596487,-1.556691,-1.369338,one
1,0.358413,0.097602,-0.25643,0.183887,two
2,0.823817,-0.479739,0.794905,0.868695,one
3,0.185277,-0.010888,-0.778074,-0.450014,two
4,-0.115247,0.780321,-0.985932,-2.043655,three
5,-0.671191,-0.908506,-0.767999,-1.808127,three
6,1.314032,0.61258,-1.367673,-1.330548,four


**数值运算及统计分析**

**数据的查看**

In [50]:
dates = pd.date_range("2021-09-01", periods=7)
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns = ["A", "B", "C", "D"])
df 

Unnamed: 0,A,B,C,D
2021-09-01,1.318841,1.202397,-0.047778,1.242356
2021-09-02,1.793207,-0.884195,1.014005,-1.453543
2021-09-03,0.595173,1.013125,-0.695444,0.06116
2021-09-04,0.074408,0.381177,0.837214,2.464736
2021-09-05,0.063832,0.311642,0.09154,-0.580485
2021-09-06,0.05405,0.806723,2.271444,0.860333
2021-09-07,-0.562715,-1.490288,-1.607665,-0.884811


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2021-09-01 to 2021-09-07
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       7 non-null      float64
 1   B       7 non-null      float64
 2   C       7 non-null      float64
 3   D       7 non-null      float64
dtypes: float64(4)
memory usage: 280.0 bytes


**一般来说，纯粹的计算在Numpy里执行更快**

**Numpy更加侧重于计算，Pandas侧重于数据处理**

广播运算


In [52]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(1,10,size=(3,3)), columns = list("ABC"))
df 

Unnamed: 0,A,B,C
0,6,1,4
1,4,8,4
2,6,3,5


按行广播

In [53]:
df.iloc[0]

A    6
B    1
C    4
Name: 0, dtype: int32

In [54]:
df / df.iloc[0]

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.666667,8.0,1.0
2,1.0,3.0,1.25


按列广播

In [55]:
df.A

0    6
1    4
2    6
Name: A, dtype: int32

In [56]:
df.div(df.A, axis=0)    #按列

Unnamed: 0,A,B,C
0,1.0,0.166667,0.666667
1,1.0,2.0,1.0
2,1.0,0.5,0.833333


In [57]:
df.div(df.iloc[0], axis=1)    #按行

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.666667,8.0,1.0
2,1.0,3.0,1.25


**Pandas新用法**

**索引对齐**


In [58]:
A = pd.DataFrame(np.random.randint(1,11, size=(2,2)), columns = list("AB"))
A 

Unnamed: 0,A,B
0,8,7
1,9,9


In [59]:
B = pd.DataFrame(np.random.randint(1,11, size=(3,3)), columns = list("ABC"))
B

Unnamed: 0,A,B,C
0,2,7,8
1,8,9,2
2,6,10,9


Pandas会自动对齐两个对象的索引，缺失的位置用 np.nan（NaN）填充

In [60]:
A+B

Unnamed: 0,A,B,C
0,10.0,14.0,
1,17.0,18.0,
2,,,


缺失值可以用fill_value填充 

In [61]:
A.add(B, fill_value = 0)

Unnamed: 0,A,B,C
0,10.0,14.0,8.0
1,17.0,18.0,2.0
2,6.0,10.0,9.0


In [62]:
B.add(A, fill_value = 0)

Unnamed: 0,A,B,C
0,10.0,14.0,8.0
1,17.0,18.0,2.0
2,6.0,10.0,9.0


**统计相关**

**数据种类统计**

In [63]:
arr = np.random.randint(3, size=10)
arr

array([1, 0, 0, 1, 2, 0, 2, 0, 1, 1])

In [64]:
np.unique(arr)

array([0, 1, 2])

In [65]:
from collections import Counter
Counter(arr)

Counter({1: 4, 0: 4, 2: 2})

In [66]:
df = pd.DataFrame(arr, columns = ["A"])
df 

Unnamed: 0,A
0,1
1,0
2,0
3,1
4,2
5,0
6,2
7,0
8,1
9,1


In [67]:
np.unique(df)

array([0, 1, 2])

In [68]:
df["A"].value_counts()    #更好的办法

1    4
0    4
2    2
Name: A, dtype: int64

产生新的结果，并进行排序

In [69]:
population_dict = {"Beijing": 2154, "Shanghai":2454, "Shenzhen":2017, "Chengdu":2019}
#population_dict

population_ser= pd.Series(population_dict)
#population_ser

gdp_dict={"Beijing":465780, "Shanghai":48654, "Chengdu":32189, "Shenzhen":39865}
#gdp_dict
gdp_ser = pd.Series(gdp_dict)
#gdp_ser

city_info = pd.DataFrame({"population":population_ser, "gdp": gdp_ser})
city_info

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [70]:
city_info["gdp_per_capita"] = city_info['gdp'] / city_info['population']
city_info

Unnamed: 0,population,gdp,gdp_per_capita
Beijing,2154,465780,216.239554
Chengdu,2019,32189,15.943041
Shanghai,2454,48654,19.826406
Shenzhen,2017,39865,19.764502


增序排序

In [71]:
city_info.sort_values(by = "gdp_per_capita")

Unnamed: 0,population,gdp,gdp_per_capita
Chengdu,2019,32189,15.943041
Shenzhen,2017,39865,19.764502
Shanghai,2454,48654,19.826406
Beijing,2154,465780,216.239554


降序排序

In [72]:
city_info.sort_values(by = "gdp_per_capita", ascending= False)

Unnamed: 0,population,gdp,gdp_per_capita
Beijing,2154,465780,216.239554
Shanghai,2454,48654,19.826406
Shenzhen,2017,39865,19.764502
Chengdu,2019,32189,15.943041


按索引进行排序

In [73]:
data = pd.DataFrame(np.random.randint(20, size=(3,3)), index=[2,1,0], columns=list("CBA"))
data

Unnamed: 0,C,B,A
2,19,19,14
1,7,0,1
0,9,0,10


In [74]:
# Sort the rows by their index labels
data.sort_index()

Unnamed: 0,C,B,A
0,9,0,10
1,7,0,1
2,19,19,14


In [75]:
# Sort the columns by their labels (axis=1), in descending order
data.sort_index(axis=1,ascending= False)

Unnamed: 0,C,B,A
2,19,19,14
1,7,0,1
0,9,0,10


In [76]:
data.count()    # 非空个数， 0不是空

C    3
B    3
A    3
dtype: int64

In [77]:
# The first column holds integers. Cast it to float explicitly before storing
# NaN, instead of relying on pandas' implicit upcasting during item assignment
# (deprecated in recent pandas releases).
first_col = data.columns[0]
data[first_col] = data[first_col].astype(float)
# Use np.nan: the np.NaN alias was removed in NumPy 2.0.
data.iloc[0, 0] = np.nan
data.count()    # the NaN cell is no longer counted for the first column

C    2
B    3
A    3
dtype: int64

求和


In [78]:
data

Unnamed: 0,C,B,A
2,,19,14
1,7.0,0,1
0,9.0,0,10


In [79]:
data.sum()    #默认按列求和

C    16.0
B    19.0
A    25.0
dtype: float64

In [80]:
data.sum(axis=1)    #按行求和

2    33.0
1     8.0
0    19.0
dtype: float64

In [81]:
data.max()

C     9.0
B    19.0
A    14.0
dtype: float64

In [82]:
data.min(axis=1)

2    14.0
1     0.0
0     0.0
dtype: float64

In [83]:
data.idxmax()    #最大值所在的index

C    0
B    2
A    2
dtype: int64

In [84]:
data.mean()

C    8.000000
B    6.333333
A    8.333333
dtype: float64

In [85]:
data.std()

C     1.414214
B    10.969655
A     6.658328
dtype: float64

In [86]:
data.var()

C      2.000000
B    120.333333
A     44.333333
dtype: float64

In [87]:
data.median()

C     8.0
B     0.0
A    10.0
dtype: float64

In [88]:
data.mode()

Unnamed: 0,C,B,A
0,7.0,0.0,1
1,9.0,,10
2,,,14


In [89]:
data.describe()

Unnamed: 0,C,B,A
count,2.0,3.0,3.0
mean,8.0,6.333333,8.333333
std,1.414214,10.969655,6.658328
min,7.0,0.0,1.0
25%,7.5,0.0,5.5
50%,8.0,0.0,10.0
75%,8.5,9.5,12.0
max,9.0,19.0,14.0


相关性系数和协方差

In [90]:
data.corr()

Unnamed: 0,C,B,A
C,1.0,,1.0
B,,1.0,0.737043
A,1.0,0.737043,1.0


In [91]:
data.corrwith(data["A"])

C    1.000000
B    0.737043
A    1.000000
dtype: float64

apply()方法

apply(method)的用法：使用method方法默认对每一列进行相应的操作

In [92]:
np.random.seed(0)

df = pd.DataFrame(np.random.randint(1,11, size=(3,3)), columns = list("CBA"))
df 

Unnamed: 0,C,B,A
0,6,1,4
1,4,8,10
2,4,6,3


In [93]:
df.apply(np.cumsum)

Unnamed: 0,C,B,A
0,6,1,4
1,10,9,14
2,14,15,17


In [94]:
df.apply(np.cumsum, axis=1)

Unnamed: 0,C,B,A
0,6,7,11
1,4,12,22
2,4,10,13


In [95]:
df.apply(sum)

C    14
B    15
A    17
dtype: int64

In [96]:
df.apply(lambda x: x.max() - x.min())

C    2
B    7
A    7
dtype: int64

In [102]:
def my_describe(df):
    """Collect the extremes, their positions and the spread of ``df``.

    Parameters
    ----------
    df : object exposing ``max``, ``idxmax``, ``min``, ``idxmin`` and ``std``
        When this function is passed to ``DataFrame.apply`` (as done right
        after the definition), each call receives one column as a Series,
        despite the parameter name.

    Returns
    -------
    pandas.Series
        Five entries labelled ``max``, ``max_idx``, ``min``, ``min_idx`` and
        ``std``, in that order.
    """
    # Build label -> value pairs first so each statistic sits next to its name.
    stats = {
        'max': df.max(),
        "max_idx": df.idxmax(),
        "min": df.min(),
        "min_idx": df.idxmin(),
        "std": df.std(),
    }
    return pd.Series(list(stats.values()), index=list(stats.keys()))

df.apply(my_describe)

Unnamed: 0,C,B,A
max,6.0,8.0,10.0
max_idx,0.0,1.0,1.0
min,4.0,1.0,3.0
min_idx,1.0,0.0,2.0
std,1.154701,3.605551,3.785939


**缺失值处理**

- 发现缺失值


In [106]:
data = pd.DataFrame(np.array([[1, np.nan, 2],
                             [np.nan, 4, 5],
                             [4,7, None]]), columns = list("ABC"))
data

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,4.0,5.0
2,4.0,7.0,


**注意：** 含有None、字符串等时，数据类型会全部变为object，它比int、float更消耗资源

In [107]:
data.dtypes

A    object
B    object
C    object
dtype: object

In [109]:
data.isnull()

Unnamed: 0,A,B,C
0,False,True,False
1,True,False,False
2,False,False,True


In [110]:
data.notnull()

Unnamed: 0,A,B,C
0,True,False,True
1,False,True,True
2,True,True,False


- 删除缺失值


In [114]:
data= pd.DataFrame(np.array([[1, np.nan, 2,3],
                            [np.nan, 2,4,3],
                            [7,8,5, np.nan],
                            [4,8,9,10]]), columns = ["A", "B", "C", "D"])
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,3.0
1,,2.0,4.0,3.0
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,10.0


In [113]:
data.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [116]:
# Drop every row that contains a missing value (no axis given, so rows are dropped)
data.dropna()

Unnamed: 0,A,B,C,D
3,4.0,8.0,9.0,10.0


In [117]:
# Drop every column that contains a missing value
data.dropna(axis = "columns")

Unnamed: 0,C
0,2.0
1,4.0
2,5.0
3,9.0


In [119]:
data['D'] = np.nan
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,


In [120]:
# Drop a column only when every value in it is missing
data.dropna(axis="columns", how="all")    

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,2.0,4.0
2,7.0,8.0,5.0
3,4.0,8.0,9.0


In [122]:
# Drop a column when it contains at least one missing value
data.dropna(axis="columns", how="any")  

Unnamed: 0,C
0,2.0
1,4.0
2,5.0
3,9.0


In [123]:
data 

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,


In [125]:
data.loc[3]=np.nan
data.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,


- 填充缺失值

In [127]:
data.fillna(value=999)

Unnamed: 0,A,B,C,D
0,1.0,999.0,2.0,999.0
1,999.0,2.0,4.0,999.0
2,7.0,8.0,5.0,999.0
3,999.0,999.0,999.0,999.0


In [128]:
data.loc[3] = [12,13,14,15]

- 用均值进行填充

In [137]:
data 

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,12.0,13.0,14.0,15.0


In [141]:
# axis=0 aggregates down the rows (along the index), so the result holds one mean per column
fill = data.mean(axis=0)    
fill

A     6.666667
B     7.666667
C     6.250000
D    15.000000
dtype: float64

In [143]:
# With skipna=False, a row (or column) that contains NaN produces NaN instead of a mean
fill2 = data.mean(axis=1, skipna=False)    
fill2

0     NaN
1     NaN
2     NaN
3    13.5
dtype: float64

In [145]:
data.fillna(value=fill)

Unnamed: 0,A,B,C,D
0,1.0,7.666667,2.0,15.0
1,6.666667,2.0,4.0,15.0
2,7.0,8.0,5.0,15.0
3,12.0,13.0,14.0,15.0


In [149]:
# Flatten the table into a single Series with stack() and take one overall mean of its values
fill = data.stack().mean()
fill

7.545454545454546

In [150]:
data.stack()

0  A     1.0
   C     2.0
1  B     2.0
   C     4.0
2  A     7.0
   B     8.0
   C     5.0
3  A    12.0
   B    13.0
   C    14.0
   D    15.0
dtype: float64

**合并数据**

- 构造一个生成DataFrame的函数

In [151]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Parameters
    ----------
    cols : iterable
        Column labels; iterating a string such as ``"ABC"`` yields one column
        per character.
    ind : iterable
        Row labels; iterated once per column and also used as the index.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``ind`` where the cell in column ``c`` and row ``i``
        is the string ``str(c) + str(i)``.

    The intermediate column dictionary is printed before the frame is built.
    """
    data = {}
    for col in cols:
        # Each cell is the column label immediately followed by the row label.
        data[col] = [f"{col!s}{row!s}" for row in ind]
    print(data)
    return pd.DataFrame(data, index=ind)

make_df("ABC", range(3))

{'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2'], 'C': ['C0', 'C1', 'C2']}


Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


- 垂直合并

In [155]:
df_1 = make_df("AB", [1,3])
df_2 = make_df("AB", [2,5])
print(df_1)
print(df_2)

{'A': ['A1', 'A3'], 'B': ['B1', 'B3']}
{'A': ['A2', 'A5'], 'B': ['B2', 'B5']}
    A   B
1  A1  B1
3  A3  B3
    A   B
2  A2  B2
5  A5  B5


In [156]:
pd.concat([df_1, df_2])

Unnamed: 0,A,B
1,A1,B1
3,A3,B3
2,A2,B2
5,A5,B5


- 水平合并

In [157]:
df_1 = make_df("AB", [0,1])
df_2 = make_df("CD", [0,1])
print(df_1)
print(df_2)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'C': ['C0', 'C1'], 'D': ['D0', 'D1']}
    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1


In [158]:
pd.concat([df_1, df_2], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


- 行标签冲突的处理

In [159]:
df_5 = make_df("AB", [0,1])
df_6 = make_df("AB", [0,1])
pd.concat([df_5, df_6], ignore_index=True)    #重新排行标签

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}


Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


**列重叠**

In [165]:
df_7 = make_df("ABC", [1,2])
df_8 = make_df("BCD", [3,4])
print(df_7)
print(df_8)

{'A': ['A1', 'A2'], 'B': ['B1', 'B2'], 'C': ['C1', 'C2']}
{'B': ['B3', 'B4'], 'C': ['C3', 'C4'], 'D': ['D3', 'D4']}
    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4


In [167]:
pd.concat([df_7, df_8])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [168]:
pd.concat([df_7, df_8], sort=False)

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [169]:
pd.concat([df_7, df_8],axis=1)

Unnamed: 0,A,B,C,B.1,C.1,D
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,B3,C3,D3
4,,,,B4,C4,D4


In [170]:
pd.concat([df_7, df_8],axis=1, ignore_index=True)    #重复的列名重新排

Unnamed: 0,0,1,2,3,4,5
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,B3,C3,D3
4,,,,B4,C4,D4


- 对齐合并merge()

In [161]:
df_9 = make_df("AB", [0,1])
df_a = make_df("BC", [0,1])
print(df_9)
print(df_a)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'B': ['B0', 'B1'], 'C': ['C0', 'C1']}
    A   B
0  A0  B0
1  A1  B1
    B   C
0  B0  C0
1  B1  C1


In [162]:
pd.merge(df_9, df_a)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1


In [163]:
df_9 = make_df("AB", [0,1])
df_a = make_df("BC", [1,0])
print(df_9)
print(df_a)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'B': ['B1', 'B0'], 'C': ['C1', 'C0']}
    A   B
0  A0  B0
1  A1  B1
    B   C
1  B1  C1
0  B0  C0


In [164]:
pd.merge(df_9, df_a)    #自动的关联相应的C0,C1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1


【例】 合并城市信息

In [175]:
population_dict = {"city": ["Beijing", "Chengdu", "Shanghai"],
                  "population": [2154, 2016, 2278]}
population = pd.DataFrame(population_dict)
population 

Unnamed: 0,city,population
0,Beijing,2154
1,Chengdu,2016
2,Shanghai,2278


In [176]:
gdp_dict = {"city": ["Beijing", "Shanghai", "Shenzhen"],
            "gdp": [30320, 32680, 13468]}
gdp = pd.DataFrame(gdp_dict)
gdp 

Unnamed: 0,city,gdp
0,Beijing,30320
1,Shanghai,32680
2,Shenzhen,13468


In [177]:
city_info = pd.merge(population, gdp)    # 交集
city_info

Unnamed: 0,city,population,gdp
0,Beijing,2154,30320
1,Shanghai,2278,32680


In [178]:
city_info = pd.merge(population, gdp, how="outer")    # 并集
city_info

Unnamed: 0,city,population,gdp
0,Beijing,2154.0,30320.0
1,Chengdu,2016.0,
2,Shanghai,2278.0,32680.0
3,Shenzhen,,13468.0


**分组和数据透视表**

In [180]:
df = pd.DataFrame({"key": ["A", "B", "C", "C", "B", "A"],
                  "data1": range(6),
                  "data2": np.random.randint(10, size=6)})
df 

Unnamed: 0,key,data1,data2
0,A,0,6
1,B,1,7
2,C,2,7
3,C,3,8
4,B,4,1
5,A,5,5


- 分组

延迟计算

In [181]:
df.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000002626EDA5880>

In [182]:
df.groupby("key").sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,11
B,5,8
C,5,15


In [183]:
df.groupby("key").mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.5,5.5
B,2.5,4.0
C,2.5,7.5


In [184]:
for i in df.groupby("key"):
    print(i)

('A',   key  data1  data2
0   A      0      6
5   A      5      5)
('B',   key  data1  data2
1   B      1      7
4   B      4      1)
('C',   key  data1  data2
2   C      2      7
3   C      3      8)


- 按列取值

In [186]:
df.groupby("key")["data2"].sum()

key
A    11
B     8
C    15
Name: data2, dtype: int32