## Pandas 库

Numpy库在处理向量化数值计算时性能优异，

但是在处理更灵活、更复杂的数据时显得力不从心。

例如在为数据添加标签、处理缺失值、分组以及生成透视表等方面。

基于Numpy构建的Pandas库，提供了使得数据分析更快、更简单高效的工具。


In [1]:
import pandas as pd 

In [2]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"])

a    1
b    2
c    3
d    4
dtype: object

In [3]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"], dtype=float)

a    1.0
b    2.0
c    3.0
d    4.0
dtype: float64

In [4]:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [5]:
population_dict = {"Beijing": 2154, "Shanghai":2454, "Shenzhen":2017, "Chengdu":2019}
population_dict

{'Beijing': 2154, 'Shanghai': 2454, 'Shenzhen': 2017, 'Chengdu': 2019}

In [6]:
population_ser= pd.Series(population_dict)
population_ser

Beijing     2154
Shanghai    2454
Shenzhen    2017
Chengdu     2019
dtype: int64

In [7]:
pd.DataFrame({"population": population_ser})

Unnamed: 0,population
Beijing,2154
Shanghai,2454
Shenzhen,2017
Chengdu,2019


In [8]:
gdp_dict={"Beijing":465780, "Shanghai":48654, "Chengdu":32189, "Shenzhen":39865}
gdp_dict

{'Beijing': 465780, 'Shanghai': 48654, 'Chengdu': 32189, 'Shenzhen': 39865}

In [9]:
gdp_ser = pd.Series(gdp_dict)
gdp_ser

Beijing     465780
Shanghai     48654
Chengdu      32189
Shenzhen     39865
dtype: int64

In [10]:
df1 = pd.DataFrame({"population":population_ser, "gdp": gdp_ser})
df1 

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [11]:
df2 = pd.DataFrame({"population":population_ser, "gdp": gdp_ser, "country":"China"})
df2

Unnamed: 0,population,gdp,country
Beijing,2154,465780,China
Chengdu,2019,32189,China
Shanghai,2454,48654,China
Shenzhen,2017,39865,China


In [12]:
df1.values

array([[  2154, 465780],
       [  2019,  32189],
       [  2454,  48654],
       [  2017,  39865]], dtype=int64)

In [13]:
df1.index

Index(['Beijing', 'Chengdu', 'Shanghai', 'Shenzhen'], dtype='object')

In [14]:
df1.columns

Index(['population', 'gdp'], dtype='object')

In [15]:
df1.dtypes

population    int64
gdp           int64
dtype: object

In [16]:
df1.size

8

In [17]:
df1.shape

(4, 2)

**获取列**

In [18]:
df1[["population", "gdp"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [19]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

**获取行**

In [20]:
df1.loc['Beijing']

population      2154
gdp           465780
Name: Beijing, dtype: int64

In [21]:
df1.loc[["Beijing", "Chengdu"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189


In [22]:
df1.iloc[[0,1,2,3]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [23]:
df1.loc['Chengdu',"population"]

2019

In [24]:
df1.iloc[1,0]

2019

In [25]:
df1.values[1][0]

2019

In [26]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

In [27]:
df1.gdp['Chengdu']

32189

**切片**

In [28]:
dates = pd.date_range("2021-09-01", periods=7)
dates

DatetimeIndex(['2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04',
               '2021-09-05', '2021-09-06', '2021-09-07'],
              dtype='datetime64[ns]', freq='D')

In [29]:
import numpy as np

df = pd.DataFrame(np.random.randn(7,4), index = dates, columns = ["A", "B", "C", "D"])
df 

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239
2021-09-04,1.000279,-0.088727,1.708009,0.669216
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146
2021-09-06,1.667857,0.126256,0.251574,-1.169267
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763


In [30]:
df['2021-09-01':'2021-09-03']

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239


In [31]:
df.loc["2021-09-01":"2021-09-03",:]

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239


In [32]:
df.iloc[0:3,:]

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239


In [33]:
df.loc[:, "A":"C"]

Unnamed: 0,A,B,C
2021-09-01,-1.031584,0.286248,-0.781269
2021-09-02,-1.164944,-1.81836,0.551225
2021-09-03,0.154908,-0.206827,0.616561
2021-09-04,1.000279,-0.088727,1.708009
2021-09-05,-0.11669,-0.560096,0.595706
2021-09-06,1.667857,0.126256,0.251574
2021-09-07,0.394512,-1.624111,-1.114556


In [34]:
df.iloc[:,0:3]

Unnamed: 0,A,B,C
2021-09-01,-1.031584,0.286248,-0.781269
2021-09-02,-1.164944,-1.81836,0.551225
2021-09-03,0.154908,-0.206827,0.616561
2021-09-04,1.000279,-0.088727,1.708009
2021-09-05,-0.11669,-0.560096,0.595706
2021-09-06,1.667857,0.126256,0.251574
2021-09-07,0.394512,-1.624111,-1.114556


In [35]:
df.loc["2021-09-04":"2021-09-07", ["A","D"]]

Unnamed: 0,A,D
2021-09-04,1.000279,0.669216
2021-09-05,-0.11669,-1.098146
2021-09-06,1.667857,-1.169267
2021-09-07,0.394512,-0.417763


In [36]:
df

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239
2021-09-04,1.000279,-0.088727,1.708009,0.669216
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146
2021-09-06,1.667857,0.126256,0.251574,-1.169267
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763


In [37]:
df>0

Unnamed: 0,A,B,C,D
2021-09-01,False,True,False,True
2021-09-02,False,False,True,False
2021-09-03,True,False,True,True
2021-09-04,True,False,True,True
2021-09-05,False,False,True,False
2021-09-06,True,True,True,False
2021-09-07,True,False,False,False


In [38]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-09-01,,0.286248,,0.31707
2021-09-02,,,0.551225,
2021-09-03,0.154908,,0.616561,0.598239
2021-09-04,1.000279,,1.708009,0.669216
2021-09-05,,,0.595706,
2021-09-06,1.667857,0.126256,0.251574,
2021-09-07,0.394512,,,


In [39]:
df.A > 0

2021-09-01    False
2021-09-02    False
2021-09-03     True
2021-09-04     True
2021-09-05    False
2021-09-06     True
2021-09-07     True
Freq: D, Name: A, dtype: bool

In [40]:
df[df.A > 0]    #常用

Unnamed: 0,A,B,C,D
2021-09-03,0.154908,-0.206827,0.616561,0.598239
2021-09-04,1.000279,-0.088727,1.708009,0.669216
2021-09-06,1.667857,0.126256,0.251574,-1.169267
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763


In [41]:
df2 = df.copy()
df2['E'] = ["one", "two","one", "two","three", "three", "four"]
df2

Unnamed: 0,A,B,C,D,E
2021-09-01,-1.031584,0.286248,-0.781269,0.31707,one
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479,two
2021-09-03,0.154908,-0.206827,0.616561,0.598239,one
2021-09-04,1.000279,-0.088727,1.708009,0.669216,two
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146,three
2021-09-06,1.667857,0.126256,0.251574,-1.169267,three
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763,four


In [42]:
ind = df2['E'].isin(['two',"three"])
ind 

2021-09-01    False
2021-09-02     True
2021-09-03    False
2021-09-04     True
2021-09-05     True
2021-09-06     True
2021-09-07    False
Freq: D, Name: E, dtype: bool

In [43]:
df2[ind]

Unnamed: 0,A,B,C,D,E
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479,two
2021-09-04,1.000279,-0.088727,1.708009,0.669216,two
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146,three
2021-09-06,1.667857,0.126256,0.251574,-1.169267,three


**增加列**

In [44]:
df 

Unnamed: 0,A,B,C,D
2021-09-01,-1.031584,0.286248,-0.781269,0.31707
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479
2021-09-03,0.154908,-0.206827,0.616561,0.598239
2021-09-04,1.000279,-0.088727,1.708009,0.669216
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146
2021-09-06,1.667857,0.126256,0.251574,-1.169267
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763


In [45]:
ser = pd.Series(np.arange(1,8), index=pd.date_range("2021-09-01", periods=7))
ser

2021-09-01    1
2021-09-02    2
2021-09-03    3
2021-09-04    4
2021-09-05    5
2021-09-06    6
2021-09-07    7
Freq: D, dtype: int32

In [46]:
df['E'] = ser
df 

Unnamed: 0,A,B,C,D,E
2021-09-01,-1.031584,0.286248,-0.781269,0.31707,1
2021-09-02,-1.164944,-1.81836,0.551225,-1.180479,2
2021-09-03,0.154908,-0.206827,0.616561,0.598239,3
2021-09-04,1.000279,-0.088727,1.708009,0.669216,4
2021-09-05,-0.11669,-0.560096,0.595706,-1.098146,5
2021-09-06,1.667857,0.126256,0.251574,-1.169267,6
2021-09-07,0.394512,-1.624111,-1.114556,-0.417763,7


**修改列**

In [47]:
df['D'] = np.array([5] * len(df))
df 

Unnamed: 0,A,B,C,D,E
2021-09-01,-1.031584,0.286248,-0.781269,5,1
2021-09-02,-1.164944,-1.81836,0.551225,5,2
2021-09-03,0.154908,-0.206827,0.616561,5,3
2021-09-04,1.000279,-0.088727,1.708009,5,4
2021-09-05,-0.11669,-0.560096,0.595706,5,5
2021-09-06,1.667857,0.126256,0.251574,5,6
2021-09-07,0.394512,-1.624111,-1.114556,5,7


In [48]:
# Replace the existing row labels with consecutive integer positions 0..n-1.
df2.index = list(range(len(df2)))
df2

Unnamed: 0,A,B,C,D,E
0,-1.031584,0.286248,-0.781269,0.31707,one
1,-1.164944,-1.81836,0.551225,-1.180479,two
2,0.154908,-0.206827,0.616561,0.598239,one
3,1.000279,-0.088727,1.708009,0.669216,two
4,-0.11669,-0.560096,0.595706,-1.098146,three
5,1.667857,0.126256,0.251574,-1.169267,three
6,0.394512,-1.624111,-1.114556,-0.417763,four


In [49]:
# Replace the column labels with consecutive integers, one per column (shape[1] columns).
df2.columns = list(range(df2.shape[1]))
df2

Unnamed: 0,0,1,2,3,4
0,-1.031584,0.286248,-0.781269,0.31707,one
1,-1.164944,-1.81836,0.551225,-1.180479,two
2,0.154908,-0.206827,0.616561,0.598239,one
3,1.000279,-0.088727,1.708009,0.669216,two
4,-0.11669,-0.560096,0.595706,-1.098146,three
5,1.667857,0.126256,0.251574,-1.169267,three
6,0.394512,-1.624111,-1.114556,-0.417763,four


**数值运算及统计分析**

**数据的查看**

In [50]:
dates = pd.date_range("2021-09-01", periods=7)
df = pd.DataFrame(np.random.randn(7,4), index=dates, columns = ["A", "B", "C", "D"])
df 

Unnamed: 0,A,B,C,D
2021-09-01,1.182822,0.315259,-1.238694,-1.278169
2021-09-02,0.466008,-1.457544,0.31512,0.341698
2021-09-03,0.186543,0.271554,-1.06537,0.040713
2021-09-04,-0.919961,0.619589,0.572204,0.569783
2021-09-05,-0.947921,-0.034977,0.218291,-0.321906
2021-09-06,0.165422,0.461112,1.814427,1.5383
2021-09-07,-0.689202,-0.51867,0.190238,0.306855


In [51]:
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 7 entries, 2021-09-01 to 2021-09-07
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       7 non-null      float64
 1   B       7 non-null      float64
 2   C       7 non-null      float64
 3   D       7 non-null      float64
dtypes: float64(4)
memory usage: 280.0 bytes


**一般来说，纯粹的计算在Numpy里执行更快**

**Numpy更加侧重于计算，Pandas侧重于数据处理**

广播运算


In [52]:
np.random.seed(0)
df = pd.DataFrame(np.random.randint(1,10,size=(3,3)), columns = list("ABC"))
df 

Unnamed: 0,A,B,C
0,6,1,4
1,4,8,4
2,6,3,5


按行广播

In [53]:
df.iloc[0]

A    6
B    1
C    4
Name: 0, dtype: int32

In [54]:
df / df.iloc[0]

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.666667,8.0,1.0
2,1.0,3.0,1.25


按列广播

In [55]:
df.A

0    6
1    4
2    6
Name: A, dtype: int32

In [56]:
df.div(df.A, axis=0)    #按列

Unnamed: 0,A,B,C
0,1.0,0.166667,0.666667
1,1.0,2.0,1.0
2,1.0,0.5,0.833333


In [57]:
df.div(df.iloc[0], axis=1)    #按行

Unnamed: 0,A,B,C
0,1.0,1.0,1.0
1,0.666667,8.0,1.0
2,1.0,3.0,1.25


**Pandas新用法**

**索引对齐**


In [58]:
A = pd.DataFrame(np.random.randint(1,11, size=(2,2)), columns = list("AB"))
A 

Unnamed: 0,A,B
0,8,7
1,9,9


In [59]:
B = pd.DataFrame(np.random.randint(1,11, size=(3,3)), columns = list("ABC"))
B

Unnamed: 0,A,B,C
0,2,7,8
1,8,9,2
2,6,10,9


Pandas会自动对齐两个对象的索引，对不上的位置用np.nan（NaN）填充

In [60]:
A+B

Unnamed: 0,A,B,C
0,10.0,14.0,
1,17.0,18.0,
2,,,


缺失值可以用fill_value填充 

In [61]:
A.add(B, fill_value = 0)

Unnamed: 0,A,B,C
0,10.0,14.0,8.0
1,17.0,18.0,2.0
2,6.0,10.0,9.0


In [62]:
B.add(A, fill_value = 0)

Unnamed: 0,A,B,C
0,10.0,14.0,8.0
1,17.0,18.0,2.0
2,6.0,10.0,9.0


**统计相关**

**数据种类统计**

In [63]:
arr = np.random.randint(3, size=10)
arr

array([1, 0, 0, 1, 2, 0, 2, 0, 1, 1])

In [64]:
np.unique(arr)

array([0, 1, 2])

In [65]:
from collections import Counter
Counter(arr)

Counter({1: 4, 0: 4, 2: 2})

In [66]:
df = pd.DataFrame(arr, columns = ["A"])
df 

Unnamed: 0,A
0,1
1,0
2,0
3,1
4,2
5,0
6,2
7,0
8,1
9,1


In [67]:
np.unique(df)

array([0, 1, 2])

In [68]:
df["A"].value_counts()    #更好的办法

1    4
0    4
2    2
Name: A, dtype: int64

产生新的结果，并进行排序

In [69]:
population_dict = {"Beijing": 2154, "Shanghai":2454, "Shenzhen":2017, "Chengdu":2019}
#population_dict

population_ser= pd.Series(population_dict)
#population_ser

gdp_dict={"Beijing":465780, "Shanghai":48654, "Chengdu":32189, "Shenzhen":39865}
#gdp_dict
gdp_ser = pd.Series(gdp_dict)
#gdp_ser

city_info = pd.DataFrame({"population":population_ser, "gdp": gdp_ser})
city_info

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [70]:
city_info["gdp_per_capita"] = city_info['gdp'] / city_info['population']
city_info

Unnamed: 0,population,gdp,gdp_per_capita
Beijing,2154,465780,216.239554
Chengdu,2019,32189,15.943041
Shanghai,2454,48654,19.826406
Shenzhen,2017,39865,19.764502


增序排序

In [71]:
city_info.sort_values(by = "gdp_per_capita")

Unnamed: 0,population,gdp,gdp_per_capita
Chengdu,2019,32189,15.943041
Shenzhen,2017,39865,19.764502
Shanghai,2454,48654,19.826406
Beijing,2154,465780,216.239554


降序排序

In [72]:
city_info.sort_values(by = "gdp_per_capita", ascending= False)

Unnamed: 0,population,gdp,gdp_per_capita
Beijing,2154,465780,216.239554
Shanghai,2454,48654,19.826406
Shenzhen,2017,39865,19.764502
Chengdu,2019,32189,15.943041


按索引标签进行排序（行索引或列索引）

In [73]:
data = pd.DataFrame(np.random.randint(20, size=(3,3)), index=[2,1,0], columns=list("CBA"))
data

Unnamed: 0,C,B,A
2,19,19,14
1,7,0,1
0,9,0,10


In [74]:
# 按index排序
data.sort_index()

Unnamed: 0,C,B,A
0,9,0,10
1,7,0,1
2,19,19,14


In [75]:
# 按column排序
data.sort_index(axis=1,ascending= False)

Unnamed: 0,C,B,A
2,19,19,14
1,7,0,1
0,9,0,10


In [76]:
data.count()    # 非空个数， 0不是空

C    3
B    3
A    3
dtype: int64

In [77]:
# NaN is a float value, so cast column C (position 0) to float explicitly before
# inserting it; recent pandas versions deprecate silently upcasting an integer
# column when an incompatible value is assigned.
data["C"] = data["C"].astype(float)
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
data.iloc[0, 0] = np.nan
# count() reports the number of non-missing values per column.
data.count()

C    2
B    3
A    3
dtype: int64

求和


In [78]:
data

Unnamed: 0,C,B,A
2,,19,14
1,7.0,0,1
0,9.0,0,10


In [79]:
data.sum()    #默认按列求和

C    16.0
B    19.0
A    25.0
dtype: float64

In [80]:
data.sum(axis=1)    #按行求和

2    33.0
1     8.0
0    19.0
dtype: float64

In [81]:
data.max()

C     9.0
B    19.0
A    14.0
dtype: float64

In [82]:
data.min(axis=1)

2    14.0
1     0.0
0     0.0
dtype: float64

In [83]:
data.idxmax()    #最大值所在的index

C    0
B    2
A    2
dtype: int64

In [84]:
data.mean()

C    8.000000
B    6.333333
A    8.333333
dtype: float64

In [85]:
data.std()

C     1.414214
B    10.969655
A     6.658328
dtype: float64

In [86]:
data.var()

C      2.000000
B    120.333333
A     44.333333
dtype: float64

In [87]:
data.median()

C     8.0
B     0.0
A    10.0
dtype: float64

In [88]:
data.mode()

Unnamed: 0,C,B,A
0,7.0,0.0,1
1,9.0,,10
2,,,14


In [89]:
data.describe()

Unnamed: 0,C,B,A
count,2.0,3.0,3.0
mean,8.0,6.333333,8.333333
std,1.414214,10.969655,6.658328
min,7.0,0.0,1.0
25%,7.5,0.0,5.5
50%,8.0,0.0,10.0
75%,8.5,9.5,12.0
max,9.0,19.0,14.0


相关性系数和协方差

In [90]:
data.corr()

Unnamed: 0,C,B,A
C,1.0,,1.0
B,,1.0,0.737043
A,1.0,0.737043,1.0


In [91]:
data.corrwith(data["A"])

C    1.000000
B    0.737043
A    1.000000
dtype: float64

apply()方法

apply(method)的用法：使用method方法默认对每一列进行相应的操作

In [92]:
np.random.seed(0)

df = pd.DataFrame(np.random.randint(1,11, size=(3,3)), columns = list("CBA"))
df 

Unnamed: 0,C,B,A
0,6,1,4
1,4,8,10
2,4,6,3


In [93]:
df.apply(np.cumsum)

Unnamed: 0,C,B,A
0,6,1,4
1,10,9,14
2,14,15,17


In [94]:
df.apply(np.cumsum, axis=1)

Unnamed: 0,C,B,A
0,6,7,11
1,4,12,22
2,4,10,13


In [95]:
df.apply(sum)

C    14
B    15
A    17
dtype: int64

In [96]:
df.apply(lambda x: x.max() - x.min())

C    2
B    7
A    7
dtype: int64

In [97]:
def my_describe(df):
    """Summarise one column (or row) handed over by ``DataFrame.apply``.

    Despite its name, ``df`` is the one-dimensional object that ``apply``
    passes in. The result is a Series holding the maximum, the label of the
    maximum, the minimum, the label of the minimum and the standard deviation,
    in that order.
    """
    labels = ['max', "max_idx", "min", "min_idx", "std"]
    stats = [
        df.max(),
        df.idxmax(),
        df.min(),
        df.idxmin(),
        df.std(),
    ]
    return pd.Series(stats, index=labels)

df.apply(my_describe)

Unnamed: 0,C,B,A
max,6.0,8.0,10.0
max_idx,0.0,1.0,1.0
min,4.0,1.0,3.0
min_idx,1.0,0.0,2.0
std,1.154701,3.605551,3.785939


**缺失值处理**

- 发现缺失值


In [98]:
data = pd.DataFrame(np.array([[1, np.nan, 2],
                             [np.nan, 4, 5],
                             [4,7, None]]), columns = list("ABC"))
data

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,4.0,5.0
2,4.0,7.0,


**注意**：数据中含有None、字符串等对象时，所有列的数据类型都会变为object，它比int、float更消耗资源。

In [99]:
data.dtypes

A    object
B    object
C    object
dtype: object

In [100]:
data.isnull()

Unnamed: 0,A,B,C
0,False,True,False
1,True,False,False
2,False,False,True


In [101]:
data.notnull()

Unnamed: 0,A,B,C
0,True,False,True
1,False,True,True
2,True,True,False


- 删除缺失值


In [102]:
data= pd.DataFrame(np.array([[1, np.nan, 2,3],
                            [np.nan, 2,4,3],
                            [7,8,5, np.nan],
                            [4,8,9,10]]), columns = ["A", "B", "C", "D"])
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,3.0
1,,2.0,4.0,3.0
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,10.0


In [103]:
data.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [104]:
# 删除缺失值所在的行(缺省)
data.dropna()

Unnamed: 0,A,B,C,D
3,4.0,8.0,9.0,10.0


In [105]:
# 删除缺失值所在的列
data.dropna(axis = "columns")

Unnamed: 0,C
0,2.0
1,4.0
2,5.0
3,9.0


In [106]:
data['D'] = np.nan
data

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,


In [107]:
# 当列中 全是 缺失值的时候才删除
data.dropna(axis="columns", how="all")    

Unnamed: 0,A,B,C
0,1.0,,2.0
1,,2.0,4.0
2,7.0,8.0,5.0
3,4.0,8.0,9.0


In [108]:
# how="any": drop a column as soon as it contains at least one missing value
data.dropna(axis="columns", how="any")  

Unnamed: 0,C
0,2.0
1,4.0
2,5.0
3,9.0


In [109]:
data 

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,4.0,8.0,9.0,


In [110]:
data.loc[3]=np.nan
data.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,


- 填充缺失值

In [111]:
data.fillna(value=999)

Unnamed: 0,A,B,C,D
0,1.0,999.0,2.0,999.0
1,999.0,2.0,4.0,999.0
2,7.0,8.0,5.0,999.0
3,999.0,999.0,999.0,999.0


In [112]:
data.loc[3] = [12,13,14,15]

- 用均值进行填充

In [113]:
data 

Unnamed: 0,A,B,C,D
0,1.0,,2.0,
1,,2.0,4.0,
2,7.0,8.0,5.0,
3,12.0,13.0,14.0,15.0


In [114]:
# axis=0 aggregates down the rows (along the index), so the result holds one mean per column
fill = data.mean(axis=0)    
fill

A     6.666667
B     7.666667
C     6.250000
D    15.000000
dtype: float64

In [115]:
# skipna=False时，对有NaN的行或列则不计算均值
fill2 = data.mean(axis=1, skipna=False)    
fill2

0     NaN
1     NaN
2     NaN
3    13.5
dtype: float64

In [116]:
data.fillna(value=fill)

Unnamed: 0,A,B,C,D
0,1.0,7.666667,2.0,15.0
1,6.666667,2.0,4.0,15.0
2,7.0,8.0,5.0,15.0
3,12.0,13.0,14.0,15.0


In [117]:
# 把数字全部摊平求平均值
fill = data.stack().mean()
fill

7.545454545454546

In [118]:
data.stack()

0  A     1.0
   C     2.0
1  B     2.0
   C     4.0
2  A     7.0
   B     8.0
   C     5.0
3  A    12.0
   B    13.0
   C    14.0
   D    15.0
dtype: float64

**合并数据**

- 构造一个生成DataFrame的函数

In [119]:
def make_df(cols, ind):
    """Build a small demo DataFrame whose cells spell out their own position.

    Each column label in ``cols`` becomes a column, each entry of ``ind``
    becomes a row label, and every cell holds the string made of the column
    label followed by the row label (for example ``"A0"``). The intermediate
    dict is printed before the DataFrame is returned.
    """
    data = {}
    for col in cols:
        cells = []
        for row in ind:
            cells.append(str(col) + str(row))
        data[col] = cells
    print(data)
    return pd.DataFrame(data, index=ind)

make_df("ABC", range(3))

{'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2'], 'C': ['C0', 'C1', 'C2']}


Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


- 垂直合并

In [120]:
df_1 = make_df("AB", [1,3])
df_2 = make_df("AB", [2,5])
print(df_1)
print(df_2)

{'A': ['A1', 'A3'], 'B': ['B1', 'B3']}
{'A': ['A2', 'A5'], 'B': ['B2', 'B5']}
    A   B
1  A1  B1
3  A3  B3
    A   B
2  A2  B2
5  A5  B5


In [121]:
pd.concat([df_1, df_2])

Unnamed: 0,A,B
1,A1,B1
3,A3,B3
2,A2,B2
5,A5,B5


- 水平合并

In [122]:
df_1 = make_df("AB", [0,1])
df_2 = make_df("CD", [0,1])
print(df_1)
print(df_2)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'C': ['C0', 'C1'], 'D': ['D0', 'D1']}
    A   B
0  A0  B0
1  A1  B1
    C   D
0  C0  D0
1  C1  D1


In [123]:
pd.concat([df_1, df_2], axis=1)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1


- 行标签冲突的处理

In [124]:
df_5 = make_df("AB", [0,1])
df_6 = make_df("AB", [0,1])
pd.concat([df_5, df_6], ignore_index=True)    #重新排行标签

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}


Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A0,B0
3,A1,B1


**列重叠**

In [125]:
df_7 = make_df("ABC", [1,2])
df_8 = make_df("BCD", [3,4])
print(df_7)
print(df_8)

{'A': ['A1', 'A2'], 'B': ['B1', 'B2'], 'C': ['C1', 'C2']}
{'B': ['B3', 'B4'], 'C': ['C3', 'C4'], 'D': ['D3', 'D4']}
    A   B   C
1  A1  B1  C1
2  A2  B2  C2
    B   C   D
3  B3  C3  D3
4  B4  C4  D4


In [126]:
pd.concat([df_7, df_8])

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [127]:
pd.concat([df_7, df_8], sort=False)

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


In [128]:
pd.concat([df_7, df_8],axis=1)

Unnamed: 0,A,B,C,B.1,C.1,D
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,B3,C3,D3
4,,,,B4,C4,D4


In [129]:
pd.concat([df_7, df_8],axis=1, ignore_index=True)    #重复的列名重新排

Unnamed: 0,0,1,2,3,4,5
1,A1,B1,C1,,,
2,A2,B2,C2,,,
3,,,,B3,C3,D3
4,,,,B4,C4,D4


- 对齐合并merge()

In [130]:
df_9 = make_df("AB", [0,1])
df_a = make_df("BC", [0,1])
print(df_9)
print(df_a)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'B': ['B0', 'B1'], 'C': ['C0', 'C1']}
    A   B
0  A0  B0
1  A1  B1
    B   C
0  B0  C0
1  B1  C1


In [131]:
pd.merge(df_9, df_a)

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1


In [132]:
df_9 = make_df("AB", [0,1])
df_a = make_df("BC", [1,0])
print(df_9)
print(df_a)

{'A': ['A0', 'A1'], 'B': ['B0', 'B1']}
{'B': ['B1', 'B0'], 'C': ['C1', 'C0']}
    A   B
0  A0  B0
1  A1  B1
    B   C
1  B1  C1
0  B0  C0


In [133]:
pd.merge(df_9, df_a)    #自动的关联相应的C0,C1

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1


【例】 合并城市信息

In [134]:
population_dict = {"city": ["Beijing", "Chengdu", "Shanghai"],
                  "population": [2154, 2016, 2278]}
population = pd.DataFrame(population_dict)
population 

Unnamed: 0,city,population
0,Beijing,2154
1,Chengdu,2016
2,Shanghai,2278


In [135]:
gdp_dict = {"city": ["Beijing", "Shanghai", "Shenzhen"],
            "gdp": [30320, 32680, 13468]}
gdp = pd.DataFrame(gdp_dict)
gdp 

Unnamed: 0,city,gdp
0,Beijing,30320
1,Shanghai,32680
2,Shenzhen,13468


In [136]:
city_info = pd.merge(population, gdp)    # 交集
city_info

Unnamed: 0,city,population,gdp
0,Beijing,2154,30320
1,Shanghai,2278,32680


In [137]:
city_info = pd.merge(population, gdp, how="outer")    # 并集
city_info

Unnamed: 0,city,population,gdp
0,Beijing,2154.0,30320.0
1,Chengdu,2016.0,
2,Shanghai,2278.0,32680.0
3,Shenzhen,,13468.0


**分组和数据透视表**

In [138]:
df = pd.DataFrame({"key": ["A", "B", "C", "C", "B", "A"],
                  "data1": range(6),
                  "data2": np.random.randint(10, size=6)})
df 

Unnamed: 0,key,data1,data2
0,A,0,4
1,B,1,7
2,C,2,6
3,C,3,8
4,B,4,8
5,A,5,1


- 分组

延迟计算

In [139]:
df.groupby("key")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x000001BCAC016910>

In [140]:
df.groupby("key").sum()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,5,5
B,5,15
C,5,14


In [141]:
df.groupby("key").mean()

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,2.5,2.5
B,2.5,7.5
C,2.5,7.0


In [142]:
for i in df.groupby("key"):
    print(i)

('A',   key  data1  data2
0   A      0      4
5   A      5      1)
('B',   key  data1  data2
1   B      1      7
4   B      4      8)
('C',   key  data1  data2
2   C      2      6
3   C      3      8)


- 按列取值

In [143]:
df.groupby("key")["data2"].sum()

key
A     5
B    15
C    14
Name: data2, dtype: int32

- 按组迭代

In [144]:
# Iterating a GroupBy yields (group key, sub-frame) pairs. The key gets its own
# name so the loop does not overwrite the `data` DataFrame used by other cells.
for group_key, group in df.groupby("key"):
    print("{0:5} shape={1}".format(group_key, group.shape))

A     shape=(2, 3)
B     shape=(2, 3)
C     shape=(2, 3)


- 调用方法

In [145]:
df.groupby("key")["data1"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
key,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,2.0,2.5,3.535534,0.0,1.25,2.5,3.75,5.0
B,2.0,2.5,2.12132,1.0,1.75,2.5,3.25,4.0
C,2.0,2.5,0.707107,2.0,2.25,2.5,2.75,3.0


- 支持更复杂的操作

In [146]:
df.groupby("key").aggregate(["min", "median", "max"])

Unnamed: 0_level_0,data1,data1,data1,data2,data2,data2
Unnamed: 0_level_1,min,median,max,min,median,max
key,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,0,2.5,5,1,2.5,4
B,1,2.5,4,7,7.5,8
C,2,2.5,3,6,7.0,8


- 过滤

In [147]:
def filter_func(x):
    """Group predicate: keep a group only if the standard deviation of its
    ``data2`` column is greater than 1."""
    spread = x["data2"].std()
    return spread > 1
df.groupby("key")['data2'].std()

key
A    2.121320
B    0.707107
C    1.414214
Name: data2, dtype: float64

In [148]:
df.groupby("key").filter(filter_func)

Unnamed: 0,key,data1,data2
0,A,0,4
2,C,2,6
3,C,3,8
5,A,5,1


- 转换

In [149]:
df 

Unnamed: 0,key,data1,data2
0,A,0,4
1,B,1,7
2,C,2,6
3,C,3,8
4,B,4,8
5,A,5,1


In [150]:
df.groupby("key").transform(lambda x: x - x.mean())  # 作用在每个组上

Unnamed: 0,data1,data2
0,-2.5,1.5
1,-1.5,-0.5
2,-0.5,-1.0
3,0.5,1.0
4,1.5,0.5
5,2.5,-1.5


In [151]:
df.groupby("key")["data2"].transform(lambda x: x - x.mean())

0    1.5
1   -0.5
2   -1.0
3    1.0
4    0.5
5   -1.5
Name: data2, dtype: float64

- apply()

In [152]:
def norm_by_data2(x):
    """Return a copy of the group with ``data1`` divided by the group's ``data2`` total.

    Intended for ``DataFrame.groupby(...).apply``: ``x`` is the sub-frame of a
    single group. The input frame is left untouched, because mutating the
    object handed over by ``apply`` is not supported by pandas and can leak
    changes back into the caller's data.
    """
    normalized = x.copy()
    # Plain division plus assignment (not an in-place `/=`) so the integer
    # column is replaced by a new float column on the copy only.
    normalized['data1'] = normalized['data1'] / normalized['data2'].sum()
    return normalized


In [153]:
df.groupby("key").apply(norm_by_data2)    #apply中的函数应用于每个组

Unnamed: 0,key,data1,data2
0,A,0.0,4
1,B,0.066667,7
2,C,0.142857,6
3,C,0.214286,8
4,B,0.266667,8
5,A,1.0,1


- 将列表，数组设为分组键

In [154]:
L = [0,1,0, 1, 2, 0]    # 用L作为分组键对df进行分组
df 

Unnamed: 0,key,data1,data2
0,A,0,4
1,B,1,7
2,C,2,6
3,C,3,8
4,B,4,8
5,A,5,1


In [155]:
df.groupby(L).sum()

Unnamed: 0,data1,data2
0,7,11
1,4,15
2,4,8


- 用字典将索引映射到分组

In [156]:
df2 = df.set_index("key")
df2 

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,7
C,2,6
C,3,8
B,4,8
A,5,1


In [157]:
mapping = {"A":"first", "B":"constant", "C":"constant"}
df2.groupby(mapping).sum()

Unnamed: 0,data1,data2
constant,10,29
first,5,5


- 任意Python函数

In [158]:
df2 

Unnamed: 0_level_0,data1,data2
key,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,7
C,2,6
C,3,8
B,4,8
A,5,1


In [159]:
df2.groupby(str.lower).mean()

Unnamed: 0,data1,data2
a,2.5,2.5
b,2.5,7.5
c,2.5,7.0


- 将多个有效分组键组成列表进行分组

In [160]:
df2.groupby([str.lower, mapping]).mean()

Unnamed: 0,Unnamed: 1,data1,data2
a,first,2.5,2.5
b,constant,2.5,7.5
c,constant,2.5,7.0


【例】 行星观测数据处理 

In [163]:
import seaborn as sns
planets = sns.load_dataset("planets")
planets

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [166]:
planets.shape

(1035, 6)

In [168]:
planets.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [169]:
planets.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


In [170]:
planets.isnull().sum()

method              0
number              0
orbital_period     43
mass              522
distance          227
year                0
dtype: int64

In [171]:
decade = 10 * (planets["year"] // 10)
decade = decade.astype(str) + "s"
decade.name = "decade"
decade.head()

0    2000s
1    2000s
2    2010s
3    2000s
4    2000s
Name: decade, dtype: object

In [174]:
planets.groupby(["method", decade]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,number,orbital_period,mass,distance,year
method,decade,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Astrometry,2010s,2,1262.36,0.0,35.75,4023
Eclipse Timing Variations,2000s,5,19308.0,6.05,261.44,6025
Eclipse Timing Variations,2010s,10,23456.8,4.2,1000.0,12065
Imaging,2000s,29,1350935.0,0.0,956.83,40139
Imaging,2010s,21,68037.5,0.0,1210.08,36208
Microlensing,2000s,12,17325.0,0.0,0.0,20070
Microlensing,2010s,15,4750.0,0.0,41440.0,26155
Orbital Brightness Modulation,2010s,5,2.12792,0.0,2360.0,6035
Pulsar Timing,1990s,9,190.0153,0.0,0.0,5978
Pulsar Timing,2000s,1,36525.0,0.0,0.0,2003


In [175]:
# unstack() 就是把原来的两个分类中的一个展开作为列
planets.groupby(["method", decade])['number'].sum().unstack().fillna(0)

decade,1980s,1990s,2000s,2010s
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Astrometry,0.0,0.0,0.0,2.0
Eclipse Timing Variations,0.0,0.0,5.0,10.0
Imaging,0.0,0.0,29.0,21.0
Microlensing,0.0,0.0,12.0,15.0
Orbital Brightness Modulation,0.0,0.0,0.0,5.0
Pulsar Timing,0.0,9.0,1.0,1.0
Pulsation Timing Variations,0.0,0.0,1.0,0.0
Radial Velocity,1.0,52.0,475.0,424.0
Transit,0.0,0.0,64.0,712.0
Transit Timing Variations,0.0,0.0,0.0,9.0


**数据透视表**
【例】Titanic号乘客数据分析

In [176]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [177]:
titanic.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [185]:
# 两个中括号是DataFrame的数据
titanic.groupby("sex")[['survived']].sum()

Unnamed: 0_level_0,survived
sex,Unnamed: 1_level_1
female,233
male,109


In [186]:
# 一个中括号是Series数据
titanic.groupby("sex")['survived'].mean()


sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [184]:
titanic.groupby("sex")['survived'].mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [187]:
titanic.groupby(["sex", "class"])['survived'].aggregate("mean").unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [188]:
titanic.groupby(["sex", "class"])[["survived"]].mean().unstack()

Unnamed: 0_level_0,survived,survived,survived
class,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


**数据透视表**

In [189]:
# "survived"是指要分析的数据
titanic.pivot_table("survived", index="sex", columns="class")

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


In [190]:
titanic.pivot_table("survived", index="sex", columns="class", aggfunc="mean", margins=True)

class,First,Second,Third,All
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.968085,0.921053,0.5,0.742038
male,0.368852,0.157407,0.135447,0.188908
All,0.62963,0.472826,0.242363,0.383838


In [191]:
titanic.pivot_table(index="sex", columns="class", aggfunc={"survived": "sum", "fare":"mean"})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


**多级索引-多用于多维数组**

In [192]:
base_data = np.array([[1771, 11115],
                     [2154, 30320],
                     [2141, 14070],
                     [2424, 32680],
                     [1077, 7806],
                     [1303, 24222],
                     [798, 4789],
                     [981, 13468]])
data = pd.DataFrame(base_data, index=[['Beijing', "Beijing", 'Shanghai', 'Shanghai', 'Shenzhen', "Shenzhen", 'Hangzhou', 'Hangzhou'],
                                     [2010, 2020]*4], columns = ['population', 'GDP'])
data 

Unnamed: 0,Unnamed: 1,population,GDP
Beijing,2010,1771,11115
Beijing,2020,2154,30320
Shanghai,2010,2141,14070
Shanghai,2020,2424,32680
Shenzhen,2010,1077,7806
Shenzhen,2020,1303,24222
Hangzhou,2010,798,4789
Hangzhou,2020,981,13468


In [193]:
data.index.names = ['city', 'year']
data 

Unnamed: 0_level_0,Unnamed: 1_level_0,population,GDP
city,year,Unnamed: 2_level_1,Unnamed: 3_level_1
Beijing,2010,1771,11115
Beijing,2020,2154,30320
Shanghai,2010,2141,14070
Shanghai,2020,2424,32680
Shenzhen,2010,1077,7806
Shenzhen,2020,1303,24222
Hangzhou,2010,798,4789
Hangzhou,2020,981,13468


In [194]:

data['GDP']

city      year
Beijing   2010    11115
          2020    30320
Shanghai  2010    14070
          2020    32680
Shenzhen  2010     7806
          2020    24222
Hangzhou  2010     4789
          2020    13468
Name: GDP, dtype: int32

In [195]:
data.loc['Shanghai', "population"]

year
2010    2141
2020    2424
Name: population, dtype: int32

In [196]:
data.loc["Shanghai", 2020]

population     2424
GDP           32680
Name: (Shanghai, 2020), dtype: int32

In [197]:
# Select row and column in a single .loc call: the full MultiIndex key is passed
# as a tuple, which avoids chained indexing on an intermediate Series.
data.loc[("Shanghai", 2010), "GDP"]

14070