## Pandas 库

Numpy库在处理向量化数值计算时性能优异，

但是在处理更灵活、更复杂的数据时显得力不从心。

例如，为数据添加标签、处理缺失值、分组以及构建透视表等场景。

基于 NumPy 构建的 Pandas 库，提供了让数据分析更快、更简单、更高效的工具。


In [1]:
import pandas as pd 

In [2]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"])

a    1
b    2
c    3
d    4
dtype: object

In [3]:
pd.Series([1, 2, "3", 4], index=["a", "b", "c", "d"], dtype=float)

a    1.0
b    2.0
c    3.0
d    4.0
dtype: float64

In [4]:
pd.Series(5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [5]:
# Population figure per city, keyed by city name. The key order written here
# is the order the Series built from this dict displays.
population_dict = {
    "Beijing": 2154,
    "Shanghai": 2454,
    "Shenzhen": 2017,
    "Chengdu": 2019,
}
population_dict

{'Beijing': 2154, 'Shanghai': 2454, 'Shenzhen': 2017, 'Chengdu': 2019}

In [6]:
# Turn the dict into a labelled Series: keys become the index, values the data.
population_ser = pd.Series(data=population_dict)
population_ser

Beijing     2154
Shanghai    2454
Shenzhen    2017
Chengdu     2019
dtype: int64

In [7]:
pd.DataFrame({"population": population_ser})

Unnamed: 0,population
Beijing,2154
Shanghai,2454
Shenzhen,2017
Chengdu,2019


In [12]:
# GDP figure per city, keyed by city name (cities are listed in a different
# order than in population_dict).
# NOTE(review): Beijing's value is roughly 10x the other three — possibly an
# extra digit; confirm against the data source.
gdp_dict = {
    "Beijing": 465780,
    "Shanghai": 48654,
    "Chengdu": 32189,
    "Shenzhen": 39865,
}
gdp_dict

{'Beijing': 465780, 'Shanghai': 48654, 'Chengdu': 32189, 'Shenzhen': 39865}

In [13]:
# Labelled Series of GDP values: dict keys become the index.
gdp_ser = pd.Series(data=gdp_dict)
gdp_ser

Beijing     465780
Shanghai     48654
Chengdu      32189
Shenzhen     39865
dtype: int64

In [21]:
# Put the two Series side by side as columns; rows are matched on city name.
# The displayed index is in alphabetical order (the two inputs list the
# cities in different orders).
df1 = pd.DataFrame(
    data={"population": population_ser, "gdp": gdp_ser},
)
df1

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [22]:
# Same two columns plus a constant one: the scalar "China" is repeated on
# every row.
# NOTE(review): the name df2 is rebound to an unrelated frame in a later cell.
df2 = pd.DataFrame(
    data={
        "population": population_ser,
        "gdp": gdp_ser,
        "country": "China",
    },
)
df2

Unnamed: 0,population,gdp,country
Beijing,2154,465780,China
Chengdu,2019,32189,China
Shanghai,2454,48654,China
Shenzhen,2017,39865,China


In [23]:
df1.values

array([[  2154, 465780],
       [  2019,  32189],
       [  2454,  48654],
       [  2017,  39865]])

In [27]:
df1.index

Index(['Beijing', 'Chengdu', 'Shanghai', 'Shenzhen'], dtype='object')

In [26]:
df1.columns

Index(['population', 'gdp'], dtype='object')

In [28]:
df1.dtypes

population    int64
gdp           int64
dtype: object

In [29]:
df1.size

8

In [30]:
df1.shape

(4, 2)

**获取列**

In [31]:
df1[["population", "gdp"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [34]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

**获取行**

In [36]:
df1.loc['Beijing']

population      2154
gdp           465780
Name: Beijing, dtype: int64

In [37]:
df1.loc[["Beijing", "Chengdu"]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189


In [38]:
df1.iloc[[0,1,2,3]]

Unnamed: 0,population,gdp
Beijing,2154,465780
Chengdu,2019,32189
Shanghai,2454,48654
Shenzhen,2017,39865


In [40]:
df1.loc['Chengdu',"population"]

2019

In [41]:
df1.iloc[1,0]

2019

In [45]:
df1.values[1][0]

2019

In [46]:
df1.gdp

Beijing     465780
Chengdu      32189
Shanghai     48654
Shenzhen     39865
Name: gdp, dtype: int64

In [49]:
df1.gdp['Chengdu']

32189

**切片**

In [50]:
# Seven consecutive calendar days starting 2021-09-01, used as a row index.
dates = pd.date_range(start="2021-09-01", periods=7, freq="D")
dates

DatetimeIndex(['2021-09-01', '2021-09-02', '2021-09-03', '2021-09-04',
               '2021-09-05', '2021-09-06', '2021-09-07'],
              dtype='datetime64[ns]', freq='D')

In [52]:
import numpy as np

# 7x4 frame of standard-normal draws, indexed by `dates`, columns A-D.
# The values come from a locally seeded generator (rather than
# np.random.randn, which reads unseeded global state) so that
# Restart & Run All reproduces the same table every time.
df = pd.DataFrame(
    np.random.default_rng(seed=0).standard_normal((7, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901
2021-09-05,1.012022,0.298562,0.679321,0.660535
2021-09-06,0.00341,-0.732401,-0.235786,2.305059
2021-09-07,-1.151633,1.547033,1.782118,0.212563


In [57]:
df['2021-09-01':'2021-09-03']

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046


In [58]:
df.loc["2021-09-01":"2021-09-03",:]

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046


In [59]:
df.iloc[0:3,:]

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046


In [60]:
df.loc[:, "A":"C"]

Unnamed: 0,A,B,C
2021-09-01,-0.573962,-1.261348,-1.34088
2021-09-02,-0.60734,0.324767,1.381333
2021-09-03,0.759956,-0.402003,0.205071
2021-09-04,-0.743258,-2.377915,-0.64016
2021-09-05,1.012022,0.298562,0.679321
2021-09-06,0.00341,-0.732401,-0.235786
2021-09-07,-1.151633,1.547033,1.782118


In [61]:
df.iloc[:,0:3]

Unnamed: 0,A,B,C
2021-09-01,-0.573962,-1.261348,-1.34088
2021-09-02,-0.60734,0.324767,1.381333
2021-09-03,0.759956,-0.402003,0.205071
2021-09-04,-0.743258,-2.377915,-0.64016
2021-09-05,1.012022,0.298562,0.679321
2021-09-06,0.00341,-0.732401,-0.235786
2021-09-07,-1.151633,1.547033,1.782118


In [62]:
df.loc["2021-09-04":"2021-09-07", ["A","D"]]

Unnamed: 0,A,D
2021-09-04,-0.743258,0.610901
2021-09-05,1.012022,0.660535
2021-09-06,0.00341,2.305059
2021-09-07,-1.151633,0.212563


In [63]:
df

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901
2021-09-05,1.012022,0.298562,0.679321,0.660535
2021-09-06,0.00341,-0.732401,-0.235786,2.305059
2021-09-07,-1.151633,1.547033,1.782118,0.212563


In [64]:
df>0

Unnamed: 0,A,B,C,D
2021-09-01,False,False,False,True
2021-09-02,False,True,True,True
2021-09-03,True,False,True,True
2021-09-04,False,False,False,True
2021-09-05,True,True,True,True
2021-09-06,True,False,False,True
2021-09-07,False,True,True,True


In [65]:
df[df > 0]

Unnamed: 0,A,B,C,D
2021-09-01,,,,2.248228
2021-09-02,,0.324767,1.381333,0.326401
2021-09-03,0.759956,,0.205071,0.568046
2021-09-04,,,,0.610901
2021-09-05,1.012022,0.298562,0.679321,0.660535
2021-09-06,0.00341,,,2.305059
2021-09-07,,1.547033,1.782118,0.212563


In [66]:
df.A > 0

2021-09-01    False
2021-09-02    False
2021-09-03     True
2021-09-04    False
2021-09-05     True
2021-09-06     True
2021-09-07    False
Freq: D, Name: A, dtype: bool

In [67]:
df[df.A > 0]    # commonly used: keep only the rows where the boolean mask df.A > 0 is True

Unnamed: 0,A,B,C,D
2021-09-03,0.759956,-0.402003,0.205071,0.568046
2021-09-05,1.012022,0.298562,0.679321,0.660535
2021-09-06,0.00341,-0.732401,-0.235786,2.305059


In [68]:
# New frame with df's data plus a string column "E" (one label per row);
# df itself is left untouched.
df2 = df.assign(E=["one", "two", "one", "two", "three", "three", "four"])
df2

Unnamed: 0,A,B,C,D,E
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228,one
2021-09-02,-0.60734,0.324767,1.381333,0.326401,two
2021-09-03,0.759956,-0.402003,0.205071,0.568046,one
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901,two
2021-09-05,1.012022,0.298562,0.679321,0.660535,three
2021-09-06,0.00341,-0.732401,-0.235786,2.305059,three
2021-09-07,-1.151633,1.547033,1.782118,0.212563,four


In [69]:
# Boolean mask: True on rows whose "E" label is either "two" or "three".
ind = df2.E.isin(["two", "three"])
ind

2021-09-01    False
2021-09-02     True
2021-09-03    False
2021-09-04     True
2021-09-05     True
2021-09-06     True
2021-09-07    False
Freq: D, Name: E, dtype: bool

In [70]:
df2[ind]

Unnamed: 0,A,B,C,D,E
2021-09-02,-0.60734,0.324767,1.381333,0.326401,two
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901,two
2021-09-05,1.012022,0.298562,0.679321,0.660535,three
2021-09-06,0.00341,-0.732401,-0.235786,2.305059,three


**增加列**

In [71]:
df 

Unnamed: 0,A,B,C,D
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228
2021-09-02,-0.60734,0.324767,1.381333,0.326401
2021-09-03,0.759956,-0.402003,0.205071,0.568046
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901
2021-09-05,1.012022,0.298562,0.679321,0.660535
2021-09-06,0.00341,-0.732401,-0.235786,2.305059
2021-09-07,-1.151633,1.547033,1.782118,0.212563


In [72]:
# Integers 1..7, one per day starting 2021-09-01.
ser = pd.Series(
    data=np.arange(1, 8),
    index=pd.date_range(start="2021-09-01", periods=7),
)
ser

2021-09-01    1
2021-09-02    2
2021-09-03    3
2021-09-04    4
2021-09-05    5
2021-09-06    6
2021-09-07    7
Freq: D, dtype: int64

In [73]:
# Attach ser as a new column "E" (mutates df in place). Both objects are
# indexed by the same seven dates, so each value lands on its matching row.
df['E'] = ser
df 

Unnamed: 0,A,B,C,D,E
2021-09-01,-0.573962,-1.261348,-1.34088,2.248228,1
2021-09-02,-0.60734,0.324767,1.381333,0.326401,2
2021-09-03,0.759956,-0.402003,0.205071,0.568046,3
2021-09-04,-0.743258,-2.377915,-0.64016,0.610901,4
2021-09-05,1.012022,0.298562,0.679321,0.660535,5
2021-09-06,0.00341,-0.732401,-0.235786,2.305059,6
2021-09-07,-1.151633,1.547033,1.782118,0.212563,7


**修改列**

In [74]:
# Overwrite column "D" with the constant 5 (mutates df in place).
# Assigning a scalar broadcasts it to every row, so there is no need to build
# a throwaway list/array sized to len(df) first.
df["D"] = 5
df

Unnamed: 0,A,B,C,D,E
2021-09-01,-0.573962,-1.261348,-1.34088,5,1
2021-09-02,-0.60734,0.324767,1.381333,5,2
2021-09-03,0.759956,-0.402003,0.205071,5,3
2021-09-04,-0.743258,-2.377915,-0.64016,5,4
2021-09-05,1.012022,0.298562,0.679321,5,5
2021-09-06,0.00341,-0.732401,-0.235786,5,6
2021-09-07,-1.151633,1.547033,1.782118,5,7


In [76]:
# Replace the date labels on the rows with plain positions 0..n-1.
# pd.RangeIndex is pandas' native positional index; it avoids the identity
# comprehension that only copied range(len(df2)) into a list.
df2.index = pd.RangeIndex(len(df2))
df2

Unnamed: 0,A,B,C,D,E
0,-0.573962,-1.261348,-1.34088,2.248228,one
1,-0.60734,0.324767,1.381333,0.326401,two
2,0.759956,-0.402003,0.205071,0.568046,one
3,-0.743258,-2.377915,-0.64016,0.610901,two
4,1.012022,0.298562,0.679321,0.660535,three
5,0.00341,-0.732401,-0.235786,2.305059,three
6,-1.151633,1.547033,1.782118,0.212563,four


In [77]:
# Replace the column names with their positions 0..k-1, where k is the
# number of columns (df2.shape[1]).
# pd.RangeIndex avoids the identity comprehension that only copied a range
# into a list.
df2.columns = pd.RangeIndex(df2.shape[1])
df2

Unnamed: 0,0,1,2,3,4
0,-0.573962,-1.261348,-1.34088,2.248228,one
1,-0.60734,0.324767,1.381333,0.326401,two
2,0.759956,-0.402003,0.205071,0.568046,one
3,-0.743258,-2.377915,-0.64016,0.610901,two
4,1.012022,0.298562,0.679321,0.660535,three
5,0.00341,-0.732401,-0.235786,2.305059,three
6,-1.151633,1.547033,1.782118,0.212563,four


**数值运算及统计分析**

**数据的查看**

In [79]:
# Fresh demo frame for this section: 7 daily rows (from 2021-09-01) by 4
# columns of standard-normal draws.
# A locally seeded generator replaces np.random.randn (unseeded global state)
# so the table is identical on every Restart & Run All.
dates = pd.date_range("2021-09-01", periods=7)
df = pd.DataFrame(
    np.random.default_rng(seed=1).standard_normal((7, 4)),
    index=dates,
    columns=["A", "B", "C", "D"],
)
df

Unnamed: 0,A,B,C,D
2021-09-01,0.83357,1.123485,-0.198066,-0.990053
2021-09-02,1.358083,0.021782,-0.116204,-0.478483
2021-09-03,-1.867148,-2.019402,0.305374,0.424939
2021-09-04,-0.007706,1.369419,0.333742,-1.283127
2021-09-05,-0.692552,1.41574,0.809994,-0.48988
2021-09-06,-1.926937,0.720577,-0.996797,0.156819
2021-09-07,-1.356093,-2.198209,-0.926177,-0.503798
