# Pandas

In [45]:
import pandas as pd
import numpy as np

## 1. Pandas基础：Series，DataFrame

### 1.1 Series

In [2]:
s1 = pd.Series([1, 2, 3]) # create a Series; no index is given, so the default index is used
print(s1)

0    1
1    2
2    3
dtype: int64


In [3]:
s1.values # the values held by the Series

array([1, 2, 3], dtype=int64)

In [4]:
s1.index # the index (row labels) of the Series

RangeIndex(start=0, stop=3, step=1)

In [5]:
# Build a Series with explicit string labels instead of the default integer index.
s2 = pd.Series(
    data=[1, 2, 3, 4, 4],
    index=list("abcde"),
)
print(s2)

a    1
b    2
c    3
d    4
e    4
dtype: int64


In [6]:
s2["e"] # look up a single value by its index label

4

In [7]:
s2[["a", "c", "d"]] # select several values at once; note that the labels are passed as a list

a    1
c    3
d    4
dtype: int64

In [8]:
'b' in s2 # check whether the label 'b' exists in s2's index

True

In [9]:
# A Series can be thought of as a fixed-length, ordered dict:
# the dict keys become the index labels and the dict values become the data.
dic1 = dict(a1=1, a2=2, a3=3, a4=4, a5=4)  # five keys, five values
s3 = pd.Series(data=dic1)
print(s3)

a1    1
a2    2
a3    3
a4    4
a5    4
dtype: int64


### 1.2 DataFrame

In [10]:
# Column-oriented input: every key becomes a column name and its list becomes that column.
data = dict(
    year=[2017, 2018, 2019],
    income=[11, 22, 33],
    pay=[1, 2, 3],
)
# No index is passed, so the rows get the default index.
df1 = pd.DataFrame(data=data)
print(df1)

   year  income  pay
0  2017      11    1
1  2018      22    2
2  2019      33    3


In [11]:
df2 = pd.DataFrame(np.arange(12).reshape(3, 4)) # create a DataFrame from a NumPy array (3 rows x 4 columns)
print(df2)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [27]:
# Row labels and column names are deliberately out of order so that the
# sort_index / sort_values cells further down have something to reorder.
df3 = pd.DataFrame(
    data=np.arange(24).reshape(4, 6),
    index=[1, 3, 2, 4],
    columns=list('bacdef'),
)
print(df3)

    b   a   c   d   e   f
1   0   1   2   3   4   5
3   6   7   8   9  10  11
2  12  13  14  15  16  17
4  18  19  20  21  22  23


In [28]:
df1.columns # the column labels

Index(['year', 'income', 'pay'], dtype='object')

In [29]:
df2.index # the row index

RangeIndex(start=0, stop=3, step=1)

In [30]:
df1.values # the underlying values as a NumPy array

array([[2017,   11,    1],
       [2018,   22,    2],
       [2019,   33,    3]], dtype=int64)

In [31]:
df3.describe() # basic per-column summary statistics: count, mean, std, min, quartiles and max

Unnamed: 0,b,a,c,d,e,f
count,4.0,4.0,4.0,4.0,4.0,4.0
mean,9.0,10.0,11.0,12.0,13.0,14.0
std,7.745967,7.745967,7.745967,7.745967,7.745967,7.745967
min,0.0,1.0,2.0,3.0,4.0,5.0
25%,4.5,5.5,6.5,7.5,8.5,9.5
50%,9.0,10.0,11.0,12.0,13.0,14.0
75%,13.5,14.5,15.5,16.5,17.5,18.5
max,18.0,19.0,20.0,21.0,22.0,23.0


In [34]:
df3.T # transpose the DataFrame (rows become columns and vice versa)

Unnamed: 0,1,3,2,4
b,0,6,12,18
a,1,7,13,19
c,2,8,14,20
d,3,9,15,21
e,4,10,16,22
f,5,11,17,23


In [40]:
df3.sort_index(axis=1) # axis=1: sort the columns by their labels, in ascending order

Unnamed: 0,a,b,c,d,e,f
1,1,0,2,3,4,5
3,7,6,8,9,10,11
2,13,12,14,15,16,17
4,19,18,20,21,22,23


In [41]:
df3.sort_index(axis=0) # axis=0: sort the rows by their index labels

Unnamed: 0,b,a,c,d,e,f
1,0,1,2,3,4,5
2,12,13,14,15,16,17
3,6,7,8,9,10,11
4,18,19,20,21,22,23


In [44]:
df3.sort_values(by="a") # sort the rows by the values in column "a"

Unnamed: 0,b,a,c,d,e,f
1,0,1,2,3,4,5
3,6,7,8,9,10,11
2,12,13,14,15,16,17
4,18,19,20,21,22,23


## 2. Pandas选择数据

In [56]:
# Six consecutive daily timestamps starting at 2020-02-21, used as the row index.
dates = pd.date_range('20200221', periods=6)
df1 = pd.DataFrame(
    data=np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(df1)

             A   B   C   D
2020-02-21   0   1   2   3
2020-02-22   4   5   6   7
2020-02-23   8   9  10  11
2020-02-24  12  13  14  15
2020-02-25  16  17  18  19
2020-02-26  20  21  22  23


In [62]:
type(df1["A"]) # every column of the table is itself a Series

pandas.core.series.Series

In [63]:
df1.A # select a column via attribute access

2020-02-21     0
2020-02-22     4
2020-02-23     8
2020-02-24    12
2020-02-25    16
2020-02-26    20
Freq: D, Name: A, dtype: int32

In [65]:
df1[0:2] # rows 0-1 of the DataFrame (positional slice, end position excluded)

Unnamed: 0,A,B,C,D
2020-02-21,0,1,2,3
2020-02-22,4,5,6,7


In [68]:
df1['20200221':'20200224'] # slice rows by date label; unlike the positional slice, the end label is included

Unnamed: 0,A,B,C,D
2020-02-21,0,1,2,3
2020-02-22,4,5,6,7
2020-02-23,8,9,10,11
2020-02-24,12,13,14,15


In [69]:
# Select data by label
df1.loc['2020-02-22']

A    4
B    5
C    6
D    7
Name: 2020-02-22 00:00:00, dtype: int32

In [71]:
df1.loc['2020-02-22', ["A", "D"]] # the first position selects rows, the second selects columns

A    4
D    7
Name: 2020-02-22 00:00:00, dtype: int32

In [73]:
df1.loc[:, ["A", "C"]] # ":" means all rows (or, in the second position, all columns)

Unnamed: 0,A,C
2020-02-21,0,2
2020-02-22,4,6
2020-02-23,8,10
2020-02-24,12,14
2020-02-25,16,18
2020-02-26,20,22


In [75]:
df1.iloc[1] # iloc is loc with a leading "i" for integer position: it selects by position rather than by label

A    4
B    5
C    6
D    7
Name: 2020-02-22 00:00:00, dtype: int32

In [82]:
 df1.iloc[0:3, [2,3]] # 按照数字提取行和列，前行后列，如果提取不连续的，使用list即可

Unnamed: 0,C,D
2020-02-21,2,3
2020-02-22,6,7
2020-02-23,10,11


In [89]:
df1.A > 2 # compare every value of column A against 2, giving a boolean Series

2020-02-21    False
2020-02-22     True
2020-02-23     True
2020-02-24     True
2020-02-25     True
2020-02-26     True
Freq: D, Name: A, dtype: bool

In [92]:
df1[df1.A > 10] # show only the rows whose value in column A is greater than 10

Unnamed: 0,A,B,C,D
2020-02-24,12,13,14,15
2020-02-25,16,17,18,19
2020-02-26,20,21,22,23


## 3. Pandas赋值及操作

In [168]:
# NOTE: these are plain integers that merely look like dates (20200221 .. 20200226),
# not real timestamps as produced by pd.date_range in the selection section.
dates = np.arange(20200221, 20200227)
df1 = pd.DataFrame(
    data=np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
print(df1)

           A   B   C   D
20200221   0   1   2   3
20200222   4   5   6   7
20200223   8   9  10  11
20200224  12  13  14  15
20200225  16  17  18  19
20200226  20  21  22  23


In [169]:
df1.iloc[2,2] = 100 # set the value in the third row, third column to 100
print(df1)

           A   B    C   D
20200221   0   1    2   3
20200222   4   5    6   7
20200223   8   9  100  11
20200224  12  13   14  15
20200225  16  17   18  19
20200226  20  21   22  23


In [170]:
df1.loc[20200222, 'C'] = 60 # set a single cell, addressed by row label and column name

In [171]:
df1[df1 > 10] = 0 # set every value greater than 10 to 0
print(df1)

          A  B  C  D
20200221  0  1  2  3
20200222  4  5  0  7
20200223  8  9  0  0
20200224  0  0  0  0
20200225  0  0  0  0
20200226  0  0  0  0


In [172]:
# Set the entries of column A that are equal to 0 to 1.
# A single .loc call replaces the chained form `df1.A[mask] = 1`: chained
# assignment triggers SettingWithCopyWarning and, with pandas Copy-on-Write
# enabled, does not modify df1 at all.
df1.loc[df1.A == 0, 'A'] = 1
print(df1)

          A  B  C  D
20200221  1  1  2  3
20200222  4  5  0  7
20200223  8  9  0  0
20200224  1  0  0  0
20200225  1  0  0  0
20200226  1  0  0  0


In [173]:
df1['E'] = 10 # add a column; the scalar is repeated for every row
print(df1)

          A  B  C  D   E
20200221  1  1  2  3  10
20200222  4  5  0  7  10
20200223  8  9  0  0  10
20200224  1  0  0  0  10
20200225  1  0  0  0  10
20200226  1  0  0  0  10


In [174]:
df1['F'] = pd.Series(range(6), index=dates) # add a column from a Series; its values are matched to the rows by index label
print(df1)

          A  B  C  D   E  F
20200221  1  1  2  3  10  0
20200222  4  5  0  7  10  1
20200223  8  9  0  0  10  2
20200224  1  0  0  0  10  3
20200225  1  0  0  0  10  4
20200226  1  0  0  0  10  5


In [175]:
df1.loc[20200227, ['A']]=1 # add a row by assigning to a new label; columns not set here become NaN, so the values print as floats
print(df1)

            A    B    C    D     E    F
20200221  1.0  1.0  2.0  3.0  10.0  0.0
20200222  4.0  5.0  0.0  7.0  10.0  1.0
20200223  8.0  9.0  0.0  0.0  10.0  2.0
20200224  1.0  0.0  0.0  0.0  10.0  3.0
20200225  1.0  0.0  0.0  0.0  10.0  4.0
20200226  1.0  0.0  0.0  0.0  10.0  5.0
20200227  1.0  NaN  NaN  NaN   NaN  NaN


In [176]:
# A row-like Series whose labels match df1's columns; its name becomes the new row label.
s1 = pd.Series(range(6), index=['A','B','C','D','E','F'])
s1.name = 'S1'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Turning the
# Series into a one-row frame and concatenating is the supported equivalent.
df2 = pd.concat([df1, s1.to_frame().T])
# concat returns a new frame: df2 carries the extra 'S1' row, df1 is left unchanged.
print(df1)

            A    B    C    D     E    F
20200221  1.0  1.0  2.0  3.0  10.0  0.0
20200222  4.0  5.0  0.0  7.0  10.0  1.0
20200223  8.0  9.0  0.0  0.0  10.0  2.0
20200224  1.0  0.0  0.0  0.0  10.0  3.0
20200225  1.0  0.0  0.0  0.0  10.0  4.0
20200226  1.0  0.0  0.0  0.0  10.0  5.0
20200227  1.0  NaN  NaN  NaN   NaN  NaN


In [184]:
df1.insert(1, 'H', df2['E']) # insert df2's column E into df1 at position 1 (0-based, i.e. as the second column) under the name 'H'
print(df1)

            A     H    B    C    D     E    F
20200221  1.0  10.0  1.0  2.0  3.0  10.0  0.0
20200222  4.0  10.0  5.0  0.0  7.0  10.0  1.0
20200223  8.0  10.0  9.0  0.0  0.0  10.0  2.0
20200224  1.0  10.0  0.0  0.0  0.0  10.0  3.0
20200225  1.0  10.0  0.0  0.0  0.0  10.0  4.0
20200226  1.0  10.0  0.0  0.0  0.0  10.0  5.0
20200227  1.0   NaN  NaN  NaN  NaN   NaN  NaN


In [190]:
g = df1.pop('H') # remove column H from df1 and keep it as a Series
# Re-insert H as the last column. The position is taken from the current column
# count instead of a hard-coded 6, so it stays valid if df1 gains or loses columns.
df1.insert(len(df1.columns), 'H', g)
print(df1)

            A    B    C    D     E    F     H
20200221  1.0  1.0  2.0  3.0  10.0  0.0  10.0
20200222  4.0  5.0  0.0  7.0  10.0  1.0  10.0
20200223  8.0  9.0  0.0  0.0  10.0  2.0  10.0
20200224  1.0  0.0  0.0  0.0  10.0  3.0  10.0
20200225  1.0  0.0  0.0  0.0  10.0  4.0  10.0
20200226  1.0  0.0  0.0  0.0  10.0  5.0  10.0
20200227  1.0  NaN  NaN  NaN   NaN  NaN   NaN


In [191]:
del df1['H'] # delete column H
print(df1)

            A    B    C    D     E    F
20200221  1.0  1.0  2.0  3.0  10.0  0.0
20200222  4.0  5.0  0.0  7.0  10.0  1.0
20200223  8.0  9.0  0.0  0.0  10.0  2.0
20200224  1.0  0.0  0.0  0.0  10.0  3.0
20200225  1.0  0.0  0.0  0.0  10.0  4.0
20200226  1.0  0.0  0.0  0.0  10.0  5.0
20200227  1.0  NaN  NaN  NaN   NaN  NaN
