In [1]:
import numpy as np
import pandas as pd

In [4]:
# Demo data: ten consecutive days starting 2020-03-01, four columns of
# standard-normal draws.
# The generator is seeded (42 is arbitrary) so that Restart & Run All
# reproduces the same numbers. NOTE: the outputs saved in this notebook came
# from an earlier unseeded run and will differ until it is re-executed.
d = pd.date_range('20200301', periods=10)
df = pd.DataFrame(
    np.random.default_rng(42).standard_normal((10, 4)),
    index=d,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2020-03-01,0.27775,-0.564579,-0.804803,1.320366
2020-03-02,0.201037,0.430116,-1.218913,-1.212097
2020-03-03,0.747128,-0.093989,-0.408096,0.3262
2020-03-04,-0.594905,0.932909,-0.780018,-1.182389
2020-03-05,-1.850293,-0.379102,-0.31797,0.111205
2020-03-06,1.599403,0.425211,0.22861,0.196597
2020-03-07,0.50533,-0.359633,-0.042903,1.775852
2020-03-08,0.865323,0.221063,0.39785,0.433431
2020-03-09,-0.273269,1.343671,-0.25985,0.444786
2020-03-10,0.212389,-0.689069,-1.557302,0.011357


In [5]:
# Mean of each column (reduce down the rows).
df.mean(axis=0)

A    0.168989
B    0.126660
C   -0.476340
D    0.222531
dtype: float64

In [6]:
# Mean of each row (reduce across the columns).
df.mean(axis='columns')

2020-03-01    0.057184
2020-03-02   -0.449964
2020-03-03    0.142810
2020-03-04   -0.406101
2020-03-05   -0.609040
2020-03-06    0.612455
2020-03-07    0.469661
2020-03-08    0.479417
2020-03-09    0.313834
2020-03-10   -0.505656
Freq: D, dtype: float64

In [14]:
# Subtracting a Series from a DataFrame broadcasts the Series across every column.
# Build that Series on the same date index, shifted down two positions so the
# first two entries become NaN and the last two source values drop off.
s = pd.Series(data=[1, 2, 3, np.nan, 4, 5, 6, 7, 7, 9], index=d).shift(periods=2)
s

2020-03-01    NaN
2020-03-02    NaN
2020-03-03    1.0
2020-03-04    2.0
2020-03-05    3.0
2020-03-06    NaN
2020-03-07    4.0
2020-03-08    5.0
2020-03-09    6.0
2020-03-10    7.0
Freq: D, dtype: float64

In [15]:
# Align s with df's row index and subtract it from every column;
# rows where s is NaN come out as NaN in all columns.
df.sub(s, axis=0)

Unnamed: 0,A,B,C,D
2020-03-01,,,,
2020-03-02,,,,
2020-03-03,-0.252872,-1.093989,-1.408096,-0.6738
2020-03-04,-2.594905,-1.067091,-2.780018,-3.182389
2020-03-05,-4.850293,-3.379102,-3.31797,-2.888795
2020-03-06,,,,
2020-03-07,-3.49467,-4.359633,-4.042903,-2.224148
2020-03-08,-4.134677,-4.778937,-4.60215,-4.566569
2020-03-09,-6.273269,-4.656329,-6.25985,-5.555214
2020-03-10,-6.787611,-7.689069,-8.557302,-6.988643


In [17]:
# Applying a function to a DataFrame:
# running (cumulative) sum down each column.
df.apply(np.cumsum, axis=0)

Unnamed: 0,A,B,C,D
2020-03-01,0.27775,-0.564579,-0.804803,1.320366
2020-03-02,0.478788,-0.134463,-2.023717,0.108269
2020-03-03,1.225916,-0.228453,-2.431813,0.434469
2020-03-04,0.631011,0.704456,-3.211831,-0.74792
2020-03-05,-1.219283,0.325355,-3.529801,-0.636715
2020-03-06,0.38012,0.750565,-3.301191,-0.440118
2020-03-07,0.88545,0.390932,-3.344094,1.335734
2020-03-08,1.750773,0.611995,-2.946244,1.769165
2020-03-09,1.477503,1.955667,-3.206095,2.213951
2020-03-10,1.689893,1.266598,-4.763397,2.225308


In [18]:
# Apply a lambda to each column: its range (maximum minus minimum).
df.apply(lambda col: col.max() - col.min())

A    3.449696
B    2.032741
C    1.955152
D    2.987948
dtype: float64

In [19]:
# value_counts: how many times each value occurs (NaN entries are not counted).
s.value_counts(dropna=True)

1.0    1
2.0    1
3.0    1
4.0    1
5.0    1
6.0    1
7.0    1
Name: count, dtype: int64

In [22]:
# String operations on a Series go through the .str accessor;
# the missing (NaN) entry stays NaN in the result.
s_s = pd.Series(data=['edureka', 'python', 'jupyter', np.nan, 'football', 'world'])
s_s.str.upper()

0     EDUREKA
1      PYTHON
2     JUPYTER
3         NaN
4    FOOTBALL
5       WORLD
dtype: object

In [23]:
# Union: split one frame into row-wise pieces that share the same columns,
# so they can be concatenated back into a single DataFrame.
# The generator is seeded (43 is arbitrary) so re-runs give the same frame.
# NOTE: the outputs saved in this notebook came from an earlier unseeded run
# and will differ until it is re-executed.
df_u = pd.DataFrame(np.random.default_rng(43).standard_normal((10, 4)))
# .iloc makes it explicit that these are positional row slices: 3 + 4 + 3 rows.
df_pieces = [df_u.iloc[:3], df_u.iloc[3:7], df_u.iloc[7:]]
df_pieces

[          0         1         2         3
 0  0.246274 -0.341634 -0.328080  0.940109
 1 -0.150006  1.349679  0.704236 -0.635245
 2 -0.608664 -0.160128  0.943316  0.713887,
           0         1         2         3
 3  0.481974  0.225676 -0.774541  0.435179
 4  0.526311  0.586677 -0.156885  0.925175
 5 -0.805844  2.297524 -0.670114  0.376062
 6 -1.649636 -0.725083  1.088575  0.416930,
           0         1         2         3
 7 -1.313430  0.122037 -0.997698 -0.660084
 8  1.124259 -0.676840 -0.645520 -0.815803
 9  1.115562 -1.293456 -0.519894  0.172832]

In [24]:
# Stack the pieces back together row-wise into one DataFrame.
pd.concat(df_pieces, axis=0)

Unnamed: 0,0,1,2,3
0,0.246274,-0.341634,-0.32808,0.940109
1,-0.150006,1.349679,0.704236,-0.635245
2,-0.608664,-0.160128,0.943316,0.713887
3,0.481974,0.225676,-0.774541,0.435179
4,0.526311,0.586677,-0.156885,0.925175
5,-0.805844,2.297524,-0.670114,0.376062
6,-1.649636,-0.725083,1.088575,0.41693
7,-1.31343,0.122037,-0.997698,-0.660084
8,1.124259,-0.67684,-0.64552,-0.815803
9,1.115562,-1.293456,-0.519894,0.172832


In [28]:
# merge: join two DataFrames on a key column.
# Only A == 2 appears in both frames, so the inner join yields a single row.
left = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
right = pd.DataFrame(data={'A': [3, 2], 'D': [4, 5]})

left.merge(right, on='A', how='inner')

Unnamed: 0,A,B,D
0,2,4,5


In [31]:
# groupby: group rows by one column and sum the remaining columns.
# NOTE(review): B holds continuous random floats, so each group here is a
# single row; a categorical key would show the aggregation more clearly.
df.groupby(by='B').sum()

Unnamed: 0_level_0,A,C,D
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.689069,0.212389,-1.557302,0.011357
-0.564579,0.27775,-0.804803,1.320366
-0.379102,-1.850293,-0.31797,0.111205
-0.359633,0.50533,-0.042903,1.775852
-0.093989,0.747128,-0.408096,0.3262
0.221063,0.865323,0.39785,0.433431
0.425211,1.599403,0.22861,0.196597
0.430116,0.201037,-1.218913,-1.212097
0.932909,-0.594905,-0.780018,-1.182389
1.343671,-0.273269,-0.25985,0.444786


In [32]:
# Grouping also works on several columns at once; the keys form a MultiIndex.
df.groupby(by=['B', 'C']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,A,D
B,C,Unnamed: 2_level_1,Unnamed: 3_level_1
-0.689069,-1.557302,0.212389,0.011357
-0.564579,-0.804803,0.27775,1.320366
-0.379102,-0.31797,-1.850293,0.111205
-0.359633,-0.042903,0.50533,1.775852
-0.093989,-0.408096,0.747128,0.3262
0.221063,0.39785,0.865323,0.433431
0.425211,0.22861,1.599403,0.196597
0.430116,-1.218913,0.201037,-1.212097
0.932909,-0.780018,-0.594905,-1.182389
1.343671,-0.25985,-0.273269,0.444786
