## Pandas基础命令速查清单

In [None]:
import pandas as pd # 导入pandas库并简写为pd
import numpy as np # 导入numpy库并简写为np

### 速查表内容概要
* 数据的导入
* 数据的导出
* 创建测试对象
* 数据的查看与检查
* 数据的选取
* 数据的清洗
* 数据的过滤(filter),排序(sort)和分组(groupby)
* 数据的连接(join)与组合(combine)
* 数据的统计

### 数据的导入

In [None]:
pd.read_csv(filename) # Load data from a CSV file
pd.read_table(filename) # Load data from a delimited text file (e.g. TSV)
pd.read_excel(filename) # Load data from an Excel file
pd.read_sql(query, connection_object) # Load data from a SQL table/database
pd.read_json(json_string) # Load data from a JSON string, a URL, or a file
pd.read_html(url) # Load the DataFrame tables found in a parsed URL
pd.read_clipboard() # Load data from the system clipboard
pd.DataFrame(dict)  # Load data from a Python dict: keys become the column headers, values the column contents

### 数据的导出

In [None]:
df.to_csv(filename) # Export the DataFrame's data to a CSV file
df.to_excel(filename) # Export the DataFrame's data to an Excel file
df.to_sql(table_name,connection_object) # Export the DataFrame's data to a SQL table/database
df.to_json(filename) # Export the DataFrame's data to a JSON file

### 创建测试对象

In [4]:
pd.DataFrame(np.random.rand(10,5)) # Create a DataFrame of random floats with 10 rows and 5 columns

Unnamed: 0,0,1,2,3,4
0,0.634402,0.355638,0.86357,0.751466,0.194227
1,0.37704,0.851369,0.752765,0.688138,0.668549
2,0.321805,0.975655,0.729042,0.96251,0.074928
3,0.182599,0.111375,0.748255,0.119103,0.03022
4,0.515177,0.135732,0.087984,0.552803,0.690334
5,0.040929,0.702454,0.374012,0.813897,0.427638
6,0.155982,0.247538,0.109571,0.286475,0.822097
7,0.418854,0.541468,0.99461,0.840608,0.073753
8,0.781391,0.817624,0.07223,0.632914,0.111849
9,0.658036,0.546919,0.477704,0.92426,0.207392


In [5]:
my_list = ['Kesci',100,'欢迎来到科赛网']
pd.Series(my_list) # Create a Series from the iterable my_list

0      Kesci
1        100
2    欢迎来到科赛网
dtype: object

In [6]:
# Build a 10x5 frame of random floats, then label its rows with consecutive
# calendar days starting on 2017-01-01 (one date per row).
df = pd.DataFrame(np.random.rand(10, 5))
df.index = pd.date_range(start='2017/1/1', periods=len(df))
df

Unnamed: 0,0,1,2,3,4
2017-01-01,0.963159,0.820896,0.399087,0.304181,0.312486
2017-01-02,0.850595,0.905924,0.995795,0.988338,0.486389
2017-01-03,0.86683,0.292841,0.090886,0.678776,0.379236
2017-01-04,0.09023,0.115961,0.660653,0.919876,0.966759
2017-01-05,0.780378,0.244714,0.632545,0.838008,0.986073
2017-01-06,0.220358,0.538243,0.753736,0.983747,0.621471
2017-01-07,0.859286,0.647312,0.98077,0.418091,0.026598
2017-01-08,0.177553,0.891234,0.225953,0.45088,0.100156
2017-01-09,0.362951,0.115492,0.059787,0.810268,0.342332
2017-01-10,0.345156,0.585243,0.209405,0.911806,0.073228


### 数据的查看与检查

In [7]:
df = pd.DataFrame(np.random.rand(10,5))
df.head(3) # Show the first n rows of the DataFrame (here n=3)

Unnamed: 0,0,1,2,3,4
0,0.765292,0.785036,0.103939,0.355682,0.902995
1,0.594721,0.06858,0.657468,0.984913,0.697774
2,0.456092,0.323296,0.444101,0.260772,0.985973


In [8]:
df.tail(3) # Show the last n rows of the DataFrame (here n=3)

Unnamed: 0,0,1,2,3,4
7,0.939819,0.112157,0.705185,0.410222,0.156142
8,0.052433,0.690768,0.65938,0.653929,0.076478
9,0.467074,0.634435,0.463093,0.058645,0.591704


In [9]:
df.shape # Show the DataFrame's number of rows and columns

(10, 5)

In [10]:
df.info() # Show the DataFrame's index, column dtypes and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 5 columns):
0    10 non-null float64
1    10 non-null float64
2    10 non-null float64
3    10 non-null float64
4    10 non-null float64
dtypes: float64(5)
memory usage: 528.0 bytes


In [11]:
df.describe() # Descriptive statistics for the numeric columns

Unnamed: 0,0,1,2,3,4
count,10.0,10.0,10.0,10.0,10.0
mean,0.571427,0.459317,0.4758,0.462532,0.475745
std,0.253615,0.328099,0.216911,0.308714,0.370422
min,0.052433,0.06858,0.103939,0.058645,0.076478
25%,0.458837,0.137268,0.334472,0.265162,0.160341
50%,0.58162,0.478866,0.507766,0.382952,0.387732
75%,0.739192,0.761469,0.658902,0.615392,0.831196
max,0.939819,0.890339,0.705185,0.984913,0.985973


In [12]:
s = pd.Series([1,2,3,3,4,np.nan,5,5,5,6,7])
# Count how often each distinct value occurs; dropna=False also counts NaN
s.value_counts(dropna=False)

5.0    3
3.0    2
7.0    1
6.0    1
NaN    1
4.0    1
2.0    1
1.0    1
dtype: int64

In [None]:
df.apply(pd.Series.value_counts) # Count occurrences of each distinct value in every column of the DataFrame

### 数据的选取

In [14]:
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df['C'] # Return the selected column as a Series

0    0.746377
1    0.065229
2    0.725108
3    0.964221
4    0.586361
Name: C, dtype: float64

In [15]:
df[['B','E']] # Return the selected columns as a new DataFrame

Unnamed: 0,B,E
0,0.511264,0.689613
1,0.529829,0.233881
2,0.705478,0.455478
3,0.467087,0.396926
4,0.182365,0.959671


In [16]:
s = pd.Series(np.array(['I','Love','Data']))
s.iloc[0] # Select by integer position

'I'

In [17]:
s.loc[1] # Select by index label

'Love'

In [18]:
df = pd.DataFrame(np.random.rand(5,5),columns=list('ABCDE'))
df.iloc[0,:] # Select the first row

A    0.872197
B    0.635680
C    0.635601
D    0.703961
E    0.040816
Name: 0, dtype: float64

In [19]:
df.iloc[0,0] # Select the first element of the first row

0.8721967738611884

### 数据的清洗 

In [21]:
# Sample frame with missing values in A and B; the scalar 'foo' is repeated
# for every row of column C.
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
                 'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
                  'C':'foo'})
df

Unnamed: 0,A,B,C
0,1.0,,foo
1,,4.0,foo
2,2.0,,foo
3,3.0,5.0,foo
4,6.0,9.0,foo
5,,,foo


In [22]:
df.columns = ['a','b','c'] # Rename the DataFrame's columns
df

Unnamed: 0,a,b,c
0,1.0,,foo
1,,4.0,foo
2,2.0,,foo
3,3.0,5.0,foo
4,6.0,9.0,foo
5,,,foo


In [24]:
pd.isnull(df) # Check for null values; returns a DataFrame of booleans (True where the value is null, False otherwise)

Unnamed: 0,a,b,c
0,False,True,False
1,True,False,False
2,False,True,False
3,False,False,False
4,False,False,False
5,True,True,False


In [25]:
pd.notnull(df) # Check for non-null values; returns a DataFrame of booleans (True where the value is not null, False otherwise)

Unnamed: 0,a,b,c
0,True,False,True
1,False,True,True
2,True,False,True
3,True,True,True
4,True,True,True
5,False,False,True


In [26]:
df.dropna() # Drop the rows of the DataFrame that contain null values

Unnamed: 0,a,b,c
3,3.0,5.0,foo
4,6.0,9.0,foo


In [27]:
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
                 'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
                  'C':'foo'})
df.dropna(axis=1) # Drop the columns of the DataFrame that contain null values

Unnamed: 0,C
0,foo
1,foo
2,foo
3,foo
4,foo
5,foo


In [35]:
# Sample frame: A holds 4 non-NaN values, B holds 3, C holds 6.
df = pd.DataFrame({
    'A': np.array([1, np.nan, 2, 3, 6, np.nan]),
    'B': np.array([np.nan, 4, np.nan, 5, 9, np.nan]),
    'C': 'foo',
})
# thresh=n keeps only the columns that hold at least n non-NaN values
test = df.dropna(axis='columns', thresh=4)
test

Unnamed: 0,A,C
0,1.0,foo
1,,foo
2,2.0,foo
3,3.0,foo
4,6.0,foo
5,,foo


In [36]:
df = pd.DataFrame({'A':np.array([1,np.nan,2,3,6,np.nan]),
                 'B':np.array([np.nan,4,np.nan,5,9,np.nan]),
                  'C':'foo'})
df.fillna('Test') # Replace every null value in the DataFrame with the given value (here 'Test')

Unnamed: 0,A,B,C
0,1,Test,foo
1,Test,4,foo
2,2,Test,foo
3,3,5,foo
4,6,9,foo
5,Test,Test,foo


In [37]:
s = pd.Series([1,3,5,np.nan,7,9,9])
s.fillna(s.mean()) # Replace every null value with the mean of the Series

0    1.000000
1    3.000000
2    5.000000
3    5.666667
4    7.000000
5    9.000000
6    9.000000
dtype: float64

In [38]:
s = pd.Series([1,3,5,np.nan,7,9,9])
s.astype(float) # Convert the Series to float dtype

0    1.0
1    3.0
2    5.0
3    NaN
4    7.0
5    9.0
6    9.0
dtype: float64

In [39]:
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace(1,'one') # Replace every 1 in the Series with 'one'

0    one
1      3
2      5
3    NaN
4      7
5      9
6      9
dtype: object

In [40]:
s = pd.Series([1,3,5,np.nan,7,9,9])
s.replace([1,3],['one','three']) # Replace every 1 in the Series with 'one' and every 3 with 'three'

0      one
1    three
2        5
3      NaN
4        7
5        9
6        9
dtype: object

In [41]:
df = pd.DataFrame(np.random.rand(4,4))
df.rename(columns=lambda x: x+ 2) # Rename all columns by applying a function to each label (here: add 2)

Unnamed: 0,2,3,4,5
0,0.563575,0.709371,0.265594,0.932939
1,0.235253,0.832427,0.815837,0.270599
2,0.513409,0.125339,0.326002,0.141374
3,0.897966,0.642348,0.070173,0.027051


In [45]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
# Rename selected columns through an {old: new} mapping, modifying df in place
df.rename(columns={'A':'newA','C':'newC'},inplace=True)
df

Unnamed: 0,newA,B,newC,D,E
0,0.546749,0.201402,0.560444,0.371171,0.632095
1,0.026515,0.926858,0.102794,0.267676,0.122735
2,0.302172,0.741071,0.299166,0.936528,0.510771
3,0.909332,0.873563,0.000669,0.103857,0.377998
4,0.07529,0.777703,0.76105,0.541137,0.446709
5,0.552087,0.140597,0.430146,0.135321,0.515097
6,0.944791,0.612914,0.374802,0.80214,0.833065
7,0.371102,0.617005,0.037582,0.097527,0.479833
8,0.646486,0.299243,0.880716,0.06384,0.490588
9,0.93273,0.297374,0.206253,0.282748,0.995676


In [46]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.set_index('B') # Use column B as the index

Unnamed: 0_level_0,A,C,D,E
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.326475,0.984267,0.629362,0.681031,0.85208
0.070654,0.293621,0.411217,0.43855,0.398432
0.516076,0.836976,0.147567,0.459705,0.380413
0.220162,0.124132,0.894449,0.670004,0.084439
0.396473,0.663114,0.754589,0.557761,0.161839
0.926137,0.308237,0.462411,0.82532,0.391589
0.143772,0.181956,0.3853,0.878762,0.847838
0.506345,0.117967,0.585279,0.841842,0.231626
0.104332,0.16958,0.466322,0.338995,0.265032
0.980335,0.608795,0.255042,0.44086,0.627227


In [47]:
df = pd.DataFrame(np.random.rand(10,5))
df.rename(index = lambda x: x+ 1) # Rename all index labels by applying a function to each (here: add 1)

Unnamed: 0,0,1,2,3,4
1,0.461671,0.107532,0.317705,0.079651,0.888486
2,0.265748,0.462134,0.361422,0.405359,0.824924
3,0.470611,0.834543,0.556082,0.829294,0.109666
4,0.793486,0.373601,0.970351,0.990563,0.931384
5,0.83785,0.080645,0.698769,0.637389,0.621595
6,0.613416,0.34823,0.272087,0.434782,0.446874
7,0.71521,0.204066,0.316334,0.245273,0.718483
8,0.478063,0.777872,0.585363,0.625727,0.081501
9,0.508813,0.630824,0.076113,0.57491,0.50201
10,0.759369,0.379245,0.723166,0.215375,0.903184


### 数据的过滤(filter),排序(sort)和分组(groupby)

In [48]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df[df['A'] > 0.5] # Select the rows of df whose value in column A is greater than 0.5

Unnamed: 0,A,B,C,D,E
0,0.909252,0.109838,0.126994,0.716031,0.988304
3,0.721844,0.517243,0.634656,0.786882,0.816373
6,0.739253,0.133013,0.448689,0.745554,0.476678


In [50]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df[(df['C'] > 0.5) & (df['D'] < 0.7)] # Select the rows of df where column C is greater than 0.5 and column D is less than 0.7

Unnamed: 0,A,B,C,D,E
0,0.933095,0.33458,0.523989,0.44013,0.805109
3,0.977551,0.22443,0.66287,0.407684,0.569894
4,0.197546,0.287472,0.650065,0.445697,0.519687
5,0.199163,0.550599,0.545962,0.131987,0.180043
8,0.708073,0.442256,0.787694,0.12417,0.453866


In [51]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values('E') # Sort df by column E in ascending order

Unnamed: 0,A,B,C,D,E
0,0.091952,0.521541,0.565988,0.164624,0.063582
2,0.793279,0.422693,0.019301,0.086624,0.082285
8,0.190326,0.251379,0.198672,0.655567,0.351445
7,0.30217,0.235869,0.586312,0.660696,0.371227
5,0.581432,0.16612,0.506205,0.434296,0.43496
6,0.321996,0.709088,0.070161,0.906377,0.489498
4,0.784812,0.481281,0.759903,0.440438,0.557973
3,0.722797,0.170516,0.264985,0.474116,0.660813
9,0.614661,0.310664,0.341355,0.862849,0.91998
1,0.905036,0.360343,0.694609,0.586409,0.973309


In [52]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values('A',ascending=False) # Sort df by column A in descending order

Unnamed: 0,A,B,C,D,E
6,0.867114,0.953132,0.758838,0.512169,0.130221
9,0.795766,0.266995,0.890892,0.173177,0.488046
0,0.572045,0.174289,0.257427,0.982038,0.317506
1,0.510722,0.525129,0.604497,0.762856,0.681909
8,0.44012,0.728978,0.164967,0.823076,0.152091
3,0.342193,0.3331,0.068721,0.595269,0.813263
2,0.287219,0.61854,0.01882,0.717538,0.522721
7,0.214976,0.141056,0.262968,0.881247,0.586601
5,0.15468,0.409368,0.282301,0.587636,0.560715
4,0.086726,0.959436,0.141383,0.729765,0.748645


In [53]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.sort_values(['A','E'],ascending=[True,False]) # Sort df by column A ascending, then by column E descending

Unnamed: 0,A,B,C,D,E
3,0.016943,0.471514,0.932559,0.488036,0.703412
5,0.137504,0.24645,0.959771,0.838838,0.248183
7,0.172073,0.638365,0.311996,0.43503,0.195289
0,0.388613,0.196757,0.971956,0.355482,0.106987
1,0.461504,0.471244,0.768799,0.513544,0.660456
8,0.476283,0.621237,0.897197,0.264962,0.447765
4,0.895001,0.179432,0.115099,0.726711,0.486595
6,0.938014,0.801542,0.794802,0.723436,0.994026
9,0.945632,0.549982,0.948069,0.858895,0.847194
2,0.950061,0.248854,0.906265,0.709788,0.90598


In [54]:
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})
df.groupby('A').count() # Group df by column A and count the entries of each other column per group

Unnamed: 0_level_0,B,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,2,2,2
foo,4,4,4


In [55]:
df = pd.DataFrame({'A': np.array(['foo', 'foo', 'foo', 'foo', 'bar', 'bar']),
                   'B': np.array(['one', 'one', 'two', 'two', 'three', 'three']),
                   'C': np.array(['small', 'medium', 'large', 'large', 'small', 'small']),
                   'D': np.array([1, 2, 2, 3, 3, 5])})

# Group df by columns B and C and sum the numeric columns within each group.
# numeric_only=True keeps the text column A out of the result; pandas >= 2.0
# would otherwise "sum" it by concatenating its strings.
df.groupby(['B', 'C']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,D
B,C,Unnamed: 2_level_1
one,medium,2
one,small,1
three,small,8
two,large,5


In [56]:
df = pd.DataFrame({'A':np.array(['foo','foo','foo','foo','bar','bar']),
      'B':np.array(['one','one','two','two','three','three']),
     'C':np.array(['small','medium','large','large','small','small']),
     'D':np.array([1,2,2,3,3,5])})
df.groupby('B')['D'].mean() # Group df by column B, then return the mean of column D for each group

B
one      1.5
three    4.0
two      2.5
Name: D, dtype: float64

In [57]:
df = pd.DataFrame({'A': np.array(['foo', 'foo', 'foo', 'foo', 'bar', 'bar']),
                   'B': np.array(['one', 'one', 'two', 'two', 'three', 'three']),
                   'C': np.array(['small', 'medium', 'large', 'large', 'small', 'small']),
                   'D': np.array([1, 2, 2, 3, 3, 5])})

# Build a pivot table: rows are indexed by (A, B), the distinct values of C
# become the columns, and the values of D are aggregated with a sum.
# `values` names the column to aggregate explicitly (the frame itself must not
# be passed as the first argument of the method form), and the string 'sum'
# is used instead of np.sum, which recent pandas versions warn about.
df.pivot_table(values=['D'], index=['A', 'B'],
               columns=['C'], aggfunc='sum')

Unnamed: 0_level_0,Unnamed: 1_level_0,D,D,D
Unnamed: 0_level_1,C,large,medium,small
A,B,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bar,three,,,8.0
foo,one,,2.0,1.0
foo,two,5.0,,


In [58]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.apply(np.mean) # Compute the mean of each column of df

A    0.406206
B    0.551547
C    0.421584
D    0.418415
E    0.413763
dtype: float64

In [59]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.apply(np.max,axis=1) # Compute the maximum of each row of df

0    0.758974
1    0.937730
2    0.924770
3    0.966632
4    0.777907
5    0.710353
6    0.921334
7    0.915647
8    0.884015
9    0.833416
dtype: float64

### 数据的连接(join)与组合(combine)

In [60]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']},
                   index=[4, 5, 6, 7])

# Append the rows of df2 below the rows of df1; both frames should have the
# same columns. DataFrame.append was removed in pandas 2.0, so the row-wise
# pd.concat (default axis=0) is used instead and gives the same result.
pd.concat([df1, df2])

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


In [61]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                   index=[0, 1, 2, 3])
df2 = pd.DataFrame({'A': ['A4', 'A5', 'A6', 'A7'],
                    'B': ['B4', 'B5', 'B6', 'B7'],
                    'C': ['C4', 'C5', 'C6', 'C7'],
                    'D': ['D4', 'D5', 'D6', 'D7']},
                   index=[4, 5, 6, 7])
# Append df2's columns after the last column of df1; df1 and df2 should have
# the same rows. Here their index labels (0-3 vs 4-7) do not overlap, so the
# cells without a matching row come out as NaN.
pd.concat([df1,df2],axis=1)

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
4,,,,,A4,B4,C4,D4
5,,,,,A5,B5,C5,D5
6,,,,,A6,B6,C6,D6
7,,,,,A7,B7,C7,D7


In [62]:
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],           
                     'B': ['B0', 'B1', 'B2', 'B3'],
                     'key': ['K0', 'K1', 'K0', 'K1']})
df2 = pd.DataFrame({'C': ['C0', 'C1'],
                      'D': ['D0', 'D1']},
                     index=['K0', 'K1'])
# Join df1 with df2 by matching df1's 'key' column against df2's index.
# NOTE(review): the original note called this an inner join, but no how= is
# passed; pandas documents the default for DataFrame.join as a left join.
df1.join(df2, on='key')

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K0,C0,D0
3,A3,B3,K1,C1,D1


### 数据的统计

In [63]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.describe() # Descriptive statistics for each column of df

Unnamed: 0,A,B,C,D,E
count,10.0,10.0,10.0,10.0,10.0
mean,0.472611,0.444383,0.438426,0.565316,0.30676
std,0.274014,0.329841,0.222911,0.250163,0.229594
min,0.030337,0.04011,0.13421,0.13204,0.015299
25%,0.374933,0.174824,0.305477,0.472755,0.148285
50%,0.536927,0.415575,0.429983,0.662678,0.2458
75%,0.611253,0.659784,0.504619,0.680887,0.466553
max,0.869924,0.950292,0.888192,0.910587,0.706516


In [64]:
df.mean() # Mean of each column of df

A    0.472611
B    0.444383
C    0.438426
D    0.565316
E    0.306760
dtype: float64

In [65]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.corr() # Correlation coefficient between each column of df and every other column

Unnamed: 0,A,B,C,D,E
A,1.0,-0.37497,-0.288821,-0.367187,0.580316
B,-0.37497,1.0,-0.105793,0.369996,-0.42443
C,-0.288821,-0.105793,1.0,0.167675,-0.014294
D,-0.367187,0.369996,0.167675,1.0,0.029428
E,0.580316,-0.42443,-0.014294,0.029428,1.0


In [66]:
df = pd.DataFrame(np.random.rand(10,5),columns=list('ABCDE'))
df.count() # Number of non-null values in each column of df

A    10
B    10
C    10
D    10
E    10
dtype: int64

In [None]:
df.max() # Maximum of each column of df
df.min() # Minimum of each column of df
df.median() # Median of each column of df
df.std() # Standard deviation of each column of df