# 第1节：数据的排序
所有的方法均可以适用于Series和DataFrame

以下每个例子都给出【Series和DataFrame】2种方案

---

In [37]:
# 根据索引进行排序
# .sort_index(axis=0, ascending=True)
# axis指定轴，ascending指定升序或者降序【默认：0轴，升序】

#-------------------------------------------------#
# DataFrame例子

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Demo frame: the integers 0..19 laid out as 4 rows x 5 columns.
# Neither the row labels nor the column labels are in alphabetical order,
# which makes the effect of sorting visible in the cells that follow.
data = np.arange(20).reshape(4, 5)
index = list('cabd')
columns = list('ABDEC')
a = pd.DataFrame(data=data, index=index, columns=columns)
a

Unnamed: 0,A,B,D,E,C
c,0,1,2,3,4
a,5,6,7,8,9
b,10,11,12,13,14
d,15,16,17,18,19


In [38]:
a.sort_index() # default call: sorts the rows by index label in ascending order

Unnamed: 0,A,B,D,E,C
a,5,6,7,8,9
b,10,11,12,13,14
c,0,1,2,3,4
d,15,16,17,18,19


In [39]:
# Explicit arguments: sort along axis 1 (the column labels) in descending order.
a.sort_index(axis='columns', ascending=False)

Unnamed: 0,E,D,C,B,A
c,3,2,4,1,0
a,8,7,9,6,5
b,13,12,14,11,10
d,18,17,19,16,15


In [40]:
# Series.sort_index(): order the entries of a Series by their index labels.
b = pd.Series(data=[2, 3, 1, -1], index=['c', 'a', 'b', 'd'])

b.sort_index()

a    3
b    1
c    2
d   -1
dtype: int64

In [41]:
# .sort_values() sorts by value along the given axis.
# Signature: Series.sort_values(axis=0, ascending=True)
# Defaults are axis 0 and ascending order.

b = pd.Series(data=[2, 3, 1, -1], index=['c', 'a', 'b', 'd'])
# ascending=False overrides the default, so the largest value comes first.
b.sort_values(axis=0, ascending=False)

a    3
c    2
b    1
d   -1
dtype: int64

In [42]:
# Signature: DataFrame.sort_values(by, axis=0, ascending=True)
# Defaults are axis 0 and ascending order.
# by: a label, or a list of labels, taken from the axis named by `axis`.
data = np.arange(20).reshape(4, 5)
index = list('cabd')
columns = list('ABDEC')
a = pd.DataFrame(data=data, index=index, columns=columns)

# Sort the rows by column 'A', then by column 'E'; note the values in 'A' and 'E'.
a.sort_values(by=['A', 'E'], axis=0, ascending=True)

Unnamed: 0,A,B,D,E,C
c,0,1,2,3,4
a,5,6,7,8,9
b,10,11,12,13,14
d,15,16,17,18,19


In [43]:
# NaN values are always placed at the end of the sort order.
# A 3x4 frame plus a 4x5 frame: labels present in only one operand
# (row 3 and column 4) come out as NaN in the sum.
a = pd.DataFrame(data=np.arange(12).reshape(3, 4))
b = pd.DataFrame(data=np.arange(20).reshape(4, 5))
c = a.add(b)
# Reorder the columns by the values found in the row labelled 2.
c.sort_values(by=2, axis=1, ascending=True)

Unnamed: 0,0,1,2,3,4
0,0.0,2.0,4.0,6.0,
1,9.0,11.0,13.0,15.0,
2,18.0,20.0,22.0,24.0,
3,,,,,


---
# 第2节：数据的统计分析
![avatar](./image/day4-img4.PNG)
![avatar](./image/day4-img5.PNG)
![avatar](./image/day4-img6.PNG)

In [44]:
# The most powerful one: .describe() returns count, mean, std, min,
# the 25%/50%/75% quantiles and max for every column in a single call.
b = pd.DataFrame(data=np.arange(20).reshape(4, 5))
b.describe()

Unnamed: 0,0,1,2,3,4
count,4.0,4.0,4.0,4.0,4.0
mean,7.5,8.5,9.5,10.5,11.5
std,6.454972,6.454972,6.454972,6.454972,6.454972
min,0.0,1.0,2.0,3.0,4.0
25%,3.75,4.75,5.75,6.75,7.75
50%,7.5,8.5,9.5,10.5,11.5
75%,11.25,12.25,13.25,14.25,15.25
max,15.0,16.0,17.0,18.0,19.0


In [45]:
# .ix has been retired; label-based row selection goes through .loc instead.
# Quick way to pull out the per-column maximum: take the 'max' row of the summary.
b.describe().loc['max', :]

0    15.0
1    16.0
2    17.0
3    18.0
4    19.0
Name: max, dtype: float64

---
# 第3节：数据的累计分析
![avatar](./image/day4-img7.PNG)
![avatar](./image/day4-img8.PNG)

In [46]:
# 4x5 frame of the integers 0..19 with row labels c, a, d, b;
# the column labels are the default 0..4.
b = pd.DataFrame(data=np.arange(20).reshape(4, 5), index=['c', 'a', 'd', 'b'])
b

Unnamed: 0,0,1,2,3,4
c,0,1,2,3,4
a,5,6,7,8,9
d,10,11,12,13,14
b,15,16,17,18,19


In [47]:
# Running total down the rows (axis 0 is the default): each row holds
# the sum of itself and every row above it.
b.cumsum(axis=0)

Unnamed: 0,0,1,2,3,4
c,0,1,2,3,4
a,5,7,9,11,13
d,15,18,21,24,27
b,30,34,38,42,46


In [48]:
# What "adjacent" means here: the window ends at the current row and reaches
# backwards. With a window of 2 the first row has no predecessor, so it is NaN.
b.rolling(window=2).sum()

Unnamed: 0,0,1,2,3,4
c,,,,,
a,5.0,7.0,9.0,11.0,13.0
d,15.0,17.0,19.0,21.0,23.0
b,25.0,27.0,29.0,31.0,33.0


---
# 第4节：数据的相关分析
![avatar](./image/day4-img9.PNG)
![avatar](./image/day4-img10.PNG)
![avatar](./image/day4-img11.PNG)
![avatar](./image/day4-img12.PNG)

In [49]:
# Same 4x5 frame of 0..19 with row labels c, a, d, b.
b = pd.DataFrame(data=np.arange(20).reshape(4, 5), index=['c', 'a', 'd', 'b'])

# Pairwise covariance between the columns.
b.cov()

Unnamed: 0,0,1,2,3,4
0,41.666667,41.666667,41.666667,41.666667,41.666667
1,41.666667,41.666667,41.666667,41.666667,41.666667
2,41.666667,41.666667,41.666667,41.666667,41.666667
3,41.666667,41.666667,41.666667,41.666667,41.666667
4,41.666667,41.666667,41.666667,41.666667,41.666667


In [50]:
# Signature: DataFrame.corr(method='pearson')
# method: one of {'pearson', 'kendall', 'spearman'}

# Draw the 20 standard-normal samples from a locally seeded generator so the
# correlation table is identical on every Restart & Run All. The global NumPy
# random state is left untouched.
# NOTE: output saved before this change came from an unseeded draw; re-run the
# cell to refresh it.
b = pd.DataFrame(
    np.random.RandomState(0).randn(20).reshape(4, 5),
    index=['c', 'a', 'd', 'b'],
)

# Kendall rank correlation between every pair of columns.
b.corr(method='kendall')

Unnamed: 0,0,1,2,3,4
0,1.0,1.0,0.333333,-0.666667,-0.333333
1,1.0,1.0,0.333333,-0.666667,-0.333333
2,0.333333,0.333333,1.0,0.0,0.333333
3,-0.666667,-0.666667,0.0,1.0,0.666667
4,-0.333333,-0.333333,0.333333,0.666667,1.0
