In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

## 1. 加法运算

In [2]:
# Two Series whose indexes only partly overlap (shared labels: 'B' and 'C').
s1 = Series([1, 2, 3], index=list('ABC'))
s2 = Series([4, 5, 6, 7], index=list('BCDE'))
s1, s2

(A    1
 B    2
 C    3
 dtype: int64, B    4
 C    5
 D    6
 E    7
 dtype: int64)

In [3]:
# Addition aligns on index labels; a label present in only one Series yields NaN.
s1 + s2

A    NaN
B    6.0
C    8.0
D    NaN
E    NaN
dtype: float64

In [8]:
# A 2x2 frame and a 3x3 frame; df2 has one extra row ('C') and one extra column ('GZ').
df1 = DataFrame(
    np.arange(4).reshape(2, 2),
    index=list('AB'),
    columns=['BJ', 'SH'],
)
df2 = DataFrame(
    np.arange(9).reshape(3, 3),
    index=list('ABC'),
    columns=['BJ', 'SH', 'GZ'],
)
df1, df2

(   BJ  SH
 A   0   1
 B   2   3,    BJ  SH  GZ
 A   0   1   2
 B   3   4   5
 C   6   7   8)

In [9]:
# Addition aligns on both the row index and the column labels; any cell that is
# missing from either DataFrame yields NaN.
df1 + df2

Unnamed: 0,BJ,GZ,SH
A,0.0,,2.0
B,5.0,,7.0
C,,,


In [11]:
# axis=1 sums across the columns, producing one total per row.
df2.sum(axis=1)

A     3
B    12
C    21
dtype: int64

In [15]:
# axis=0 takes the minimum down the rows, producing one value per column.
df2.min(axis=0)

BJ    0
SH    1
GZ    2
dtype: int32

In [19]:
# Summary statistics (count, mean, std, min, quartiles, max) computed per column.
df2.describe()

Unnamed: 0,BJ,SH,GZ
count,3.0,3.0,3.0
mean,3.0,4.0,5.0
std,3.0,3.0,3.0
min,0.0,1.0,2.0
25%,1.5,2.5,3.5
50%,3.0,4.0,5.0
75%,4.5,5.5,6.5
max,6.0,7.0,8.0


## 2. 排序

In [21]:
# Ten values from np.random.randn, then sorted by value from largest to smallest.
# NOTE(review): no random seed is set in the visible cells, so the values differ on every run.
s1 = Series(np.random.randn(10))
s2 = s1.sort_values(ascending=False)
s2

5    2.317977
2    1.853130
0    1.711269
3    0.836145
9    0.298749
7    0.106060
1   -0.100340
6   -0.123620
4   -0.634701
8   -0.729008
dtype: float64

In [22]:
# Sort by index label instead of by value; each value stays attached to its label.
s2.sort_index()

0    1.711269
1   -0.100340
2    1.853130
3    0.836145
4   -0.634701
5    2.317977
6   -0.123620
7    0.106060
8   -0.729008
9    0.298749
dtype: float64

### DataFrame的排序

In [23]:
# 8 rows x 5 columns of np.random.randn values, with columns labelled 'A'..'E'.
df1 = DataFrame(
    np.random.randn(40).reshape(8, 5),
    columns=list('ABCDE'),
)
df1

Unnamed: 0,A,B,C,D,E
0,-0.466505,0.687861,-0.051717,0.42174,-0.095604
1,-0.937471,1.875467,0.010727,0.249307,0.12915
2,-0.478903,0.338934,-0.674834,-0.503754,0.289106
3,-0.024392,-2.005002,0.394349,1.405161,-0.419832
4,0.884138,-2.687401,-0.450687,0.38124,1.072276
5,-1.025116,-0.657771,0.606858,-0.594983,0.688756
6,-0.733663,-0.337402,-0.325094,-0.873589,-0.486059
7,-1.357362,-0.8748,0.460135,0.682187,0.665752


In [24]:
# Sort a single column by value (ascending by default); the result is a Series
# that keeps the original row labels.
df1['A'].sort_values()

7   -1.357362
5   -1.025116
1   -0.937471
6   -0.733663
2   -0.478903
0   -0.466505
3   -0.024392
4    0.884138
Name: A, dtype: float64

In [26]:
# Sort the whole frame by column 'A' (ascending); rows keep their original index labels.
df2 = df1.sort_values('A')
# A bare assignment displays nothing, so end the cell with the name to render the sorted frame.
df2

Unnamed: 0,A,B,C,D,E
7,-1.357362,-0.8748,0.460135,0.682187,0.665752
5,-1.025116,-0.657771,0.606858,-0.594983,0.688756
1,-0.937471,1.875467,0.010727,0.249307,0.12915
6,-0.733663,-0.337402,-0.325094,-0.873589,-0.486059
2,-0.478903,0.338934,-0.674834,-0.503754,0.289106
0,-0.466505,0.687861,-0.051717,0.42174,-0.095604
3,-0.024392,-2.005002,0.394349,1.405161,-0.419832
4,0.884138,-2.687401,-0.450687,0.38124,1.072276


In [27]:
# Sort the rows by index label.
df2.sort_index()

Unnamed: 0,A,B,C,D,E
0,-0.466505,0.687861,-0.051717,0.42174,-0.095604
1,-0.937471,1.875467,0.010727,0.249307,0.12915
2,-0.478903,0.338934,-0.674834,-0.503754,0.289106
3,-0.024392,-2.005002,0.394349,1.405161,-0.419832
4,0.884138,-2.687401,-0.450687,0.38124,1.072276
5,-1.025116,-0.657771,0.606858,-0.594983,0.688756
6,-0.733663,-0.337402,-0.325094,-0.873589,-0.486059
7,-1.357362,-0.8748,0.460135,0.682187,0.665752


## 3. merge(合并)

### merge 类似于 SQL 中两张表的关联（join）操作，`how` 参数可取 inner、left、right、outer
https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html

In [29]:
# Left table: key 'X' appears twice; 'Y' and 'Z' have no match in the right table.
df1 = DataFrame({
    'key': ['X', 'Y', 'Z', 'X'],
    'data_set_1': [1, 2, 3, 4],
})
# Right table: only 'X' is shared with the left table.
df2 = DataFrame({
    'key': ['X', 'B', 'C'],
    'data_set_2': [4, 5, 6],
})

In [30]:
# With on=None, merge joins on the column names the two frames share (here 'key');
# the default join type is inner.
pd.merge(df1,df2,on=None)

Unnamed: 0,key,data_set_1,data_set_2
0,X,1,4
1,X,4,4


In [31]:
# Inner join: keep only the keys present in both frames.
pd.merge(df1,df2, on='key',how='inner')

Unnamed: 0,key,data_set_1,data_set_2
0,X,1,4
1,X,4,4


In [32]:
# Left join: keep every row of df1; keys missing from df2 get NaN in data_set_2.
pd.merge(df1, df2, on='key', how='left')

Unnamed: 0,key,data_set_1,data_set_2
0,X,1,4.0
1,Y,2,
2,Z,3,
3,X,4,4.0


In [33]:
# Right join: keep every row of df2; keys missing from df1 get NaN in data_set_1.
pd.merge(df1,df2, on='key', how='right')

Unnamed: 0,key,data_set_1,data_set_2
0,X,1.0,4
1,X,4.0,4
2,B,,5
3,C,,6


In [34]:
# Outer join: keep the keys of both frames; whichever side has no match gets NaN.
pd.merge(df1,df2, on='key', how='outer')

Unnamed: 0,key,data_set_1,data_set_2
0,X,1.0,4.0
1,X,4.0,4.0
2,Y,2.0,
3,Z,3.0,
4,B,,5.0
5,C,,6.0


## 4. Concatenate和Combine
#### Concatenate 与 Combine 的区别：concat 把两个对象沿某个轴直接拼接；combine_first 以调用者为主，只用另一个对象的值填补调用者中的 NaN，并补上调用者没有的标签

In [37]:
# Two random frames sharing columns 'X' and 'Y'; 'Z' exists only in df1, 'A' only in df2.
df1 = DataFrame(np.random.randn(4, 3), columns=list('XYZ'))
df2 = DataFrame(np.random.randn(3, 3), columns=list('XYA'))

In [40]:
# Stack df1 and df2 row-wise; a column missing from one frame is filled with NaN
# and each row keeps its original index label.
# sort=False leaves the columns in order of first appearance instead of sorting them.
# Reference: https://pandas.pydata.org/docs/reference/api/pandas.concat.html
pd.concat([df1,df2],sort=False)

Unnamed: 0,X,Y,Z,A
0,-0.081634,1.243486,-0.691394,
1,-2.037463,-0.932389,0.261261,
2,0.207831,0.951473,-0.845902,
3,-0.05615,-1.660343,-0.544819,
0,2.000498,-0.927012,,1.656876
1,0.579775,0.261577,,-0.7551
2,0.703565,0.839213,,2.464525


### Combine

In [41]:
# s1 has gaps (NaN) at 'B' and 'D'; s2 is fully populated on the same index.
s1 = Series([2, np.nan, 4, np.nan], index=list('ABCD'))
s2 = Series([1, 2, 3, 4], index=list('ABCD'))

In [42]:
# combine_first keeps s1's values and fills only s1's NaN positions with the
# value s2 holds at the same index label.
s1.combine_first(s2)

A    2.0
B    2.0
C    4.0
D    4.0
dtype: float64

In [43]:
# df1: rows 1 and 3 are entirely NaN.
df1 = DataFrame(dict(
    X=[1, np.nan, 3, np.nan],
    Y=[5, np.nan, 7, np.nan],
    Z=[9, np.nan, 11, np.nan],
))
# df2: 'Z' has values exactly where df1's 'Z' is NaN; 'A' does not exist in df1.
df2 = DataFrame(dict(
    Z=[np.nan, 10, np.nan, 12],
    A=[1, 2, 3, 4],
))

In [44]:
# The result covers the union of both frames' columns: df1's values win, df1's NaN
# cells are filled from df2 where df2 has that column, and columns that exist only
# in df2 are added.
df1.combine_first(df2)

Unnamed: 0,A,X,Y,Z
0,1.0,1.0,5.0,9.0
1,2.0,,,10.0
2,3.0,3.0,7.0,11.0
3,4.0,,,12.0


## 5. 去重

In [49]:
# Load the sample data and preview the first five rows.
# NOTE(review): the path is relative, so demo_duplicate.csv presumably has to sit in the
# notebook's working directory — confirm.
df_unique = pd.read_csv('demo_duplicate.csv')
df_unique.head()

Unnamed: 0.1,Unnamed: 0,Price,Seqno,Symbol,time
0,0,1623.0,0.0,APPL,1473411962
1,1,1623.0,0.0,APPL,1473411962
2,2,1623.0,0.0,APPL,1473411963
3,3,1623.0,0.0,APPL,1473411963
4,4,1649.0,1.0,APPL,1473411963


In [58]:
# Drop the leftover 'Unnamed: 0' column.
# Rebinding through drop(..., errors='ignore') instead of `del` keeps this cell safe
# to re-run: a second execution is a no-op rather than a KeyError.
df_unique = df_unique.drop(columns=['Unnamed: 0'], errors='ignore')
df_unique.head()

Unnamed: 0,Price,Seqno,Symbol,time
0,1623.0,0.0,APPL,1473411962
1,1623.0,0.0,APPL,1473411962
2,1623.0,0.0,APPL,1473411963
3,1623.0,0.0,APPL,1473411963
4,1649.0,1.0,APPL,1473411963


In [59]:
# (number of rows, number of columns) of the frame.
df_unique.shape

(3989, 4)

In [60]:
# Count the distinct values in the 'Seqno' column (NaN, if present, counts as one value).
df_unique['Seqno'].nunique(dropna=False)

1000

In [61]:
# duplicated() flags each row whose 'Seqno' value already appeared in an earlier row
# (the first occurrence stays False); show the first five flags.
df_unique['Seqno'].duplicated().head()

0    False
1     True
2     True
3     True
4    False
Name: Seqno, dtype: bool

In [62]:
# drop_duplicates `keep` options:
#   'first' - keep the first row of each group of duplicates
#   'last'  - keep the last row of each group of duplicates
#   False   - drop every row that has a duplicate
deduped_by_seqno = df_unique.drop_duplicates(subset=['Seqno'], keep='last')
deduped_by_seqno.head()

Unnamed: 0,Price,Seqno,Symbol,time
3,1623.0,0.0,APPL,1473411963
7,1649.0,1.0,APPL,1473411964
11,1642.0,2.0,APPL,1473411965
15,1636.0,3.0,APPL,1473411966
19,1669.0,4.0,APPL,1473411967


## 6. 时间处理