# 基于 pandas DataFrame 的数据处理

+ 合并数据，将不同的DataFrame数据按行或列合并，形成新的DataFrame
+ 汇总数据，根据关键字对数据进行分组计算

In [2]:
from pandas import Series, DataFrame
import pandas as pd
from numpy.random import randn
import numpy as np
from myfunctions import *
# Print NumPy floating-point values with 4 digits of precision.
np.set_printoptions(precision=4)

## 数据合并

+ pandas.merge 可根据一个或多个关键字将不同DataFrame中的行进行连接，即进行横向的扩展
+ pandas.concat 可以沿着一条轴将多个对象堆叠在一起


In [3]:
# Three small frames that share a 'key' column; each one carries a running
# counter as its payload column so merged rows are easy to trace back.
df1 = DataFrame(dict(key=list('bbacaab'), data1=range(7)))
df2 = DataFrame(dict(key=list('abbd'), data2=range(4)))
df3 = DataFrame(dict(key=list('abbde'), data2=range(5)))

# Show the automatically generated row index of the first frame.
print(df1.index)

RangeIndex(start=0, stop=7, step=1)


+ 选项 on 给定新DataFrame的键
    + 如果不指定，系统会将多个数据中重叠的列作为键，并以此为基础进行连接
    + pd.merge如果不指定合并的形式，则用inner连接，键是交集，
    + 其它方式还包括，"left","right","outer"
    + outer 得到的新表中的键是原数据键的并集 ******

In [7]:
# No `on`/`how` given: pandas joins on the overlapping column(s) and keeps
# only the keys present in both frames (inner join).
dfmerge1 = pd.merge(left=df1, right=df2)
print(dfmerge1)


   data1 key  data2
0      0   b      1
1      0   b      2
2      1   b      1
3      1   b      2
4      6   b      1
5      6   b      2
6      2   a      0
7      4   a      0
8      5   a      0


In [4]:
# Same join key, three join types:
#   default (inner) -> keys present in both frames
#   left            -> every row of df1 is kept
#   outer           -> union of the keys of both frames
dfmerge1 = pd.merge(left=df1, right=df2, on='key')
dfmerge2 = pd.merge(left=df1, right=df2, how="left", on='key')
dfmerge3 = pd.merge(left=df1, right=df2, how="outer", on='key')
# Display the inputs and the three results next to each other.
side_by_side(df1, df2, dfmerge1, dfmerge2, dfmerge3)

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,a
6,6,b

Unnamed: 0,data2,key
0,0,a
1,1,b
2,2,b
3,3,d

Unnamed: 0,data1,key,data2
0,0,b,1
1,0,b,2
2,1,b,1
3,1,b,2
4,6,b,1
5,6,b,2
6,2,a,0
7,4,a,0
8,5,a,0

Unnamed: 0,data1,key,data2
0,0,b,1.0
1,0,b,2.0
2,1,b,1.0
3,1,b,2.0
4,2,a,0.0
5,3,c,
6,4,a,0.0
7,5,a,0.0
8,6,b,1.0
9,6,b,2.0

Unnamed: 0,data1,key,data2
0,0.0,b,1.0
1,0.0,b,2.0
2,1.0,b,1.0
3,1.0,b,2.0
4,6.0,b,1.0
5,6.0,b,2.0
6,2.0,a,0.0
7,4.0,a,0.0
8,5.0,a,0.0
9,3.0,c,


+ 按键进行连接，同一个键在表中有多行，则进行两两组合

In [5]:
# The key 'b' occurs once on the left and twice on the right, so the inner
# join yields one output row per (left row, right row) pairing.
df1 = DataFrame(dict(key=['a', 'b'], data1=range(2)))
df2 = DataFrame(dict(key=['b', 'b'], data2=range(2)))
merge4 = pd.merge(left=df1, right=df2, how='inner')
side_by_side(df1, df2, merge4)

Unnamed: 0,data1,key
0,0,a
1,1,b

Unnamed: 0,data2,key
0,0,b
1,1,b

Unnamed: 0,data1,key,data2
0,1,b,0
1,1,b,1


+ 也可以通过多个键进行合并

In [6]:
# Two frames that overlap on both '代码' and 'key2'.
left = DataFrame({
    '代码': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = DataFrame({
    '代码': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})
# Joining on '代码' alone leaves the other shared column duplicated with
# _x/_y suffixes; joining on both columns treats the pair as a composite key.
merge5 = pd.merge(left=left, right=right, how='outer', on='代码')
merge6 = pd.merge(left=left, right=right, how='inner', on=['代码', 'key2'])
side_by_side(left, right, merge5, merge6)

Unnamed: 0,key2,lval,代码
0,one,1,foo
1,two,2,foo
2,one,3,bar

Unnamed: 0,key2,rval,代码
0,one,4,foo
1,one,5,foo
2,one,6,bar
3,two,7,bar

Unnamed: 0,key2_x,lval,代码,key2_y,rval
0,one,1,foo,one,4
1,one,1,foo,one,5
2,two,2,foo,one,4
3,two,2,foo,one,5
4,one,3,bar,one,6
5,one,3,bar,two,7

Unnamed: 0,key2,lval,代码,rval
0,one,1,foo,4
1,one,1,foo,5
2,one,3,bar,6


+ 索引作为键合并, 使用选项 left_index=True 或者right_index=True
+ 左右表的键名称可以不同，用left_on 和 right_on 指定

In [7]:
# left1 keeps its key in an ordinary column; right1 keeps it in the index.
left1 = DataFrame({
    '代码': ['a', 'b', 'a', 'a', 'b', 'c'],
    'value': range(6),
})
right1 = DataFrame({'平均值': [3.5, 7]}, index=['a', 'b'])

# Match the '代码' column of the left frame against the index of the right one.
pm2 = pd.merge(left=left1, right=right1, left_on='代码', right_index=True)
side_by_side(left1, right1, pm2)


Unnamed: 0,value,代码
0,0,a
1,1,b
2,2,a
3,3,a
4,4,b
5,5,c

Unnamed: 0,平均值
a,3.5
b,7.0

Unnamed: 0,value,代码,平均值
0,0,a,3.5
2,2,a,3.5
3,3,a,3.5
1,1,b,7.0
4,4,b,7.0


### 轴向连接

+ numpy中数组连接 np.concatenate
+ pandas 中 concat

In [8]:
# 3x4 integer matrix holding 0..11 in row-major order.
arr = np.reshape(np.arange(12), (3, 4))
arr

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [9]:
# axis selects the direction of the join; axis=0 appends the rows of the
# second array below the first.
np.concatenate((arr, arr), axis=0)

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11]])

In [10]:
# axis=1 places the columns of the second array to the right of the first.
np.concatenate((arr, arr), axis=1)

array([[ 0,  1,  2,  3,  0,  1,  2,  3],
       [ 4,  5,  6,  7,  4,  5,  6,  7],
       [ 8,  9, 10, 11,  8,  9, 10, 11]])

pd.concat连接Series也受传入的轴的影响
+ axis=0（默认）形成一个Series，axis=1则形成一个DataFrame

In [11]:
# Three Series with (partly overlapping) labels: 'c' appears in s2 and s3.
s1 = Series([0, 1], index=list('ab'))
s2 = Series([2, 3, 4], index=list('cde'))
s3 = Series([5, 6], index=list('cg'))
# Default axis=0: the pieces are stacked end to end into one longer Series,
# duplicate labels included.
s_all = pd.concat((s1, s2, s3))
print(s1, s2, s3, s_all, sep='\n')

a    0
b    1
dtype: int64
c    2
d    3
e    4
dtype: int64
c    5
g    6
dtype: int64
a    0
b    1
c    2
d    3
e    4
c    5
g    6
dtype: int64


In [12]:
# Along the column axis each Series becomes one column of a DataFrame;
# labels missing from a Series show up as NaN.
pd.concat([s1, s2, s3], axis='columns')

Unnamed: 0,0,1,2
a,0.0,,
b,1.0,,
c,,2.0,5.0
d,,3.0,
e,,4.0,
g,,,6.0


In [13]:
# df1 has rows a/b/c, df2 only a/c; values are consecutive integers.
df1 = DataFrame(np.arange(6).reshape((3, 2)),
                columns=['one', 'two'], index=list('abc'))
df2 = DataFrame(np.arange(5, 9).reshape((2, 2)),
                columns=['three', 'four'], index=list('ac'))
# Column-wise concat aligns on the row labels; row 'b' gets NaN for the
# columns that come from df2.
cdf = pd.concat((df1, df2), axis=1)
print(df1, df2, cdf, sep='\n')

   one  two
a    0    1
b    2    3
c    4    5
   three  four
a      5     6
c      7     8
   one  two  three  four
a    0    1    5.0   6.0
b    2    3    NaN   NaN
c    4    5    7.0   8.0


+ 去掉重复数据

In [14]:
# Seven rows, several of which repeat the same (k1, k2) pair.
data = DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
# With no arguments, rows are compared on every column.
newdata1 = data.drop_duplicates()
side_by_side(data, newdata1)

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [15]:
# Add a distinct value per row so the surviving row of each duplicate group
# can be identified in the output.
data['v1'] = range(7)
# To avoid comparing every column, name the columns that decide whether two
# rows count as duplicates.
newdata2 = data.drop_duplicates(subset=['k1'])
# By default the first row of each duplicate group is kept; keep='last'
# keeps the last one instead.
newdata3 = data.drop_duplicates(subset=['k1', 'k2'], keep='last')
side_by_side(data, newdata2, newdata3)

Unnamed: 0,k1,k2,v1
0,one,1,0
1,one,1,1
2,one,2,2
3,two,3,3
4,two,3,4
5,two,4,5
6,two,4,6

Unnamed: 0,k1,k2,v1
0,one,1,0
3,two,3,3

Unnamed: 0,k1,k2,v1
1,one,1,1
2,one,2,2
4,two,3,4
6,two,4,6


## 数据分组计算

+ 对数据进行分组统计和计算
+ 使用groupby 方法

In [16]:
# Toy frame: two string key columns and two numeric columns to aggregate.
df = DataFrame({'key1' : ['a', 'a', 'b', 'b', 'a'],
                'key2' : ['one', 'two', 'one', 'two', 'one'],
                'data1' : range(5),
                'data2' : range(5)})
# Group a single column by the values of another column (a Series as key).
grouped = df['data1'].groupby(df['key1'])
# Group the whole frame by a column name.
a = df.groupby('key1')

print(df)
# Average only the numeric columns: 'key2' holds strings, and pandas 2.x
# raises TypeError for it instead of silently dropping the column.
print(a[['data1', 'data2']].mean())
print(grouped.mean())

   data1  data2 key1 key2
0      0      0    a  one
1      1      1    a  two
2      2      2    b  one
3      3      3    b  two
4      4      4    a  one
         data1     data2
key1                    
a     1.666667  1.666667
b     2.500000  2.500000
key1
a    1.666667
b    2.500000
Name: data1, dtype: float64


+ 用两列分组

In [17]:
# Passing a list of keys groups by every (key1, key2) combination; the
# result is a Series with a two-level index.
means = df.groupby(['key1', 'key2'])['data1'].mean()
print(means)

key1  key2
a     one     2
      two     1
b     one     2
      two     3
Name: data1, dtype: int64


层次化的索引可以通过unstack转化为行列表示

In [18]:
# Pivot the innermost index level into columns to get a key1 x key2 table.
means.unstack(level=-1)

key2,one,two
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,1
b,2,3


In [19]:
# Mean of every remaining column for each (key1, key2) pair, then pivot the
# innermost index level (key2) into a second column level.
dfmeans = df.groupby(by=['key1', 'key2']).mean()
dfmeans.unstack(level=-1)

Unnamed: 0_level_0,data1,data1,data2,data2
key2,one,two,one,two
key1,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
a,2,1,2,1
b,2,3,2,3


只选取部分列进行计算

In [20]:
# Select the columns to aggregate after grouping; the double brackets keep
# the result a DataFrame rather than a Series.
df.groupby(['key1', 'key2'])[['data2']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,data2
key1,key2,Unnamed: 2_level_1
a,one,2
a,two,1
b,one,2
b,two,3


其它分组计算
+ 分位点
+ 使用自己的函数
+ 描述统计量 

In [21]:
# Per-group 90% quantile of 'data1'.
grouped = df.groupby(by='key1')
grouped['data1'].quantile(q=0.9)

key1
a    3.4
b    2.9
Name: data1, dtype: float64

In [22]:
def peak_to_peak(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` may be any object exposing ``max()`` and ``min()`` methods,
    such as a NumPy array or a pandas Series.
    """
    highest = arr.max()
    lowest = arr.min()
    return highest - lowest
# Apply the custom aggregator to the numeric columns only: it subtracts two
# values, which is undefined for the strings in 'key2', and pandas 2.x raises
# TypeError for such columns instead of dropping them.
grouped[['data1', 'data2']].agg(peak_to_peak)

Unnamed: 0_level_0,data1,data2
key1,Unnamed: 1_level_1,Unnamed: 2_level_1
a,4,4
b,1,1


In [23]:
# Load the GBK-encoded stock table; the placeholder '--  ' marks missing
# values, and line 2781 of the file is skipped.
stocks = pd.read_csv('data/A1.csv', encoding="gbk", na_values=['--  '], skiprows=[2781])
# Keep the first 13 columns by position. `.ix` was removed in pandas 1.0, so
# use the purely positional `.iloc`; the copy keeps later relabelling of
# `substocks` independent of `stocks`.
substocks = stocks.iloc[:, :13].copy()
substocks.head()


Unnamed: 0,代码,名称,收入同比%%,利润同比%%,营业利润率%%,市销率,市净率,市现率,市盈(动),总金额,量比,细分行业,地区
0,1,平安银行,34.09,15.02,32.78,3.64,1.12,0.93,7.31,3273803000.0,1.4,银行,深圳
1,2,万 科Ａ,22.72,0.77,17.35,3.13,1.8,-23.08,16.21,3218100000.0,1.33,全国地产,深圳
2,4,国农科技,117.08,125.75,10.98,36.85,26.93,-337.1,905.17,175094000.0,0.79,生物制药,深圳
3,5,世纪星源,52.1,8.05,-64.57,148.17,8.32,155.1,,0.0,0.0,区域地产,深圳
4,6,深振业Ａ,311.53,425.18,17.93,6.29,3.01,-20.19,24.66,504695100.0,0.82,区域地产,深圳


In [24]:
# Replace the 13 original headers with the short labels F0 .. F12.
substocks.columns = ["F%d" % position for position in range(13)]
substocks.head()

Unnamed: 0,F0,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12
0,1,平安银行,34.09,15.02,32.78,3.64,1.12,0.93,7.31,3273803000.0,1.4,银行,深圳
1,2,万 科Ａ,22.72,0.77,17.35,3.13,1.8,-23.08,16.21,3218100000.0,1.33,全国地产,深圳
2,4,国农科技,117.08,125.75,10.98,36.85,26.93,-337.1,905.17,175094000.0,0.79,生物制药,深圳
3,5,世纪星源,52.1,8.05,-64.57,148.17,8.32,155.1,,0.0,0.0,区域地产,深圳
4,6,深振业Ａ,311.53,425.18,17.93,6.29,3.01,-20.19,24.66,504695100.0,0.82,区域地产,深圳


In [25]:
# Group the stocks by column F12 (the last of the 13 retained columns).
grouped = substocks.groupby(by="F12")

In [26]:
# Descriptive statistics of every numeric column within each F12 group.
results = grouped.describe()
# Since pandas 0.20 the statistics (count, mean, ...) are already laid out as
# columns, so selecting F4 gives the per-group table directly; an additional
# unstack() would fold that table back into a Series.
results["F4"]
# NOTE (from the original notebook): the pandas version that produced the
# saved output could not skip NaN in quantile, so the quartile cells of
# affected groups are empty there.



Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
F12,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
上海,218.0,20.434587,111.076893,-555.93,,,,1426.43
云南,30.0,54.184,296.826722,-65.41,-6.115,0.785,8.92,1620.3
内蒙,25.0,274.8452,1508.436832,-1154.43,-3.03,8.24,18.04,7406.67
北京,259.0,8.263552,46.480015,-662.79,2.64,8.67,18.3,79.88
吉林,40.0,12.43,32.093709,-63.17,-0.5775,6.5,18.56,146.28
四川,101.0,-364.761386,3745.902943,-37635.99,-0.02,5.17,17.58,307.37
天津,42.0,2.552143,44.642179,-234.13,0.3325,7.27,15.695,108.63
宁夏,12.0,-58.478333,174.252875,-609.91,-27.1175,-3.69,3.7475,5.51
安徽,88.0,8.131818,17.558783,-35.54,1.0525,5.24,12.745,104.37
山东,160.0,2.423875,63.61793,-692.22,1.4025,5.605,11.72,310.22


+ 显然，上面的例子中数据的清理工作并不完全,需要进一步处理。
+ 关于数据处理和分组计算更详细的内容，请参考 Python for Data Analysis 一书的第7、9章，并对照书附带的 ch07.ipynb 和 ch09.ipynb 进行深入学习。

## 作业

1. 将A1.csv-A5.csv 读入并合并，同一个股票合并为一行。注意，这些表格中都有相同的代码和名称列，可以用它们作为index。也可能有一些冗余的列和行，需要对数据进行清理。
2. 按"细分行业" 将资产分组 
   +  统计平均市盈率，平均价格；##P/E
   + 用总资产作权重，计算加权市盈率和加权平均价格；##加权--》两个列做乘法
   + 用流通市值作权重，计算加权市盈率和加权平均价格；
3. 按"地区" 将资产分组，重新计算第2题中的问题
4. 将上述计算结果合并后存为csv文件。