In [1]:
import numpy as np
import pandas as pd
# Load the sample table that the later cells slice from.
df = pd.read_csv(filepath_or_buffer='../joyful-pandas-master/data/table.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,School,Class,ID,Gender,Address,Height,Weight,Math,Physics
0,S_1,C_1,1101,M,street_1,173,63,34.0,A+
1,S_1,C_1,1102,F,street_2,192,73,32.5,B+
2,S_1,C_1,1103,M,street_2,186,82,87.2,B+
3,S_1,C_1,1104,F,street_2,167,81,80.4,B-
4,S_1,C_1,1105,F,street_4,159,64,84.8,B+


# append：添加行；assign：添加列

## append添加行

### （1）利用序列添加行，必须指定name

In [2]:
# Keep the rows labelled up to and including 3 and only two columns;
# take an explicit copy so this small frame is independent of df.
df_append = df.loc[:3, ['Gender', 'Height']].copy(deep=True)
df_append

Unnamed: 0,Gender,Height
0,M,173
1,F,192
2,M,186
3,F,167


In [3]:
# The Series name ('new_row') becomes the index label of the added row.
s = pd.Series({'Gender': 'F', 'Height': 167}, name='new_row')
# DataFrame.append was removed in pandas 2.0, so wrap the Series in a
# one-row DataFrame and concatenate it below the existing rows instead.
# The result is a new frame; df_append itself is not modified.
pd.concat([df_append, pd.DataFrame([s])])

Unnamed: 0,Gender,Height
0,M,173
1,F,192
2,M,186
3,F,167
new_row,F,167


### （2）用DataFrame添加行

In [5]:
# Two extra rows, labelled 'New1' and 'New2', with the same columns as df_append.
tmp = pd.DataFrame({'Gender': ['F', 'M'], 'Height': [165, 189]},
                   index=['New1', 'New2'])
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames
# row-wise and returns a new frame, leaving df_append unchanged.
pd.concat([df_append, tmp])

Unnamed: 0,Gender,Height
0,M,173
1,F,192
2,M,186
3,F,167
New1,F,165
New2,M,189


## assign添加列，列名由参数指定

In [6]:
# col1 comes from a Series indexed 0..3, col2 from a plain list;
# the keyword names passed to assign() become the new column names.
new_col = pd.Series(data=['a', 'b', 'c', 'd'], index=range(4))
df_append.assign(
    col1=new_col,
    col2=['a', 's', 'd', 'f'],
)

Unnamed: 0,Gender,Height,col1,col2
0,M,173,a,a
1,F,192,b,s
2,M,186,c,d
3,F,167,d,f


# 填充函数combine与update：可基于某种规则进行填充

## combine：是按照表的顺序轮流进行逐列循环，自动索引对齐

In [11]:
# Two slices of df with disjoint row labels (0-1 and 10-11).
df_combine_1 = df.loc[:1, ['Gender', 'Height']].copy(deep=True)
df_combine_2 = df.loc[10:11, ['Gender', 'Height']].copy(deep=True)
# The function only prints the pair of columns it receives, which shows what
# combine() passes in for each column. It returns None, so the displayed
# result contains no values.
df_combine_1.combine(
    df_combine_2,
    lambda left_col, right_col: print(left_col, right_col),
)

0       M
1       F
10    NaN
11    NaN
Name: Gender, dtype: object 0     NaN
1     NaN
10      M
11      F
Name: Gender, dtype: object
0     173.0
1     192.0
10      NaN
11      NaN
Name: Height, dtype: float64 0       NaN
1       NaN
10    161.0
11    175.0
Name: Height, dtype: float64


Unnamed: 0,Gender,Height
0,,
1,,
10,,
11,,


In [13]:
# Fill column by column, keeping whichever column has the larger mean.
df1 = pd.DataFrame(data={'A': [1, 2], 'B': [3, 4]})
df2 = pd.DataFrame(data={'A': [8, 7], 'B': [6, 5]})
df1.combine(
    df2,
    lambda mine, theirs: mine if mine.mean() > theirs.mean() else theirs,
)

Unnamed: 0,A,B
0,8,6
1,7,5


### 索引对齐，后面的表df2没有的行列都会设为NAN

In [22]:
# Rebuild df2 with columns B and C on row labels 1 and 2, so that it only
# partly overlaps df1 in both rows and columns.
df2 = pd.DataFrame(data={'B': [8, 7], 'C': [6, 5]}, index=[1, 2])
df1.combine(
    df2,
    lambda mine, theirs: mine if mine.mean() > theirs.mean() else theirs,
)

Unnamed: 0,A,B,C
0,,,
1,,8.0,6.0
2,,7.0,5.0


### 参数overwrite=False：df1中有而df2中没有的列（如A列）保留原值，不会被NaN覆盖

In [16]:
# Same mean-based rule as before, but with overwrite=False.
df1.combine(
    df2,
    lambda mine, theirs: mine if mine.mean() > theirs.mean() else theirs,
    overwrite=False,
)

Unnamed: 0,A,B,C
0,1.0,,
1,2.0,8.0,6.0
2,,7.0,5.0


### 参数fill_value=-1：在把各列传入函数之前，先用-1填充索引对齐后产生的NaN

In [23]:
# Same mean-based rule as before, but with fill_value=-1.
df1.combine(
    df2,
    lambda mine, theirs: mine if mine.mean() > theirs.mean() else theirs,
    fill_value=-1,
)

Unnamed: 0,A,B,C
0,1.0,-1.0,-1.0
1,2.0,8.0,6.0
2,-1.0,7.0,5.0


## combine_first方法：用df2填补df1的缺失值

In [26]:
# df1 has one missing value in each column; df2 shares column B and adds
# column C, on row labels 1 and 2.
df1 = pd.DataFrame(data={'A': [None, 0], 'B': [4, None]})
df2 = pd.DataFrame(data={'B': [3, 3], 'C': [1, 1]}, index=[1, 2])
# Values from df1 are kept where present; gaps are filled from df2.
df1.combine_first(other=df2)

Unnamed: 0,A,B,C
0,,4.0,
1,0.0,3.0,1.0
2,,3.0,1.0


## update方法

用df2中的值替换df1中的值，直接在df1中操作

- 返回的框索引只与被调用框的一致，左连接
- 第二个框中的NAN不会起作用
- 直接在df上操作，没有返回值

In [29]:
# Target frame: three rows, columns A and B.
df1 = pd.DataFrame(dict(A=['a', 'b', 'c'], B=['x', 'y', 'z']))
# Replacement values for column B on row labels 1 and 2; the second one is NaN.
df2 = pd.DataFrame(data={'B': ['d', np.nan]}, index=[1, 2])
# update() changes df1 itself; this cell displays nothing.
df1.update(other=df2)

In [30]:
# Show df1 after the update() call in the previous cell, which changed it in place.
df1

Unnamed: 0,A,B
0,a,x
1,b,d
2,c,z


# concat方法

- 默认axis = 0，即纵向拼接，直接在行末拼接
- 拼接方式 join = 'outer',取并集
- 参数sort = False（默认值）：不对非拼接轴（此处为列）排序
- 参数keys给不同的数据框添加一个外层索引

In [31]:
# Two frames with the same columns and consecutive row labels.
df1 = pd.DataFrame(data={'A': ['A0', 'A1'], 'B': ['B0', 'B1']}, index=[0, 1])
df2 = pd.DataFrame(data={'A': ['A2', 'A3'], 'B': ['B2', 'B3']}, index=[2, 3])
# keys labels the rows of each input frame with an extra outer index level.
pd.concat(objs=[df1, df2], keys=['x', 'y'])
# Append `.index` to the call above to inspect the resulting two-level index.

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,2,A2,B2
y,3,A3,B3


# merge与join

#### merge函数的作用是将两个pandas对象横向合并，遇到重复的索引项时会使用笛卡尔积，默认inner连接，可选left、outer、right连接

#### merge/join与concat的不同之处在于on参数，可以指定某一列（或索引）作为key来进行连接

### 对于many_to_one模式下的合并，往往join更为方便，同样可以指定key

In [39]:
# `left` carries the lookup key as an ordinary column ...
left = pd.DataFrame(data={
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
    'key': ['K0', 'K1', 'K0', 'K1'],
})
# ... while `right` is indexed by the key values.
right = pd.DataFrame(
    data={'C': ['C0', 'C1'], 'D': ['D0', 'D1']},
    index=['K0', 'K1'],
)
# Match left's 'key' column against right's index; each key appears twice in left.
left.join(other=right, on='key')

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K0,C0,D0
3,A3,B3,K1,C1,D1
