In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## 第一章 预备知识

### Ex1：利用列表推导式写矩阵乘法

In [3]:
M1 = np.random.rand(2, 3)
M2 = np.random.rand(3, 4)
res = [[sum([M1[i][k] * M2[k][j] for k in range(M1.shape[1])]) for j in range(M2.shape[1])] for i in range(M1.shape[0])]
(np.abs((M1 @ M2 - res) < 1e-15)).all()

True

### Ex2：更新矩阵


In [10]:
A = np.arange(1, 10).reshape(3, -1)
B = A * (1 / A).sum(1).reshape(-1, 1)
B

array([[1.83333333, 3.66666667, 5.5       ],
       [2.46666667, 3.08333333, 3.7       ],
       [2.65277778, 3.03174603, 3.41071429]])

### Ex3：卡方统计量

In [19]:
np.random.seed(0)
A = np.random.randint(10, 20, (8, 5))
B = A.sum(0) * A.sum(1).reshape(-1, 1) / A.sum()
res = ((A - B) ** 2 / B).sum()
res

11.842696601945802

## 第二章 pandas基础
### Ex1：口袋妖怪数据集
#### 1.

In [15]:
# Ex1：口袋妖怪数据集
# 1.对HP, Attack, Defense, Sp. Atk, Sp. Def, Speed进行加总，验证是否为Total值。
df = pd.read_csv('./data/Pokemon.csv')
df.head(3)

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80


In [16]:
(df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].sum(1) != df['Total']).mean()

0.0

In [17]:
# 2.对于#重复的妖怪只保留第一条记录
dp_dup = df.drop_duplicates(['#'], keep='first')

In [18]:
# 求第一属性的种类数量和前三多数量对应的种类
dp_dup['Type 1'].nunique()

18

In [19]:
dp_dup['Type 1'].value_counts().index[:3]

Index(['Water', 'Normal', 'Grass'], dtype='object')

In [22]:
# 求第一属性和第二属性的组合种类
attr_dup = dp_dup.drop_duplicates(['Type 1', 'Type 2'])  # 重复记录只保留第一条
attr_dup.shape[0]  # 行数

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45
4,4,Charmander,Fire,,309,39,52,43,60,50,65
6,6,Charizard,Fire,Flying,534,78,84,78,109,85,100
9,7,Squirtle,Water,,314,44,48,65,50,64,43
13,10,Caterpie,Bug,,195,45,30,35,20,20,45
...,...,...,...,...,...,...,...,...,...,...,...
773,703,Carbink,Rock,Fairy,500,50,50,150,50,150,50
778,708,Phantump,Ghost,Grass,309,43,70,48,50,60,38
790,714,Noibat,Flying,Dragon,245,40,30,35,45,40,55
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70


In [26]:
# 求尚未出现过的属性组合，因为Type2 可能为空，所以这里j 要从Type 1 迭代
L_full = [i + ' ' + j if i != j else i for i in df['Type 1'].unique() for j in df['Type 1'].unique()]
# zip创建了一个迭代器，将这两列数据中的元素成对地聚合起来。
L_part = [i + ' ' + j if not isinstance(j, float) else i for i, j in zip(df['Type 1'], df['Type 2'])]
res = set(L_full).difference(set(L_part))
len(res)  # 太多，不打印了

170

In [52]:
# 3.按照下述要求，构造Series：
# 取出物攻，超过120的替换为high，不足50的替换为low，否则设为mid
df['Attack'].mask(df['Attack'] > 120, 'high').mask(df['Attack'] < 50, 'low').mask(
    (50 <= df['Attack']) & (df['Attack'] <= 120), 'mid').head()

0    low
1    mid
2    mid
3    mid
4    mid
Name: Attack, dtype: object

In [55]:
# 取出第一属性，分别用replace和apply替换所有字母为大写
df['Type 1'].replace({i: str.upper(i) for i in df['Type 1'].unique()})
df['Type 1'].apply(lambda x: str.upper(x)).head()

0    GRASS
1    GRASS
2    GRASS
3    GRASS
4     FIRE
Name: Type 1, dtype: object

In [63]:
# 求每个妖怪六项能力的离差，即所有能力中偏离中位数最大的值，添加到df并从大到小排序
df['Deviation'] = df[['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']].apply(
    lambda x: np.max((x - x.mean()).abs()), 1)
df.sort_values('Deviation', ascending=False).head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Deviation
121,113,Chansey,Normal,,450,250,5,5,35,105,50,175.0
261,242,Blissey,Normal,,540,255,10,10,75,135,55,165.0
230,213,Shuckle,Bug,Rock,505,20,10,230,10,230,5,145.833333
224,208,SteelixMega Steelix,Steel,Ground,610,75,125,230,55,95,30,128.333333
333,306,AggronMega Aggron,Steel,,630,70,140,230,60,80,50,125.0


### Ex2：指数加权窗口
#### 1.

In [64]:
#  作为扩张窗口的ewm窗口
np.random.seed(0)
s = pd.Series(np.random.randint(-1, 2, 30).cumsum())
s.head()

0   -1
1   -1
2   -2
3   -2
4   -2
dtype: int64

In [67]:
def ewm_func(x, alpha=0.2):
    # np.arange(x.shape[0])[::-1] 生成了一个长度为x数组长度的序列，然后将该序列反转。
    # 加权窗口使用公式(1-alpha)**n，其中n是上述反转的序列，代表序列中每个元素距离当前值的位置
    win = (1 - alpha) ** np.arange(x.shape[0])[::-1]
    res = (win * x).sum() / win.sum()
    return res


s.expanding().apply(lambda x: ewm_func(x))

0    -1.000000
1    -1.000000
2    -1.409836
3    -1.609756
4    -1.725845
5    -1.529101
6    -1.648273
7    -1.492481
8    -1.609720
9    -1.921223
10   -2.376048
11   -2.510047
12   -2.613738
13   -2.485343
14   -2.177441
15   -2.140925
16   -2.112091
17   -2.089261
18   -2.071148
19   -2.056753
20   -2.247158
21   -2.398846
22   -2.720978
23   -3.178945
24   -3.544537
25   -3.635906
26   -3.909386
27   -3.927544
28   -4.142368
29   -4.314107
dtype: float64

#### 作为滑动窗口的ewm窗口
从第1问中可以看到，ewm作为一种扩张窗口的特例，只能从序列的第一个元素开始加权。现在希望给定一个限制窗口n，只对包含自身的最近的n个元素作为窗口进行滑动加权平滑。请根据滑窗函数，给出新的wi与yt的更新公式，并通过rolling窗口实现这一功能。

新的权重为$w_i = (1 - \alpha)^i, i\in \{0,1,...,n-1\}$，$y_t$更新如下：
$$
\begin{split}y_t &=\frac{\sum_{i=0}^{n-1} w_i x_{t-i}}{\sum_{i=0}^{n-1} w_i} \\
&=\frac{x_t + (1 - \alpha)x_{t-1} + (1 - \alpha)^2 x_{t-2} + ...
+ (1 - \alpha)^{n-1} x_{t-(n-1)}}{1 + (1 - \alpha) + (1 - \alpha)^2 + ...
+ (1 - \alpha)^{n-1}}\\\end{split}
$$


In [68]:
s.rolling(window=4).apply(ewm_func).head()  # 无需对原函数改动

0         NaN
1         NaN
2         NaN
3   -1.609756
4   -1.826558
dtype: float64

## 第三章 索引
### Ex1：公司员工数据集

In [21]:
df = pd.read_csv('./data/Company.csv')
df

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
0,1318,1/3/1954,61,Vancouver,Executive,CEO,M
1,1319,1/3/1957,58,Vancouver,Executive,VP Stores,F
2,1320,1/2/1955,60,Vancouver,Executive,Legal Counsel,F
3,1321,1/2/1959,56,Vancouver,Executive,VP Human Resources,M
4,1322,1/9/1958,57,Vancouver,Executive,VP Finance,M
...,...,...,...,...,...,...,...
6279,8036,8/9/1992,23,New Westminister,Customer Service,Cashier,F
6280,8181,9/26/1993,22,Prince George,Customer Service,Cashier,M
6281,8223,2/11/1994,21,Trail,Customer Service,Cashier,M
6282,8226,2/16/1994,21,Victoria,Customer Service,Cashier,F


#### 1. 分别只使用query和loc选出年龄不超过四十岁且工作部门为Dairy或Bakery的男性。

In [26]:
df.query('((department=="Dairy") | (department == "Bakery")) &(gender=="M")&(age<=40)')

# 更简洁的写法
dpt = ['Dairy', 'Bakery']
df.query('(department == @dpt) &(gender=="M")&(age<=40)')

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
3611,5791,1/14/1975,40,Kelowna,Dairy,Dairy Person,M
3613,5793,1/22/1975,40,Richmond,Bakery,Baker,M
3615,5795,1/30/1975,40,Nanaimo,Dairy,Dairy Person,M
3617,5797,2/3/1975,40,Nanaimo,Dairy,Dairy Person,M
3618,5798,2/4/1975,40,Surrey,Dairy,Dairy Person,M
...,...,...,...,...,...,...,...
6108,8307,10/20/1994,21,Burnaby,Dairy,Dairy Person,M
6113,8312,11/12/1994,21,Burnaby,Dairy,Dairy Person,M
6137,8336,12/31/1994,21,Vancouver,Dairy,Dairy Person,M
6270,6312,5/14/1979,36,Grand Forks,Dairy,Dairy Person,M


In [32]:
df.loc[df.department.isin(['Dairy', 'Bakery']) & (df.gender == "M") & (df.age <= 40)].head()

Unnamed: 0,EmployeeID,birthdate_key,age,city_name,department,job_title,gender
3611,5791,1/14/1975,40,Kelowna,Dairy,Dairy Person,M
3613,5793,1/22/1975,40,Richmond,Bakery,Baker,M
3615,5795,1/30/1975,40,Nanaimo,Dairy,Dairy Person,M
3617,5797,2/3/1975,40,Nanaimo,Dairy,Dairy Person,M
3618,5798,2/4/1975,40,Surrey,Dairy,Dairy Person,M


#### 2.选出员工ID号 为奇数所在行的第1、第3和倒数第2列。

In [38]:
df.iloc[(df.EmployeeID % 2 == 1).values, [0, 2, -2]]

Unnamed: 0,EmployeeID,age,job_title
1,1319,58,VP Stores
3,1321,56,VP Human Resources
5,1323,53,"Exec Assistant, VP Stores"
6,1325,51,"Exec Assistant, Legal Counsel"
8,1329,48,Store Manager
...,...,...,...
6276,7659,26,Cashier
6277,7741,25,Cashier
6278,7801,25,Dairy Person
6280,8181,22,Cashier


#### 按照以下步骤进行索引操作：
* 把后三列设为索引后交换内外两层
* 恢复中间层索引
* 修改外层索引名为Gender
* 用下划线合并两层行索引
* 把行索引拆分为原状态
* 修改索引名为原表名称
* 恢复默认索引并将列保持为原表的相对位置


In [None]:
df_op = df.copy()
df_op = df_op.set_index(df_op.columns[-3:].tolist()).swaplevel(0, 2, axis=0)