## 作业1 - Week02
把提供的 target encoding 代码改为 cython 代码并比较速度区别（如可以实现并行可加分）

### 导入需要的模块

In [18]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [20]:
# coding = 'utf-8'
import numpy as np
import pandas as pd

### 定义函数

In [21]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [97]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1

    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

#### v3
- 导入Cython
- 将变量，用cnp, cdef 替代

In [89]:
%%cython
import numpy as np
cimport numpy as cnp

cpdef target_mean_v3(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros((n,))

    value_dict = dict()
    count_dict = dict()

    for i in range(n):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1

    for i in range(n):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

#### v4
- 每次都会重复查找 `data.iloc[i, x_name]`, 建立一个参数来储存。

In [90]:
%%cython
import numpy as np
cimport numpy as cnp

cpdef target_mean_v4_cython(data, y_name, x_name):
    cdef int n = data.shape[0]
    cdef double[:] result = np.zeros((n,))
    cdef int x = 0
    cdef int y = 0

    value_dict = dict()
    count_dict = dict()

    for i in range(n):
        x = data.loc[i, x_name]
        y = data.loc[i, y_name]

        if x not in value_dict.keys():
            value_dict[x] = y
            count_dict[x] = 1
        else:
            value_dict[x] += y
            count_dict[x] += 1

        x = 0
        y = 0

    for i in range(n):
        x = data.loc[i, x_name]
        result[i] = (value_dict[x] - data.loc[i, y_name]) / (count_dict[x] - 1)
        x = 0
    return result

In [52]:
# 制造数据
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [98]:
# 对比结果: 确保数据准确性

# 默认
result_1 = target_mean_v1(data, 'y', 'x')

# 优化
result_2 = target_mean_v2(data, 'y', 'x')
result_3 = target_mean_v4_cython(data, 'y', 'x')

# 结果对比
diff = np.linalg.norm(result_1 - result_2)
print('diff1,2: {}'.format(diff))

diff = np.linalg.norm(result_1 - result_3)
print('diff1,3: {}'.format(diff))

diff1,2: 0.0
diff1,3: 0.0


In [92]:
# 默认
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 27.5 s per loop


In [99]:
# 初始优化
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 302 ms per loop


In [94]:
# Cython语法
%%timeit
result_3 = target_mean_v4_cython(data, 'y', 'x')

10 loops, best of 3: 164 ms per loop
