<a href="https://colab.research.google.com/github/xx529/Others/blob/main/GeekBang/chap02-homework.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 内容

1. 优化 target mean 算法过程
    * 用 Cython 加速
    * 加入并行
2. 使用 cython 实现对输入多列返回 B-spline basis 的操作。
    * 查看 B-spline 的介绍。 https://www.cs.unc.edu/~dm/UNC/COMP258/Papers/bsplbasic.pdf
    * 禁止使用函数 recursive call。 
    * 必须要处理异常情况（例如缺失值，inf 等）。


# 优化 target mean 算法

### 随机数据

In [2]:
import numpy as np
import pandas as pd
import time

In [3]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [4]:
data

Unnamed: 0,y,x
0,1,3
1,0,1
2,0,2
3,0,9
4,0,6
...,...,...
4995,1,6
4996,0,8
4997,1,3
4998,1,4


### Baseline算法

In [5]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [6]:
%%time
v1_ans = target_mean_v1(data, 'y', 'x')

CPU times: user 22 s, sys: 11.5 ms, total: 22 s
Wall time: 22.1 s


### Python 优化

In [7]:
def target_mean_python(data, y_name, x_name):
    temp = data.groupby(x_name).agg(['sum', 'count']).reset_index()
    compute = lambda y, x, array: (array[x][1] - y) / (array[x][2] - 1)
    data_dict = {(y,x): compute(y, x, temp.values) for y in data[y_name].unique() for x in temp[x_name].unique()}
    result = np.array([data_dict[(y,x)] for _, (y, x) in data.iterrows()])
    return result

In [8]:
%%time
python_ans = target_mean_python(data, 'y', 'x')
assert (v1_ans == python_ans).all()
print('pass')

pass
CPU times: user 358 ms, sys: 1.99 ms, total: 360 ms
Wall time: 361 ms


### Pandas 优化

In [9]:
def target_mean_pandas(data, y_name, x_name):
    temp = data.groupby(x_name).agg(['sum', 'count']).droplevel(0, axis=1).reset_index()
    df_target_mean = pd.DataFrame(dtype=float, columns=temp.columns+[y_name])
    for target in data[y_name].unique():
        temp[y_name] = target
        df_target_mean = pd.concat([temp, df_target_mean], axis=0)
    df_target_mean['result'] = (df_target_mean['sum'] - df_target_mean[y_name]) / (df_target_mean['count'] - 1)
    result = pd.merge(data, df_target_mean, on=[y_name, x_name], how='left')['result'].values
    return result

In [10]:
%%time
pandas_ans = target_mean_pandas(data, 'y', 'x')
assert (v1_ans == pandas_ans).all()
print('pass')

pass
CPU times: user 19.9 ms, sys: 2.98 ms, total: 22.8 ms
Wall time: 28 ms


### Cython 优化

In [88]:
%load_ext Cython

In [288]:
def target_mean_origin(data, y_name, x_name):
    temp_dict = {}
    result = np.zeros(shape=data.shape[0])

    for i, row_data in data.iterrows():
        x_value, y_value = row_data[x_name], row_data[y_name]
        
        if x_value not in temp_dict:
            temp_dict[x_value] = {'sum': 0, 'count': 0}
        
        temp_dict[x_value]['sum'] += y_value
        temp_dict[x_value]['count'] += 1

    for i, row_data in data.iterrows():
        x_value, y_value = row_data[x_name], row_data[y_name]
        result[i] = (temp_dict[x_value]['sum'] - y_value) / (temp_dict[x_value]['count'] - 1)
    
    return result

In [289]:
%%time
cython_ans = target_mean_origin(data, 'y', 'x')
assert (v1_ans == cython_ans).all()
print('pass')

{3: {'sum': 229, 'count': 503}, 1: {'sum': 243, 'count': 509}, 2: {'sum': 273, 'count': 518}, 9: {'sum': 242, 'count': 500}, 6: {'sum': 245, 'count': 497}, 4: {'sum': 258, 'count': 507}, 0: {'sum': 249, 'count': 495}, 5: {'sum': 242, 'count': 475}, 7: {'sum': 251, 'count': 491}, 8: {'sum': 242, 'count': 505}}
pass
CPU times: user 768 ms, sys: 0 ns, total: 768 ms
Wall time: 769 ms


In [306]:
%%cython

cpdef target_mean_cython(data, x_name, y_name):
    cdef int row = data.shape[0]
    cdef int col = data.shape[1]
    cdef int[5000][2] array
    cdef int i, j
    
    # 转换为c数据
    for i in range(row):
        for j in range(col):
            array[i][j] = data.iloc[i,j]

    cdef int[10] count_list = [0,0,0,0,0,0,0,0,0,0]
    cdef int[10] sum_list = [0,0,0,0,0,0,0,0,0,0]
    cdef int x_value
    cdef int y_value


    
    
    # 计算两个list
    for i in range(row):
        x_value = array[i][1]
        y_value = array[i][0]
        count_list[x_value] += 1
        sum_list[x_value] += y_value

    print('sum', sum_list)
    print('count', count_list)
   


    cdef float result[5000]
    for i in range(row):
        x_value = array[i][1]
        y_value = array[i][0]
        # print(sum_list[x_value])
        # print(y_value)
        # print(sum_list[x_value] - y_value)
        # print(count_list[y_value])
        result[i] = <float>(sum_list[x_value] - y_value) / (count_list[y_value] - 1)
    
    # print(array[0])
    # print(row, col)
    return result

In [307]:
%%time
temp = target_mean_cython(data, 'y', 'x')

sum [249, 243, 273, 229, 258, 242, 245, 251, 242, 242]
count [495, 509, 518, 503, 507, 475, 497, 491, 505, 500]
CPU times: user 211 ms, sys: 2.61 ms, total: 214 ms
Wall time: 215 ms


In [290]:
target_mean_origin(data, 'y', 'x')

{3: {'sum': 229, 'count': 503}, 1: {'sum': 243, 'count': 509}, 2: {'sum': 273, 'count': 518}, 9: {'sum': 242, 'count': 500}, 6: {'sum': 245, 'count': 497}, 4: {'sum': 258, 'count': 507}, 0: {'sum': 249, 'count': 495}, 5: {'sum': 242, 'count': 475}, 7: {'sum': 251, 'count': 491}, 8: {'sum': 242, 'count': 505}}


array([0.45418327, 0.47834646, 0.52804642, ..., 0.45418327, 0.50790514,
       0.48496994])

In [264]:
np.array(temp)[:10]

array([0.44881889, 0.49190283, 0.55263156, 0.48987854, 0.49595141,
       0.48987854, 0.52226722, 0.55263156, 0.48987854, 0.55263156])

In [253]:
pandas_ans

array([0.45418327, 0.47834646, 0.52804642, ..., 0.45418327, 0.50790514,
       0.48496994])

In [269]:
data

Unnamed: 0,y,x
0,1,3
1,0,1
2,0,2
3,0,9
4,0,6
...,...,...
4995,1,6
4996,0,8
4997,1,3
4998,1,4


In [287]:
%%cython -a

cpdef test():
    # cdef int abc[4] = {0}
    cdef int[4] b = [0,0,0,0]
    cdef int h=1
    
    # print(abc)
    print(b)

In [281]:
test()

[-762157752, 32523, 5626818, 0]
[72777688, 0, 3, 0]


# B-spline basis 操作
