<a href="https://colab.research.google.com/github/trent87/ML-000/blob/main/Week02/first_cython.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# pure python version

In [24]:
import numpy as np
import pandas as pd
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

def main():
    y = np.random.randint(2, size=(5000, 1))
    x = np.random.randint(10, size=(5000, 1))
    pd_data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
    result = target_mean_v1(pd_data, 'y', 'x')
    print(result)


if __name__ == '__main__':
    main()


[0.52371917 0.48330059 0.5562249  ... 0.47722343 0.50204918 0.50190114]


In [54]:
%%timeit
main()

[0.51585624 0.53281853 0.48406375 ... 0.53281853 0.48368522 0.48549323]
[0.48507463 0.52653061 0.47302905 ... 0.50583658 0.47302905 0.48507463]
[0.52848723 0.48995984 0.49094567 ... 0.47884615 0.54639175 0.50207469]
[0.49320388 0.48541667 0.5311828  ... 0.48541667 0.49350649 0.51172708]
1 loop, best of 3: 20.9 s per loop


# pure python version with cython


使用基础的数据类型作为参数和返回值，不使用原有的pd.dataframe进行传参，转换为长整型的二维数组。

In [1]:
%load_ext Cython

In [50]:
%%cython -a
cimport numpy as cnp
import pandas as pd
import numpy as np

cpdef cnp.ndarray[double] target_mean_cython(cnp.ndarray[cnp.int64_t,ndim=2] data):
  cdef:
    int row_count = data.shape[0]
    int col_count = data.shape[1]
    cnp.ndarray[double] result = np.zeros(data.shape[0],dtype=np.double)
    # use index as key to save sum
    cnp.ndarray[cnp.int64_t] value_group_sum = np.zeros(10,dtype=np.int)
    cnp.ndarray[cnp.int64_t] value_group_count = np.zeros(10,dtype=np.int)
  # group by cal sum and count
  for i in range(row_count):
    x_value = data[i][1]
    y_value = data[i][0]
    value_group_sum[x_value] += y_value
    value_group_count[x_value] += 1 
  # cal result  
  for i in range(row_count):
    x = data[i][1]
    y = data[i][0]
    result[i] = (value_group_sum[x]-y)/(value_group_count[x] - 1)
  return result
  

In [31]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data=np.concatenate([y, x], axis=1)

In [51]:
print(target_mean_cython(data))

[0.53182752 0.49609375 0.50590551 ... 0.46484375 0.47435897 0.46484375]


In [52]:
%%timeit
target_mean_cython(data)

100 loops, best of 3: 11.7 ms per loop
