# Load the Cython Execution Environment

In [1]:
%load_ext Cython

# Function Definitions

## python function -- V1

In [2]:
import numpy as np

def target_mean_v1_p(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (
            count_dict[data.loc[i, x_name]] - 1)
    return result

## python function (optimized version) -- V2

In [3]:
import pandas as pd
import numpy as np 

def target_mean_v2_p(data, y_name, x_name):
    '''
        分组聚合 --> 剔除自身计算
    '''
    data_groupby = data.groupby(x_name, as_index=False).agg(["sum", "count"])
    data_groupby.columns = data_groupby.columns.droplevel(0)
    data_groupby.reset_index(inplace=True)
    sum_count_dict = {item_[0]: item_[1:] for item_ in data_groupby.values}
    result = [(sum_count_dict[item_[1]][0] - item_[0]) /
              (sum_count_dict[item_[1]][1] - 1) for item_ in data.values]
    return result

## cython function -- V3

In [4]:
%%cython
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v3_c(cnp.ndarray[long, ndim=2] data):
    cdef long index_num = data.shape[0]
    cdef dict sum_dict = {}
    cdef dict count_dict = {}
    cdef cnp.ndarray[double] result = np.zeros(index_num)
    for i in range(index_num):
        if data[i, 1] not in sum_dict.keys():
            sum_dict[data[i, 1]] = data[i, 1]
            count_dict[data[i, 1]] = 1
        else:
            sum_dict[data[i, 1]] += data[i, 0]
            count_dict[data[i, 1]] += 1
    for i in range(index_num):
        result[i] = (sum_dict[data[i, 1]]-data[i, 0])/(count_dict[data[i, 1]]-1)
    return result

## cython function (optimized version) -- V4

In [12]:
%%cython
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v4_c(cnp.ndarray[long, ndim=2, mode="c"] data):
    cdef long index_num = data.shape[0]
    cdef cnp.ndarray[double] sum_dict = np.zeros(index_num)
    cdef cnp.ndarray[double] count_dict = np.zeros(index_num)
    cdef cnp.ndarray[double] result = np.zeros(index_num)
    for i in range(index_num):
        sum_dict[data[i, 1]] += data[i, 0]
        count_dict[data[i, 1]] += 1
    for i in range(index_num):
        result[i] = (sum_dict[data[i, 1]]-data[i, 0])/(count_dict[data[i, 1]]-1)
    return result

# Testing

In [6]:
import pandas as pd
import numpy as np 

np.random.seed(123)
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [7]:
%%timeit
target_mean_v1_p(data, "y", "x")

263 ms ± 40.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [8]:
%%timeit
target_mean_v2_p(data, "y", "x")

12 ms ± 877 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [9]:
%%timeit
target_mean_v3_c(data.values)

972 µs ± 54.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%%timeit
target_mean_v4_c(data.values)

34 µs ± 891 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
