In [24]:
import numpy as np
import pandas as pd
y=np.random.randint(2,size=(5000,1))
x=np.random.randint(10,size=(5000,1))
data=pd.DataFrame(np.concatenate([y,x],axis=1),columns=['y','x'])

In [13]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [22]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    value_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        if data.loc[i, x_name] not in value_dict.keys():
            value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] = 1
        else:
            value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
            count_dict[data.loc[i, x_name]] += 1
    for i in range(data.shape[0]):
        result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
    return result

In [19]:
%%timeit
target_mean_v1(data,'y','x')

35.6 s ± 754 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [23]:
%%timeit
target_mean_v2(data,'y','x')

403 ms ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [25]:
%load_ext Cython

In [26]:
%%cython -a
import cython
import numpy as np
cimport cython
cimport numpy as cnp

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v3(data,str y_name,str x_name):
    cdef cnp.ndarray[double] result = np.zeros(data.shape[0], dtype=float)
    cdef cnp.ndarray[double] value_array = np.zeros(10)
    cdef cnp.ndarray[double] count_array = np.zeros(10)
    
    cdef cnp.ndarray[long] x = data[x_name].values
    cdef cnp.ndarray[long] y = data[y_name].values

    cdef int i = 0
    
    for i in range(data.shape[0]):
        value_array[x[i]] += y[i]
        count_array[x[i]] += 1
    
    for i in range(data.shape[0]):
        result[i] = (value_array[x[i]] - y[i]) / (count_array[x[i]] - 1)
    return result

In [28]:
%%timeit
target_mean_v3(data,'y','x')

67.6 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
