In [None]:
import numpy as np
import pandas as pd

In [None]:
def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean per row, computed the naive (slow) way.

    For every row, that row is dropped, the remaining rows are grouped by
    ``x_name``, and the mean of ``y_name`` within the row's own category is
    stored.  The per-row groupby is kept on purpose: this function is the
    slow reference the faster implementations are compared against.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.  Rows are addressed by position, so
        any index (not only the default 0..n-1) is accepted.
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per row.  A row whose category has no
        other member gets ``NaN``.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    positions = np.arange(n_rows)
    categories = data[x_name].values
    for i in range(n_rows):
        # Positional mask: does not rely on the index labels being 0..n-1.
        others = data.iloc[positions != i]
        # The group keys form the index of the result, so the lookup below is
        # by category value and not by the position of the group.
        group_means = others.groupby(x_name)[y_name].mean()
        key = categories[i]
        # The category disappears from the groupby when row i was its only member.
        result[i] = group_means.loc[key] if key in group_means.index else np.nan
    return result

In [None]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean per row using two linear passes.

    The first pass accumulates, per category of ``x_name``, the sum of
    ``y_name`` and the number of rows.  The second pass removes each row's
    own contribution from its category totals and divides.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both columns.  The columns are read once as plain
        lists, so any index is accepted.
    y_name : str
        Name of the target column to average.
    x_name : str
        Name of the categorical column to group by.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per row.  A row whose category has no
        other member gets ``NaN`` instead of triggering a division by zero.
    """
    categories = data[x_name].tolist()
    targets = data[y_name].tolist()

    value_dict = {}
    count_dict = {}
    for key, target in zip(categories, targets):
        value_dict[key] = value_dict.get(key, 0) + target
        count_dict[key] = count_dict.get(key, 0) + 1

    result = np.zeros(data.shape[0])
    for i, (key, target) in enumerate(zip(categories, targets)):
        n_others = count_dict[key] - 1
        if n_others:
            result[i] = (value_dict[key] - target) / n_others
        else:
            # The row is alone in its category: there is nothing to average.
            result[i] = np.nan
    return result

In [None]:
# Fixed seed so the synthetic data, and every comparison and timing that uses
# it, is the same after a kernel restart.
np.random.seed(42)
# Binary target and a 10-level categorical feature, 5000 rows, one column each.
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
# DataFrame view used by the pandas-based implementations.
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [None]:
%%timeit
# Baseline timing: the per-row groupby implementation on the DataFrame.
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 23.6 s per loop


In [None]:
%%timeit
# Timing of the two-pass dictionary implementation on the same DataFrame.
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 265 ms per loop


In [None]:
%load_ext Cython

In [None]:
%%cython -a
# version 1: Cython acceleration
# In the annotated output, yellow marks the lines that are compiled as plain
# Python operations, which is inefficient.
import cython
import numpy as np
cimport numpy as cnp
cpdef cnp.ndarray[double] target_mean_v3(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean per row for a 2-D integer array.

    Column 0 of ``data`` is read as the target and column 1 as the category.
    A first pass accumulates the per-category sum and row count; a second
    pass removes each row's own contribution and divides.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose dtype matches C ``long``.

    Returns
    -------
    numpy.ndarray
        1-D double array with one entry per row.  A row whose category has no
        other member gets ``NaN`` instead of raising a division by zero.
    """
    cdef cnp.ndarray[double] result = np.zeros(data.shape[0])
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef unsigned int i
    for i in range(data.shape[0]):
        # Membership is tested on the dict itself; no keys() view is built.
        if data[i, 1] not in value_dict:
            value_dict[data[i, 1]] = data[i, 0]
            count_dict[data[i, 1]] = 1
        else:
            value_dict[data[i, 1]] += data[i, 0]
            count_dict[data[i, 1]] += 1
    for i in range(data.shape[0]):
        if count_dict[data[i, 1]] > 1:
            result[i] = (value_dict[data[i, 1]] - data[i, 0]) / (count_dict[data[i, 1]] - 1)
        else:
            # The row is alone in its category: there is nothing to average.
            result[i] = np.nan
    return result

# Validate and time the Cython implementation

In [None]:
# Raw integer array with the target in column 0 and the category in column 1;
# this is what gets passed to the array-based Cython implementations.
data_ = np.concatenate([y, x], axis=1)

In [None]:
# Cross-check the Cython implementation against the pandas baseline.
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v3(data_)
# Fail loudly on any mismatch instead of relying on someone reading the
# printed number.  equal_nan=True makes NaN entries, if either implementation
# produces them, compare as equal rather than turning the norm below into NaN.
np.testing.assert_allclose(result_1, result_2, equal_nan=True)
diff = np.linalg.norm(result_1 - result_2)
print(diff)

0.0


In [None]:
%%timeit
# Timing of the first Cython version on the raw integer array.
target_mean_v3(data_)

1000 loops, best of 3: 1.16 ms per loop


In [None]:
%%cython -a
# version 2: Cython acceleration
# In the annotated output, yellow marks the lines that are compiled as plain
# Python operations, which is inefficient.
# Author's recorded note: runtime increased by 0.07 ms.
import cython
import numpy as np
cimport numpy as cnp
cpdef cnp.ndarray[double] target_mean_v4(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean per row, with the row count cached in a C int.

    Column 0 of ``data`` is read as the target and column 1 as the category.
    A first pass accumulates the per-category sum and row count; a second
    pass removes each row's own contribution and divides.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose dtype matches C ``long``.

    Returns
    -------
    numpy.ndarray
        1-D double array with one entry per row.  A row whose category has no
        other member gets ``NaN`` instead of raising a division by zero.
    """
    cdef int ds = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(ds)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef unsigned int i
    for i in range(ds):
        # Membership is tested on the dict itself; no keys() view is built.
        if data[i, 1] not in value_dict:
            value_dict[data[i, 1]] = data[i, 0]
            count_dict[data[i, 1]] = 1
        else:
            value_dict[data[i, 1]] += data[i, 0]
            count_dict[data[i, 1]] += 1
    for i in range(ds):
        if count_dict[data[i, 1]] > 1:
            result[i] = (value_dict[data[i, 1]] - data[i, 0]) / (count_dict[data[i, 1]] - 1)
        else:
            # The row is alone in its category: there is nothing to average.
            result[i] = np.nan
    return result

In [None]:
# Rebuilds the raw [target, category] array from y and x; nothing visible
# since the earlier identical cell reassigns y or x.
data_ = np.concatenate([y, x], axis=1)

In [None]:
%%timeit
# Timing of the Cython version that caches the row count.
target_mean_v4(data_)

1000 loops, best of 3: 1.23 ms per loop


In [None]:
%%cython -a
# version 3: Cython acceleration with bounds checking disabled
# In the annotated output, yellow marks the lines that are compiled as plain
# Python operations, which is inefficient.
# Author's recorded note: runtime decreased by 0.01 ms.
# NOTE(review): re-measure before relying on this figure.
import cython
import numpy as np
cimport numpy as cnp
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef cnp.ndarray[double] target_mean_v5(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean per row, compiled without index checks.

    Column 0 of ``data`` is read as the target and column 1 as the category.
    A first pass accumulates the per-category sum and row count; a second
    pass removes each row's own contribution and divides.  Bounds checking
    and negative-index wraparound are switched off by the decorators, so
    ``data`` must have at least two columns.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose dtype matches C ``long``.

    Returns
    -------
    numpy.ndarray
        1-D double array with one entry per row.  A row whose category has no
        other member gets ``NaN`` instead of raising a division by zero.
    """
    cdef int ds = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(ds)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef unsigned int i
    for i in range(ds):
        # Membership is tested on the dict itself; no keys() view is built.
        if data[i, 1] not in value_dict:
            value_dict[data[i, 1]] = data[i, 0]
            count_dict[data[i, 1]] = 1
        else:
            value_dict[data[i, 1]] += data[i, 0]
            count_dict[data[i, 1]] += 1
    for i in range(ds):
        if count_dict[data[i, 1]] > 1:
            result[i] = (value_dict[data[i, 1]] - data[i, 0]) / (count_dict[data[i, 1]] - 1)
        else:
            # The row is alone in its category: there is nothing to average.
            result[i] = np.nan
    return result

In [None]:
# Rebuilds the raw [target, category] array from y and x; nothing visible
# since the earlier identical cells reassigns y or x.
data_ = np.concatenate([y, x], axis=1)

In [None]:
%%timeit
target_mean_v4(data_)

1000 loops, best of 3: 1.15 ms per loop


In [None]:
%%cython -a
# version 4: Cython acceleration, indexing through typed memoryviews
# In the annotated output, yellow marks the lines that are compiled as plain
# Python operations, which is inefficient.
# The original header repeated the "0.01 ms faster" note of the previous
# version; that figure is unverified for this version.
import cython
import numpy as np
cimport numpy as cnp
cpdef cnp.ndarray[double] target_mean_v6(cnp.ndarray[long, ndim=2] data):
    """Leave-one-out target mean per row, indexing through typed memoryviews.

    Column 0 of ``data`` is read as the target and column 1 as the category.
    A first pass accumulates the per-category sum and row count; a second
    pass removes each row's own contribution and divides.  Both the input
    and the output buffer are accessed through memoryviews.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array whose dtype matches C ``long``.

    Returns
    -------
    numpy.ndarray
        1-D double array with one entry per row.  A row whose category has no
        other member gets ``NaN`` instead of raising a division by zero.
    """
    cdef int ds = data.shape[0]
    cdef long[:, :] rows = data
    cdef double[:] result = np.zeros(ds)
    cdef dict value_dict = {}
    cdef dict count_dict = {}
    cdef int i
    for i in range(ds):
        # Membership is tested on the dict itself; no keys() view is built.
        if rows[i, 1] not in value_dict:
            value_dict[rows[i, 1]] = rows[i, 0]
            count_dict[rows[i, 1]] = 1
        else:
            value_dict[rows[i, 1]] += rows[i, 0]
            count_dict[rows[i, 1]] += 1
    for i in range(ds):
        if count_dict[rows[i, 1]] > 1:
            result[i] = (value_dict[rows[i, 1]] - rows[i, 0]) / (count_dict[rows[i, 1]] - 1)
        else:
            # The row is alone in its category: there is nothing to average.
            result[i] = np.nan
    # Hand back an ndarray (not the memoryview) so callers can use it like the
    # results of the other versions.
    return np.asarray(result)