In [None]:
%load_ext Cython

In [None]:
import numpy as np
import pandas as pd

In [None]:
# 测试数据
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])
x_name = 'x'
y_name = 'y'

In [None]:
# data.head()
data.tail()

Unnamed: 0,y,x
4995,0,1
4996,1,8
4997,0,3
4998,0,1
4999,1,0


### 版本1
原始版本，使用pd，导致使用了很多的循环减慢了运行速度

In [None]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0]) # =>5000
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [None]:
%%timeit -n 1
target_mean_v1(data, y_name, x_name)

1 loop, best of 3: 24.6 s per loop


### 版本二
按照老师的建议进行优化，减少pd的方法使用纯python来实现

In [None]:
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    map_dict = dict()
    count_dict = dict()
    for i in range(data.shape[0]):
        _x = data.loc[i, x_name]
        _y = data.loc[i, y_name]
        if _x not in map_dict.keys():
            map_dict[_x] = _y
            count_dict[_x] = 1
        else:
            map_dict[_x] += _y
            count_dict[_x] += 1
    for i in range(data.shape[0]):
        _x = data.loc[i, x_name]
        _y = data.loc[i, y_name]
        result[i] = (map_dict[_x] - _y)/(count_dict[_x] - 1)
    return result

In [None]:
%%timeit -n 1
target_mean_v2(data, y_name, x_name)

1 loop, best of 3: 158 ms per loop


24.6s  -> 158 ms 优化了155倍

### 版本三
进一步干掉, loc的内涵循环

In [None]:
def target_mean_v3(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    map_dict = dict()
    count_dict = dict()
    shape_long = data.shape[0]
    x = data[x_name].values
    y = data[y_name].values

    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        if _x not in map_dict.keys():
            map_dict[_x] = _y
            count_dict[_x] = 1
        else:
            map_dict[_x] += _y
            count_dict[_x] += 1
    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        result[i] = (map_dict[_x] - _y)/(count_dict[_x] - 1)
    return result

In [None]:
%%timeit -n 1
target_mean_v3(data, y_name, x_name)

1 loop, best of 3: 9.75 ms per loop


158ms->9.75ms 优化了16倍

### 版本四

In [26]:
%%cython -a
import numpy as np
cimport numpy as cnp

cpdef target_mean_v4(data, str y_name, str x_name):
    cdef int shape_long = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(shape_long)
    cdef map_dict = dict()
    cdef count_dict = dict()
    cdef cnp.ndarray[long] x = data[x_name].values
    cdef cnp.ndarray[long] y = data[y_name].values

    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        if _x not in map_dict.keys():
            map_dict[_x] = _y
            count_dict[_x] = 1
        else:
            map_dict[_x] += _y
            count_dict[_x] += 1
    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        result[i] = (map_dict[_x] - _y)/(count_dict[_x] - 1)
    return result

In [27]:
%%timeit -n 1
target_mean_v4(data, y_name, x_name)

1 loop, best of 3: 1.74 ms per loop


9.75ms->1.74ms，优化了5.6倍

In [29]:
%%cython -a
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5(data, str y_name, str x_name):
    cdef int shape_long = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(shape_long)
    cdef map_dict = dict()
    cdef count_dict = dict()
    cdef cnp.ndarray[long] x = data[x_name].values
    cdef cnp.ndarray[long] y = data[y_name].values

    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        if _x not in map_dict.keys():
            map_dict[_x] = _y
            count_dict[_x] = 1
        else:
            map_dict[_x] += _y
            count_dict[_x] += 1
    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        result[i] = (map_dict[_x] - _y)/(count_dict[_x] - 1)
    return result

In [30]:
%%timeit -n 10
target_mean_v5(data, y_name, x_name)

10 loops, best of 3: 1.26 ms per loop


1.74ms-1.26ms

### 版本六
使用数组

In [47]:
%%cython -a
import numpy as np
cimport numpy as cnp
import cython

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6(data, str y_name, str x_name):
    cdef int shape_long = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(shape_long)
    cdef long[:] map_dict = np.zeros(10).astype(long)
    cdef long[:] count_dict = np.zeros(10).astype(long)
    cdef long[:] x = data[x_name].values
    cdef long[:] y = data[y_name].values

    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        map_dict[_x] += _y
        count_dict[_x] += 1
    for i in range(shape_long):
        _x = x[i]
        _y = y[i]
        result[i] = (map_dict[_x] - _y)/(count_dict[_x] - 1)
    return result

In [48]:
%%timeit -n 10
target_mean_v6(data, y_name, x_name)

10 loops, best of 3: 500 µs per loop


1.26ms->500μs优化2.52倍

In [76]:
%%cython -a
# distutils: language=c++
import numpy as np
cimport numpy as cnp
cimport cython
from cython.parallel import prange


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data, str y_name, str x_name):
    cdef long[:] x = data[x_name].values
    cdef long[:] y = data[y_name].values
    cdef int shape_long = data.shape[0]
    cdef double[:] result = np.zeros(shape_long)
    cdef long[:] map_dict = np.zeros(10).astype(long)
    cdef long[:] count_dict = np.zeros(10).astype(long)

    cdef int i = 0

    for i in prange(shape_long,nogil=True):
        map_dict[x[i]] += y[i]
        count_dict[x[i]] += 1
    for i in prange(shape_long, nogil=True):
        result[i] = (map_dict[x[i]] - y[i])/(count_dict[x[i]] - 1)
    return result

In [80]:
%%timeit -n 10
target_mean_v7(data, y_name, x_name)

10 loops, best of 3: 70.7 µs per loop


500μs->70.1μs优化7倍