In [4]:
%load_ext cython
import numpy as np
import pandas as pd
from collections import defaultdict

In [5]:
##参数定义
y = np.random.randint(2, size=(500, 1))
x = np.random.randint(10, size=(500, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])  

In [6]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [7]:
def target_mean_v4(data, y_name, x_name):
    n = data.shape[0]
    X = data[x_name].values
    Y = data[y_name].values

    value_dict = defaultdict(lambda:0)
    count_dict = defaultdict(lambda:0)

    for x,y in zip(X,Y):
        value_dict[x] += y
        count_dict[x] += 1
    result = [(value_dict[x]-y)/(count_dict[x]-1) for x,y in zip(X,Y)]
    return result

In [17]:
%%cython --cplus 
import numpy as np
cimport numpy as np
import pandas as pd
from libcpp.unordered_map cimport unordered_map
cimport cython
from libcpp.map cimport map


@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(long[:,:] data, int y, int x, int nrow, int ncol):
  cdef long[:,:] data_c=data
  cdef int i=0
  cdef int start=0
  cdef int shape=nrow
  cdef int step=1
  
  cdef int unique_cnt = 10  ##modify，设定为10类
  cdef double[:] result_c = np.zeros(shape)
  cdef double[:] m_value=np.zeros(unique_cnt)
  cdef double[:] m_count=np.zeros(unique_cnt) 


  for i from start <= i < shape by step:
    m_value[data_c[i, x]] += data_c[i,y]
    m_count[data_c[i, x]] += 1
    

  for i from start <= i < shape by step:
    result_c[i] = (m_value[data_c[i, x]] - data_c[i, y]) / (m_count[data_c[i, x]] - 1)
  
  return result_c

In [9]:
%%cython --cplus 
import numpy as np
cimport numpy as c_np
cimport cython


@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
@cython.initializedcheck(False)
@cython.overflowcheck(False)
@cython.binding(False)
cpdef target_mean_v8(data, str y_name, str x_name):
  cdef:
    int nrow    = data.shape[0]
    c_np.ndarray[double] output = np.empty(nrow)
    int step = 1
    int cnt[10]
    double val[10]
    long [:] X = data[x_name].values
    long [:] Y = data[y_name].values
    double[:] result   = output
    int i 
    long x
    long y

  for i from 0 <= i < 10 by step:
    cnt[i] = 0
    val[i] = 0
  
  for i from 0 <= i < nrow by step:
    x = X[i]
    y = Y[i]
    val[x] += y
    cnt[x] += 1
  
  for i from 0 <= i < nrow by step:
    x = X[i]
    y = Y[i]
    result[i] = (val[x]-y)/(cnt[x]-1)

  return output


In [10]:
%%cython --cplus  --compile-args=-fopenmp  --link-args=-fopenmp
import numpy as np
cimport numpy as np
import pandas as pd
from libcpp.unordered_map cimport unordered_map
from libcpp.vector cimport vector
from libcpp.map cimport map
import cython
cimport cython
from cython.parallel cimport prange,parallel

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v9(data,y,x):

  cdef long[:] data_x=data[x].values
  cdef long[:] data_y=data[y].values
  cdef int i=0
  cdef int start=0
  cdef int shape=data_x.shape[0]
  cdef int step=1

  cdef double[:] m_value=np.zeros(10)
  cdef double[:] m_count=np.zeros(10)  
  cdef double[:] result_c = np.zeros(shape)


  for i from start <= i < shape by step:
      m_value[data_x[i]] += data_y[i]
      m_count[data_x[i]] += 1
        
  for i in prange(shape,nogil=True,schedule='static'): #此处并行
    result_c[i]=(m_value[data_x[i]] - data_y[i]) / (m_count[data_x[i]] - 1)
  return result_c

In [11]:
if __name__ == "__main__": 
    result_1 = target_mean_v1(data,'y','x')
    result_4 = target_mean_v4(data,'y','x')
    result_7 = target_mean_v7(data.values,0,1,data.shape[0],data.shape[1])
    result_8 = target_mean_v8(data,'y','x')
    result_9 = target_mean_v9(data,'y','x')
    diff14   = np.linalg.norm(result_1 - result_4)
    diff17   = np.linalg.norm(result_1 - result_7)
    diff18   = np.linalg.norm(result_1 - result_8)
    diff19   = np.linalg.norm(result_1 - result_9) 
    print("diff14==",diff14)
    print("diff17==",diff17) 
    print("diff18==",diff18)  
    print("diff19==",diff19) 

diff14== 0.0
diff17== 0.0
diff18== 0.0
diff19== 0.0


In [12]:
%%timeit
target_mean_v1(data,'y','x')

1.49 s ± 183 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
%%timeit
target_mean_v4(data,'y','x')

519 µs ± 5.07 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [18]:
%%timeit
target_mean_v7(data.values,0,1,data.shape[0],data.shape[1])

12.7 µs ± 187 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [19]:
%%timeit
target_mean_v8(data, 'y', 'x')

9.63 µs ± 92.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [16]:
%%timeit
target_mean_v9(data, 'y', 'x')

155 µs ± 4.01 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
