x为职位，y为月收入是否过万；对每条记录的x做target encoding，处理逻辑：除去自身的其他所有相同x对应的y的平均值，作为x的编码。


In [1]:
%load_ext Cython

In [2]:
import numpy as np
import pandas as pd

In [3]:
#原题
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [4]:
#以更为直观理解的方式做改写。仍然使用了pandas（避免了在循环中做groupby）
def target_mean_v2(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
      x = data.loc[i,x_name]
      withoutMe = data[data.index != i]
      otherMate = withoutMe[withoutMe[x_name] == x]
      result[i] = otherMate[y_name].mean()
    return result

In [5]:
#先对记录表以x列做排序，减少算编码值的循环次数。由于改变了记录顺序其结果与result1不匹配。
def target_mean_v3(data, y_name, x_name):
    total_row = data.shape[0]
    result = np.zeros(total_row)
    a = data.values
    ind = np.argsort( a[:,1] )
    a = a[ind]

    k = 0
    begin_value = a[0,1]
    for i in range(total_row):
      x_i = a[i,1]
      total_y = 0
      count = 0
      if x_i != begin_value:
        k = i
        begin_value = x_i
      
      for j in range(k,total_row):
        if a[j,1] == x_i:
          total_y += a[j,0]
          count += 1
        else:
          break

      result[i] = (total_y - a[i,0]) / (count - 1)

    return result


In [6]:
#用Numpy数组实现，不使用Pandas
def target_mean_v5(data, y_name, x_name):
    total_row = data.shape[0]
    result = np.zeros(total_row)
    a = data.values

    for i in range(total_row):
        x_i = a[i, 1]
        total_y = 0
        count = 0

        for j in range(total_row):
            if a[j, 1] == x_i:
                total_y += a[j, 0]
                count += 1

        result[i] = (total_y - a[i, 0]) / (count - 1)

    return result


In [7]:
#使用cython提升运行速度，增加类型定义。引入cache保存20个中间结果
%%cython
import numpy as np
cimport numpy as cnp


cpdef target_mean_v6(data, str y_name, str x_name):
    cdef long total_row = data.shape[0]
    cdef cnp.ndarray[double] result = np.empty(total_row)
    cdef cnp.ndarray[double] cache = np.zeros(20) 
    cdef cnp.ndarray[long, ndim=2] a = data.values
    cdef long i,j,x_i,total_y,count
    cdef double cached_value

    for i in range(total_row):
        x_i = a[i, 1]
        total_y = 0
        count = 0

        cached_value = cache[x_i * 2 + a[i, 0]]
        if cached_value != 0:
            result[i] = cached_value
            continue
            
        for j in range(total_row):
            if a[j, 1] == x_i:
                total_y += a[j, 0]
                count += 1

        result[i] = (total_y - a[i, 0]) / (count - 1)
        cache[x_i * 2 + a[i, 0]] = result[i]

    return result


In [8]:
#减少中间结果的循环次数。拆分中间结果计算与后面的result填充(避免并行过程中的写操作，为后续版本打基础）
%%cython
import numpy as np
cimport numpy as cnp

cpdef target_mean_v7(data, str y_name, str x_name):
    cdef long total_row = data.shape[0]
    cdef cnp.ndarray[double] result = np.empty(total_row)
    cdef cnp.ndarray[double] cache = np.empty(20) 
    cdef cnp.ndarray[long, ndim=2] a = data.values
    cdef long m,n,i,total_y,count

    for m in range(10):
      total_y = 0
      count = 0

      for n in range(total_row):
        if a[n, 1] == m:
            total_y += a[n, 0]
            count += 1

      cache[m * 2] = total_y / (count - 1)
      cache[m * 2 + 1] = (total_y - 1) / (count - 1)

    for i in range(total_row):
        result[i] = cache[a[i, 1] * 2 + a[i, 0]]

    return result


In [9]:
# openmp并行。并行执行10个循环（获得cache结果）
%%cython
import numpy as np
cimport numpy as cnp
import cython
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cdef void parallel_v2(long[:,:] a, const long total_row, double[:] cache, const long m) nogil :
      cdef long n, total_y = 0, count = 0

      for n from 0 <= n < total_row by 1:
        if a[n, 1] == m:
            total_y += a[n, 0]
            count += 1

      cache[m * 2] = total_y / (count - 1)
      cache[m * 2 + 1] = (total_y - 1) / (count - 1)

cpdef target_mean_v8(data, y_name, x_name):
    cdef long total_row = data.shape[0]
    cdef cnp.ndarray[double] result = np.zeros(total_row)
    cdef cnp.ndarray[double] cache = np.zeros(20) 
    cdef cnp.ndarray[long, ndim=2] a = data.values
    cdef long i,j

    cdef long[:,:] arg_a = np.asfortranarray(a, dtype=np.long)
    cdef double[:] arg_cache = np.asfortranarray(cache, dtype=np.float64)

    for i in prange(10, nogil=True):
      parallel_v2(arg_a, total_row, arg_cache, i)

    cache = np.asfortranarray(arg_cache[:], dtype=np.float64)
    for i in range(total_row):
      result[i] = cache[a[i, 1] * 2 + a[i, 0]]

    return result

## 结果比较

In [10]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [11]:

result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')
result_6 = target_mean_v6(data, 'y', 'x')
result_7 = target_mean_v7(data, 'y', 'x')
result_8 = target_mean_v8(data, 'y', 'x')

diff = np.linalg.norm(result_2 - result_1)
print(diff)
diff = np.linalg.norm(result_6 - result_1)
print(diff)
diff = np.linalg.norm(result_7 - result_1)
print(diff)
diff = np.linalg.norm(result_8 - result_1)
print(diff)


0.0
0.0
0.0
0.0


In [12]:
%%timeit
result_1 = target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.9 s per loop


In [13]:
%%timeit
result_2 = target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 5.48 s per loop


In [14]:

%%timeit
result_6 = target_mean_v6(data, 'y', 'x')

1000 loops, best of 3: 205 µs per loop


In [15]:
%%timeit
result_7 = target_mean_v7(data, 'y', 'x')

10000 loops, best of 3: 122 µs per loop


In [16]:
#从这个结果看并行没有能提高运行速度
%%timeit
result_8 = target_mean_v8(data, 'y', 'x')

The slowest run took 4.04 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 138 µs per loop
