把提供的 target encoding 代码改为 cython 代码并比较速度区别

In [1]:
import numpy as np
import pandas as pd
np.random.seed(0)
x = np.random.randint(10, size=(5000,1))
y = np.random.randint(2, size=(5000,1))

data = pd.DataFrame(np.concatenate([y,x],axis=1), columns=['y','x'])
data.head()

Unnamed: 0,y,x
0,1,5
1,1,0
2,0,3
3,0,3
4,1,7


In [2]:
# 计算index=0的mean和count
groupby_result = data[data.index != 0].groupby(['x'], as_index=False).agg(['mean', 'count'])
groupby_result

Unnamed: 0_level_0,y,y
Unnamed: 0_level_1,mean,count
x,Unnamed: 1_level_2,Unnamed: 2_level_2
0,0.511858,506
1,0.492032,502
2,0.525105,478
3,0.450677,517
4,0.491837,490
5,0.53831,509
6,0.478088,502
7,0.529532,491
8,0.535573,506
9,0.554217,498


In [3]:
# index=0 
groupby_result.loc[groupby_result.index == data.loc[0, 'x'], ('y', 'mean')]

x
5    0.53831
Name: (y, mean), dtype: float64

In [4]:
# original code
def target_mean_v1(data, y_name, x_name):
  """
    1. 遍历 DataFrame，去除遍历到的索引对应值外，根据 x 值分组，统计平均值和总数
    2. 找到 data[i] 的 x 值，通过 groupby_result[x] 寻找对应的 y 列的 mean 值
  """
  result = np.zeros(data.shape[0])
  for i in range(data.shape[0]):
    groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
    result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
  return result

In [5]:
result_v1 = target_mean_v1(data, 'y', 'x')
result_v1[:10]

array([0.53831041, 0.51089109, 0.45155039, 0.45155039, 0.52857143,
       0.55331992, 0.4496124 , 0.53831041, 0.52620545, 0.49079755])

In [6]:
%%timeit
target_mean_v1(data, 'y', 'x')

1 loop, best of 3: 24.3 s per loop


In [7]:
# 课堂优化
def target_mean_v2(data, y_name, x_name):
  """
  优化groupby
  value_dict:存储y值的和
  count_dict:存储x(0-9)相应出现的总次数
  """
  result = np.zeros(data.shape[0])
  value_dict = dict()
  count_dict = dict()
  for i in range(data.shape[0]):
    if data.loc[i, x_name] not in value_dict.keys():
      value_dict[data.loc[i, x_name]] = data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] = 1
    else:
      value_dict[data.loc[i, x_name]] += data.loc[i, y_name]
      count_dict[data.loc[i, x_name]] += 1
  
  for i in range(data.shape[0]):
    # 映射
    result[i] = (value_dict[data.loc[i, x_name]] - data.loc[i, y_name]) / (count_dict[data.loc[i, x_name]] - 1)
  return result


In [8]:
result_v2 = target_mean_v2(data, 'y', 'x')
result_v2[:10]

array([0.53831041, 0.51089109, 0.45155039, 0.45155039, 0.52857143,
       0.55331992, 0.4496124 , 0.53831041, 0.52620545, 0.49079755])

In [9]:
%%timeit
target_mean_v2(data, 'y', 'x')

1 loop, best of 3: 265 ms per loop


Python 优化：
- 1.data.shape[0] 多次使用，用length代替，减少重复索引
- 2.DataFrame中的loc操作较慢，使用原始数组替换

In [10]:
def target_mean_v3(data, y_name, x_name):
  length = data.shape[0]
  result = np.zeros(length)
  xs = data[x_name].values
  ys = data[y_name].values
  value_dict = dict()
  count_dict = dict()
  for i in range(length):
    index = xs[i]
    if index not in value_dict.keys():
        value_dict[index] = ys[i]
        count_dict[index] = 1
    else:
        value_dict[index] += ys[i]
        count_dict[index] += 1
    
  for i in range(length):
    # 映射
    index = xs[i]
    result[i] = (value_dict[index] - ys[i]) / (count_dict[index] - 1)
  return result

In [11]:
result_v3 = target_mean_v3(data, 'y', 'x')
result_v3[:10]

array([0.53831041, 0.51089109, 0.45155039, 0.45155039, 0.52857143,
       0.55331992, 0.4496124 , 0.53831041, 0.52620545, 0.49079755])

In [12]:
%%timeit
target_mean_v3(data, 'y', 'x')

100 loops, best of 3: 9.42 ms per loop


Cython 优化
- 指定变量类型

In [13]:
%load_ext Cython

In [14]:
%%cython -a

import numpy as np
cimport numpy as cnp

cpdef target_mean_v4_cython(data, str y_name, str x_name):
  cdef long length = data.shape[0]
  cdef cnp.ndarray[double] result = np.zeros(length)
  cdef cnp.ndarray[long] xs = data[x_name].values
  cdef cnp.ndarray[long] ys = data[y_name].values
  value_dict = dict()
  count_dict = dict()

  for i in range(length):
    index = xs[i]
    if index not in value_dict.keys():
      value_dict[index] = ys[i]
      count_dict[index] = 1
    else:
      value_dict[index] += ys[i]
      count_dict[index] += 1
    
  for i in range(length):
    # 映射
    index = xs[i]
    result[i] = (value_dict[index] - ys[i]) / (count_dict[index] - 1)
  return result

In [15]:
result_v4 = target_mean_v4_cython(data,'y', 'x')
result_v4[:10]

array([0.53831041, 0.51089109, 0.45155039, 0.45155039, 0.52857143,
       0.55331992, 0.4496124 , 0.53831041, 0.52620545, 0.49079755])

In [16]:
%%timeit
target_mean_v4_cython(data, 'y', 'x')

1000 loops, best of 3: 1.17 ms per loop


- 去除类型检查、包装检查
- 修改for循环将range改为更cython的写法

In [17]:
%%cython -a

cimport cython
import numpy as np
cimport numpy as cnp

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5_cython(data, str y_name, str x_name):
  cdef long length = data.shape[0]
  cdef cnp.ndarray[double] result = np.zeros(length)
  cdef cnp.ndarray[long] xs = data[x_name].values
  cdef cnp.ndarray[long] ys = data[y_name].values
  value_dict = dict()
  count_dict = dict()

  for i from 0 <= i < length by 1:
    index = xs[i]
    if index not in value_dict.keys():
      value_dict[index] = ys[i]
      count_dict[index] = 1
    else:
      value_dict[index] += ys[i]
      count_dict[index] += 1
    
  for i from 0 <= i < length by 1:
    # 映射
    index = xs[i]
    result[i] = (value_dict[index] - ys[i]) / (count_dict[index] - 1)
  return result

In [18]:
result_v5 = target_mean_v5_cython(data, 'y', 'x')
result_v5

array([0.53831041, 0.51089109, 0.45155039, ..., 0.47704591, 0.52410901,
       0.55331992])

In [19]:
%%timeit
target_mean_v5_cython(data, 'y', 'x')

1000 loops, best of 3: 1.15 ms per loop


- 使用 memoryview 用数组代替dict

In [20]:
%%cython -a

cimport cython
cimport numpy as cnp
import numpy as np

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v6_cython(data, str y_name, str x_name):
  cdef long length = data.shape[0]
  cdef double[:] result = np.zeros(length)
  cdef long[:] xs = data[x_name].values
  cdef long[:] ys = data[y_name].values
  cdef long[:] value = np.zeros(10).astype(long)
  cdef long[:] count = np.zeros(10).astype(long)

  for i from 0 <= i < length by 1:
    x_v, y_v = xs[i], ys[i]
    value[x_v] += y_v
    count[x_v] += 1
    
  for i from 0 <= i < length by 1:
    # 映射
    result[i] = (value[xs[i]] - ys[i]) / (count[xs[i]] - 1)
  return result



In [21]:
result_v6 = target_mean_v6_cython(data, 'y', 'x')
result_v6[0]

0.5383104125736738

In [22]:
%%timeit
target_mean_v6_cython(data, 'y', 'x')

10000 loops, best of 3: 145 µs per loop


- 使用openmp中的prange代替for循环

In [23]:
%%cython -a

cimport cython
cimport numpy as cnp
import numpy as np
import cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7_cython(data, str y_name, str x_name):
  cdef long length = data.shape[0]
  cdef double[:] result = np.zeros(length)
  cdef long[:] xs = data[x_name].values
  cdef long[:] ys = data[y_name].values
  cdef long[:] value = np.zeros(10).astype(long)
  cdef long[:] count = np.zeros(10).astype(long)

  cdef int i = 0
  for i in prange(length, nogil=True):
    value[xs[i]] += ys[i]
    count[xs[i]] += 1
    
  for i in prange(length, nogil=True):
    # 映射
    result[i] = (value[xs[i]] - ys[i]) / (count[xs[i]] - 1)
  return result

In [24]:
%%timeit
target_mean_v6_cython

10000000 loops, best of 3: 20.8 ns per loop


### 汇总比较

In [25]:
# %timeit -r 100 
# %timeit:多次执行一条语句，并返回平均时间
# -n {N}：在一次循环中执行N次给定的代码（如果代码只跑一次可能这个时间太小了）
%timeit -n 1 target_mean_v1(data, 'y', 'x')
%timeit -n 100 target_mean_v2(data, 'y', 'x')
%timeit -n 100 target_mean_v3(data, 'y', 'x')
%timeit -n 100 target_mean_v4_cython(data, 'y', 'x')
%timeit -n 100 target_mean_v5_cython(data, 'y', 'x')
%timeit -n 100 target_mean_v6_cython(data, 'y', 'x')
%timeit -n 100 target_mean_v7_cython(data, 'y', 'x')

1 loop, best of 3: 24.4 s per loop
100 loops, best of 3: 272 ms per loop
100 loops, best of 3: 9.17 ms per loop
100 loops, best of 3: 1.13 ms per loop
100 loops, best of 3: 1.13 ms per loop
100 loops, best of 3: 143 µs per loop
100 loops, best of 3: 44.4 µs per loop
