In [1]:
import pandas as pd
import numpy as np


In [2]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [7]:
from line_profiler import LineProfiler
def LineProfilerAnalysis(func):
    profile = LineProfiler(func) 
    profile.enable() 
    func(data, 'y', 'x')
    profile.disable() 
    profile.print_stats() 

## base_version
   * base version

In [3]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [4]:
%%timeit
target_mean_v1(data,'y','x')

18.3 s ± 807 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Python

### Version_2
   * using dict to store tmp_result
   * using numpy to replace pandas

In [18]:
def target_mean_v2(data,y_name, x_name):
    length = len(data)
    sum_dict = dict()
    count_dict = dict()
    x_array = data[x_name].values
    y_array = data[y_name].values
    # calculate total sum(count) of y_label 
    for i,val in enumerate(x_array):
        sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
        count_dict[val] = count_dict.get(val,0)+1
    # calculate the final result
    result = np.zeros(length)
    for i in range(length):
        result[i] = (sum_dict[x_array[i]]-y_array[i])/(count_dict[x_array[i]]-1)
    return result

In [19]:
%%timeit -n 1000
target_mean_v2(data,'y','x')

6.5 ms ± 231 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [11]:
LineProfilerAnalysis(target_mean_v2)

Timer unit: 1e-06 s

Total time: 0.019898 s
File: <ipython-input-9-4d4ff5652e5b>
Function: target_mean_v2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v2(data,y_name, x_name):
     2         1         18.0     18.0      0.1      length = len(data)
     3         1          1.0      1.0      0.0      sum_dict = dict()
     4         1          1.0      1.0      0.0      count_dict = dict()
     5         1         73.0     73.0      0.4      x_array = data[x_name].values
     6         1         11.0     11.0      0.1      y_array = data[y_name].values
     7                                               # calculate total sum(count) of y_label 
     8      5001       2463.0      0.5     12.4      for i,val in enumerate(x_array):
     9      5000       4299.0      0.9     21.6          sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
    10      5000       3176.0      0.6     16.0          count_

### Version_3
   * optimize the for loop

In [15]:
def target_mean_v3(data,y_name, x_name):
    length = len(data)
    sum_dict = dict()
    count_dict = dict()
    x_array = data[x_name].values
    y_array = data[y_name].values
    # calculate total sum(count) of y_label 
    for i,val in enumerate(x_array):
        sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
        count_dict[val] = count_dict.get(val,0)+1
    # calculate the final result
    numerator_array = np.array([sum_dict[x_array[i]]-y_array[i] for i in range(length)])
    denominator_array = np.array([count_dict[x_array[i]]-1 for i in range(length)])
    result = numerator_array / denominator_array
    return result

In [17]:
%%timeit -n 1000
target_mean_v3(data,'y','x')

6.36 ms ± 143 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [20]:
LineProfilerAnalysis(target_mean_v3)

Timer unit: 1e-06 s

Total time: 0.020804 s
File: <ipython-input-15-aa01368c91ec>
Function: target_mean_v3 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v3(data,y_name, x_name):
     2         1         13.0     13.0      0.1      length = len(data)
     3         1          1.0      1.0      0.0      sum_dict = dict()
     4         1          1.0      1.0      0.0      count_dict = dict()
     5         1         38.0     38.0      0.2      x_array = data[x_name].values
     6         1         14.0     14.0      0.1      y_array = data[y_name].values
     7                                               # calculate total sum(count) of y_label 
     8      5001       4220.0      0.8     20.3      for i,val in enumerate(x_array):
     9      5000       6232.0      1.2     30.0          sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
    10      5000       4521.0      0.9     21.7          count

## Cython

*异常值处理*：分子为0时