In [30]:
import pandas as pd
import numpy as np

In [31]:
y = np.random.randint(2, size=(5000, 1))
x = np.random.randint(10, size=(5000, 1))
data = pd.DataFrame(np.concatenate([y, x], axis=1), columns=['y', 'x'])

In [32]:
from line_profiler import LineProfiler
def LineProfilerAnalysis(func):
    profile = LineProfiler(func) 
    profile.enable() 
    func(data, 'y', 'x')
    profile.disable() 
    profile.print_stats() 

## base_version
   * base version

In [33]:
def target_mean_v1(data, y_name, x_name):
    result = np.zeros(data.shape[0])
    for i in range(data.shape[0]):
        groupby_result = data[data.index != i].groupby([x_name], as_index=False).agg(['mean', 'count'])
        result[i] = groupby_result.loc[groupby_result.index == data.loc[i, x_name], (y_name, 'mean')]
    return result

In [34]:
%%timeit
target_mean_v1(data,'y','x')

20.6 s ± 2.3 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


## Python

### Version_2
   * using dict to store tmp_result
   * using numpy to replace pandas

In [35]:
def target_mean_v2(data,y_name, x_name):
    length = len(data)
    sum_dict = dict()
    count_dict = dict()
    x_array = data[x_name].values
    y_array = data[y_name].values
    # calculate total sum(count) of y_label 
    for i,val in enumerate(x_array):
        sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
        count_dict[val] = count_dict.get(val,0)+1
    # calculate the final result
    result = np.zeros(length)
    for i in range(length):
        result[i] = (sum_dict[x_array[i]]-y_array[i])/(count_dict[x_array[i]]-1)
    return result

In [36]:
%%timeit -n 1000
target_mean_v2(data,'y','x')

6.83 ms ± 165 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [37]:
LineProfilerAnalysis(target_mean_v2)

Timer unit: 1e-06 s

Total time: 0.018897 s
File: <ipython-input-35-4d4ff5652e5b>
Function: target_mean_v2 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v2(data,y_name, x_name):
     2         1         10.0     10.0      0.1      length = len(data)
     3         1          1.0      1.0      0.0      sum_dict = dict()
     4         1          0.0      0.0      0.0      count_dict = dict()
     5         1         29.0     29.0      0.2      x_array = data[x_name].values
     6         1          8.0      8.0      0.0      y_array = data[y_name].values
     7                                               # calculate total sum(count) of y_label 
     8      5001       2442.0      0.5     12.9      for i,val in enumerate(x_array):
     9      5000       4519.0      0.9     23.9          sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
    10      5000       3127.0      0.6     16.5          count

### Version_3
   * optimize the for loop

In [38]:
def target_mean_v3(data,y_name, x_name):
    length = len(data)
    sum_dict = dict()
    count_dict = dict()
    x_array = data[x_name].values
    y_array = data[y_name].values
    # calculate total sum(count) of y_label 
    for i,val in enumerate(x_array):
        sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
        count_dict[val] = count_dict.get(val,0)+1
    # calculate the final result
    numerator_array = np.array([sum_dict[x_array[i]]-y_array[i] for i in range(length)])
    denominator_array = np.array([count_dict[x_array[i]]-1 for i in range(length)])
    result = numerator_array / denominator_array
    return result

In [39]:
%%timeit -n 1000
target_mean_v3(data,'y','x')

6.44 ms ± 116 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [40]:
LineProfilerAnalysis(target_mean_v3)

Timer unit: 1e-06 s

Total time: 0.020286 s
File: <ipython-input-38-aa01368c91ec>
Function: target_mean_v3 at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           def target_mean_v3(data,y_name, x_name):
     2         1         11.0     11.0      0.1      length = len(data)
     3         1          2.0      2.0      0.0      sum_dict = dict()
     4         1          0.0      0.0      0.0      count_dict = dict()
     5         1         33.0     33.0      0.2      x_array = data[x_name].values
     6         1         11.0     11.0      0.1      y_array = data[y_name].values
     7                                               # calculate total sum(count) of y_label 
     8      5001       3934.0      0.8     19.4      for i,val in enumerate(x_array):
     9      5000       6487.0      1.3     32.0          sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
    10      5000       4692.0      0.9     23.1          count

## Cython

In [41]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


### Version_4

In [42]:
%%cython -a
cimport numpy as cnp
import numpy as np

def target_mean_v4(data,y_name,x_name):
    cdef:
        int length = len(data)
        dict sum_dict = {}
        dict count_dict = {}
        cnp.ndarray[cnp.int_t] x_array = data[x_name].values
        cnp.ndarray[cnp.int_t] y_array = data[y_name].values
    for i,val in enumerate(x_array):
        sum_dict[val] = sum_dict.get(val,0)+ y_array[i]
        count_dict[val] = count_dict.get(val,0)+1
    # calculate the final result
    numerator_array = np.array([sum_dict[x_array[i]]-y_array[i] for i in range(length)])
    denominator_array = np.array([count_dict[x_array[i]]-1 for i in range(length)])
    result = numerator_array / denominator_array
    return result

In [43]:
%%timeit -n 1000
target_mean_v4(data,'y','x')

5.53 ms ± 129 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


### Version_5

In [44]:
%%cython -a
import numpy as np
cimport numpy as cnp
import cython
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v5(data,cnp.str y_name,cnp.str x_name):
    cdef:
        int length = len(data)
        double[:,] result = np.zeros(length, dtype=np.float64)
        double[:,] sum_dict = np.zeros(10,dtype=np.float64)
        long[:,] count_dict = np.zeros(10,dtype=np.int64)
        long[:,] x_array = data[x_name].values
        long[:,] y_array = data[y_name].values
        int i = 1
        long x
    for i in prange(length,nogil=True):
        x = x_array[i]
        sum_dict[x] += y_array[i]
        count_dict[x] += 1
    
    for i in prange(length,nogil=True):
        x = x_array[i]
        result[i] = (sum_dict[x] - y_array[i]) / (count_dict[x] -1)
    return result

In [45]:
%%timeit -n 1000
target_mean_v5(data,'y','x')

25.6 µs ± 2.06 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
