In [41]:
%load_ext Cython

The Cython extension is already loaded. To reload it, use:
  %reload_ext Cython


In [7]:
import numpy as np
import pandas as pd

def target_mean_v1(data, y_name, x_name):
    """Leave-one-out target mean, computed the naive way (reference baseline).

    For every row, the mean of ``y_name`` is taken over all *other* rows that
    share that row's ``x_name`` value. A fresh groupby is run for each row, so
    the cost grows quadratically with the number of rows; this version exists
    as the correctness reference for the faster variants.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both columns. Any index is accepted; rows are addressed
        by position.
    y_name : str
        Label of the target column.
    x_name : str
        Label of the categorical column.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(data)``. An entry is NaN when no other
        row shares the row's category (there is nothing to average).
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    row_positions = np.arange(n_rows)
    x_values = data[x_name]
    for i in range(n_rows):
        # Positional mask: drops exactly row i even when the index is not 0..n-1.
        others = data.iloc[row_positions != i]
        # Aggregate only the target column; the result is indexed by category
        # value, so the lookup below does not depend on `as_index` behaviour.
        group_means = others.groupby(x_name)[y_name].mean()
        # .get returns a scalar (or NaN when the category vanished with row i),
        # instead of assigning a length-1 Series into a scalar slot.
        result[i] = group_means.get(x_values.iloc[i], np.nan)
    return result

In [8]:
def target_mean_v2(data, y_name, x_name):
    """Leave-one-out target mean using two passes and per-category totals.

    Pass 1 accumulates, for each ``x_name`` value, the sum of ``y_name`` and
    the number of rows. Pass 2 removes the row's own contribution from those
    totals and divides. Every cell is fetched through ``data.loc`` each time
    it is needed (no caching of the looked-up values), and rows are addressed
    by the labels ``0 .. n-1``.

    Parameters
    ----------
    data : pandas.DataFrame
    y_name : label of the target column
    x_name : label of the categorical column

    Returns
    -------
    numpy.ndarray
        Float array with one leave-one-out mean per row.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    sum_by_category = {}
    rows_by_category = {}

    # Pass 1: per-category sum of the target and row count.
    for row in range(n_rows):
        if data.loc[row, x_name] in sum_by_category:
            sum_by_category[data.loc[row, x_name]] += data.loc[row, y_name]
            rows_by_category[data.loc[row, x_name]] += 1
        else:
            sum_by_category[data.loc[row, x_name]] = data.loc[row, y_name]
            rows_by_category[data.loc[row, x_name]] = 1

    # Pass 2: subtract the row itself from its category totals.
    for row in range(n_rows):
        sum_without_row = sum_by_category[data.loc[row, x_name]] - data.loc[row, y_name]
        rows_without_row = rows_by_category[data.loc[row, x_name]] - 1
        result[row] = sum_without_row / rows_without_row
    return result

In [11]:
# Fixed seed so that "Restart & Run All" regenerates exactly the same toy data.
np.random.seed(42)
y = np.random.randint(2, size=(5000, 1))   # binary target, values in {0, 1}
x = np.random.randint(10, size=(5000, 1))  # category codes, values in 0..9

In [12]:
# One row per sample: target in column 'y', category code in column 'x'.
data = pd.DataFrame(
    data=np.concatenate((y, x), axis=1),
    columns=['y', 'x'],
)

In [None]:
# Run the per-row groupby implementation (v1) and the dictionary-based
# implementation (v2) on the same frame, keeping both outputs.
result_1 = target_mean_v1(data, 'y', 'x')
result_2 = target_mean_v2(data, 'y', 'x')

In [None]:
# Euclidean norm of the difference between the two implementations' outputs.
diff = np.linalg.norm(result_1 - result_2)
print(diff)
# Fail loudly if the implementations disagree instead of relying on someone
# eyeballing the printed norm (NaNs in matching positions compare equal).
np.testing.assert_allclose(result_1, result_2, rtol=1e-9, atol=1e-12)

In [None]:
%%timeit 
np.linalg.norm(result_1 - result_2)

The slowest run took 112.07 times longer than the fastest. This could mean that an intermediate result is being cached.
100000 loops, best of 3: 10.2 µs per loop


In [None]:
pip install line_profiler

In [None]:
%load_ext line_profiler

In [13]:
%lprun -f target_mean_v2 target_mean_v2(data, 'y', 'x')

In [14]:
%%timeit
target_mean_v2(data,'y','x')

1 loop, best of 3: 279 ms per loop


In [32]:
def target_mean_v3(data, y_name, x_name):
    """Leave-one-out target mean; like the two-pass dictionary approach, but
    each cell is read through ``data.loc`` only once per row and pass.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows are addressed by the labels ``0 .. n-1``.
    y_name : label of the target column
    x_name : label of the categorical column

    Returns
    -------
    numpy.ndarray
        Float array with one leave-one-out mean per row.
    """
    n_rows = data.shape[0]
    result = np.zeros(n_rows)
    group_sum, group_count = {}, {}

    # Pass 1: build per-category sum and row count.
    for row in range(n_rows):
        category = data.loc[row, x_name]
        target = data.loc[row, y_name]
        if category in group_sum:
            group_sum[category] += target
            group_count[category] += 1
        else:
            group_sum[category] = target
            group_count[category] = 1

    # Pass 2: remove the row's own target/count before dividing.
    for row in range(n_rows):
        category = data.loc[row, x_name]
        target = data.loc[row, y_name]
        result[row] = (group_sum[category] - target) / (group_count[category] - 1)
    return result

In [20]:
%lprun -f target_mean_v3 target_mean_v3(data, 'y', 'x')

In [33]:
%%timeit
target_mean_v3(data,'y','x')

10 loops, best of 3: 161 ms per loop


In [34]:
def target_mean_v4(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
    """Leave-one-out target mean working on the columns' raw value arrays.

    The two columns are pulled out once via ``.values`` and then walked in
    plain Python, so no pandas indexing happens inside the loops.

    Parameters
    ----------
    data : pandas.DataFrame
    y_name : label of the target column
    x_name : label of the categorical column

    Returns
    -------
    numpy.ndarray
        Float array with one leave-one-out mean per row.
    """
    categories = data[x_name].values
    targets = data[y_name].values
    result = np.zeros(data.shape[0])

    totals = {}
    occurrences = {}

    # Pass 1: per-category sum of targets and number of rows.
    for category, target in zip(categories, targets):
        if category in totals:
            totals[category] += target
            occurrences[category] += 1
        else:
            totals[category] = target
            occurrences[category] = 1

    # Pass 2: exclude the row itself from its category's totals.
    for position, (category, target) in enumerate(zip(categories, targets)):
        result[position] = (totals[category] - target) / (occurrences[category] - 1)
    return result
  

In [35]:
%%timeit
target_mean_v4(data,'y','x')

100 loops, best of 3: 8.16 ms per loop


In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd

def target_mean_v5(data:pd.DataFrame, y_name:str, x_name:str) -> np.ndarray:
  """Leave-one-out target mean: typed buffers for the arrays, dicts for totals.

  Parameters
  ----------
  data : pandas.DataFrame
  y_name : label of the target column; values are read as float64, so both
      integer and floating-point targets are accepted.
  x_name : label of the categorical column; must hold integer codes. Any
      integer width is accepted and widened to int64; a float column raises
      TypeError rather than being silently truncated.

  Returns
  -------
  numpy.ndarray
      float64 array with one leave-one-out mean per row.

  Raises
  ------
  ZeroDivisionError
      If a category occurs on a single row (no other rows to average).
  """
  cdef:
    Py_ssize_t data_shape = data.shape[0]
    # Typed index so the buffer accesses below compile to direct C indexing.
    Py_ssize_t i
    cnp.ndarray[cnp.float64_t] result = np.zeros(data_shape, dtype=np.float64)
    dict value_dict = {}
    dict count_dict = {}
    # Fixed-width int64 instead of C long: the buffer dtype no longer depends
    # on the platform's sizeof(long) or on the column being exactly that type.
    cnp.ndarray[cnp.int64_t] x_val_array = np.asarray(data[x_name].values).astype(np.int64, casting='same_kind')
    cnp.ndarray[cnp.float64_t] y_val_array = np.asarray(data[y_name].values, dtype=np.float64)
  # Pass 1: per-category sum of targets and row count.
  for i in range(data_shape):
    data_loc_x = x_val_array[i]
    data_loc_y = y_val_array[i]
    if data_loc_x not in value_dict:
      value_dict[data_loc_x] = data_loc_y
      count_dict[data_loc_x] = 1
    else:
      value_dict[data_loc_x] += data_loc_y
      count_dict[data_loc_x] += 1
  # Pass 2: remove the row's own contribution before dividing.
  for i in range(data_shape):
    data_loc_x = x_val_array[i]
    count = count_dict[data_loc_x] - 1
    result[i] = (value_dict[data_loc_x] - y_val_array[i]) / count
  return result

In [48]:
%%timeit
target_mean_v5(data,'y','x')
    

1000 loops, best of 3: 1.08 ms per loop


In [None]:
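# Use OpenMP from Cython (compile the cell with
#   %%cython --compile-args=-fopenmp --link-args=-fopenmp).
# Use prange instead of range for loops whose iterations are independent.
# Release the GIL inside those loops (prange(..., nogil=True)).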

In [None]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange

cpdef target_mean_v6(data, str y_name, str x_name):
  """Leave-one-out target mean with typed memoryviews and a parallel second pass.

  Per-category totals live in dense arrays indexed by the category code, so
  the ``x_name`` column must contain non-negative integer codes. The arrays
  are sized ``max(code) + 1``; very large codes therefore cost memory.

  Parameters
  ----------
  data : pandas.DataFrame
  y_name : label of the target column (read as float64).
  x_name : label of the categorical column (integer codes >= 0; a float
      column raises TypeError instead of being truncated).

  Returns
  -------
  numpy.ndarray
      float64 array with one leave-one-out mean per row.

  Raises
  ------
  ValueError
      If a category code is negative.
  ZeroDivisionError
      If a category occurs on a single row; the divisor is then zero and
      Cython's default checked division (cdivision=False) raises.
  """
  cdef:
    Py_ssize_t data_shape = data.shape[0]
    Py_ssize_t n_categories = 0
    Py_ssize_t i = 0
    cnp.int64_t x
    # 1-D output: one value per row (np.zeros(data.shape) would be 2-D).
    double[:] result = np.zeros(data_shape, dtype=np.float64)
    double[:] value_dict
    double[:] count_dict
    # Fresh, writable, fixed-width copies; independent of sizeof(long).
    cnp.int64_t[:] x_val_array = np.asarray(data[x_name].values).astype(np.int64, casting='same_kind')
    double[:] y_val_array = np.array(data[y_name].values, dtype=np.float64)

  # Validate the codes and find how many accumulator slots are needed.
  for i in range(data_shape):
    x = x_val_array[i]
    if x < 0:
      raise ValueError("x_name column must contain non-negative integer category codes")
    if x >= n_categories:
      n_categories = x + 1
  value_dict = np.zeros(n_categories, dtype=np.float64)
  count_dict = np.zeros(n_categories, dtype=np.float64)

  # Accumulation stays serial: many rows update the same slot, and prange
  # only turns in-place updates of scalars into reductions, not updates of
  # array elements, so running this loop in threads would be a data race.
  for i in range(data_shape):
    x = x_val_array[i]
    value_dict[x] += y_val_array[i]
    count_dict[x] += 1

  # Each iteration writes only result[i], so this pass is safe to parallelise.
  # (prange needs OpenMP compile/link flags to actually run multi-threaded.)
  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    result[i] = (value_dict[x] - y_val_array[i]) / (count_dict[x] - 1)

  # Hand back an ndarray like the other versions rather than a memoryview.
  return np.asarray(result)

In [None]:
%%timeit
target_mean_v6(data,'y','x')

In [59]:
%%cython -a

import numpy as np
cimport numpy as cnp
import pandas as pd
import cython
cimport cython
from cython.parallel import prange

@cython.boundscheck(False)
@cython.wraparound(False)
cpdef target_mean_v7(data, str y_name, str x_name):
  """Leave-one-out target mean: contiguous memoryviews, no bounds checks,
  parallel second pass.

  Per-category totals live in dense arrays indexed by the category code.
  Because bounds checking is switched off for this function, the codes are
  validated up front and the arrays are sized ``max(code) + 1`` (very large
  codes therefore cost memory).

  Parameters
  ----------
  data : pandas.DataFrame
  y_name : label of the target column (read as float64, so floating-point
      targets are not truncated).
  x_name : label of the categorical column; values are converted to int64
      and must be >= 0 after conversion.

  Returns
  -------
  numpy.ndarray
      float64 array with one leave-one-out mean per row.

  Raises
  ------
  ValueError
      If a category code is negative.
  ZeroDivisionError
      If a category occurs on a single row; the divisor is then zero and
      Cython's default checked division (cdivision=False) raises.
  """
  cdef:
    Py_ssize_t data_shape = data.shape[0]
    Py_ssize_t n_categories = 0
    Py_ssize_t i = 0
    cnp.int64_t x
    double[::1] result = np.zeros(data_shape, dtype=np.float64)
    double[::1] value_dict
    cnp.int64_t[::1] count_dict
    # np.array(..., order='C') always yields a fresh, writable, contiguous
    # buffer, which is what the [::1] memoryviews require; int64_t keeps the
    # element type independent of the platform's sizeof(long).
    cnp.int64_t[::1] x_val_array = np.array(data[x_name].values, dtype=np.int64, order='C')
    double[::1] y_val_array = np.array(data[y_name].values, dtype=np.float64, order='C')

  # With boundscheck/wraparound off, an out-of-range code would read or write
  # outside the accumulators, so check every code before indexing with it.
  for i in range(data_shape):
    x = x_val_array[i]
    if x < 0:
      raise ValueError("x_name column must contain non-negative integer category codes")
    if x >= n_categories:
      n_categories = x + 1
  value_dict = np.zeros(n_categories, dtype=np.float64)
  count_dict = np.zeros(n_categories, dtype=np.int64)

  # Accumulation stays serial: many rows update the same slot, and prange
  # only turns in-place updates of scalars into reductions, not updates of
  # array elements, so running this loop in threads would be a data race.
  for i in range(data_shape):
    x = x_val_array[i]
    value_dict[x] += y_val_array[i]
    count_dict[x] += 1

  # Each iteration writes only result[i], so this pass is safe to parallelise.
  # (prange needs OpenMP compile/link flags to actually run multi-threaded.)
  for i in prange(data_shape, nogil=True):
    x = x_val_array[i]
    result[i] = (value_dict[x] - y_val_array[i]) / (count_dict[x] - 1)

  # Hand back an ndarray like the other versions rather than a memoryview.
  return np.asarray(result)




In [58]:
%%timeit
target_mean_v7(data,'y','x')

The slowest run took 19.64 times longer than the fastest. This could mean that an intermediate result is being cached.
10000 loops, best of 3: 43.6 µs per loop
