In [12]:
import pandas as pd
import numpy as np
from pandarallel import pandarallel
# Requires the third-party `pandarallel` package; if it is missing, install it once with: %pip install pandarallel

In [7]:
def compute_trans_features(cus_group, n_months=13.0,
                           trans_types=('cash', 'cheque', 'visa', 'debit', 'amex')):
  """Build one row of monthly transaction features for a single customer.

  The rows of ``cus_group`` are summed per ``month`` and every feature is
  derived from those monthly totals. Each feature comes as an ``_avg`` /
  ``_std`` pair:

  * ``_avg`` = sum of the monthly values / ``n_months``. Months that have no
    rows in ``cus_group`` therefore contribute zero.
  * ``_std`` = sqrt(population variance over the months present
    * n_mon / ``n_months``), where ``n_mon`` is the number of distinct
    ``month`` values in ``cus_group``.
    NOTE(review): this is not identical to the standard deviation of the
    series zero-padded to ``n_months`` (the mean used is the one over the
    months present) -- confirm this is the intended definition.

  Parameters
  ----------
  cus_group : pandas.DataFrame
    Rows for one customer. Must contain the columns ``month``,
    ``trsactn_type``, ``in_amt``, ``in_cnt``, ``out_amt`` and ``out_cnt``.
    Any other columns are ignored.
  n_months : float, default 13.0
    Length of the observation window used as the denominator above.
  trans_types : sequence of str
    Values of ``trsactn_type`` for which per-type share features are built.

  Returns
  -------
  pandas.DataFrame
    A single row (index ``[0]``) with columns, in this order:
    ``in_per_cnt``, ``out_per_cnt`` (amount per transaction),
    ``in_amt``, ``out_amt`` (monthly totals), then for every type ``t`` in
    ``trans_types`` the share of that type in the monthly total:
    ``per_<t>_in_amt``, ``per_<t>_in_cnt``, ``per_<t>_out_amt``,
    ``per_<t>_out_cnt`` -- each expanded to ``_avg`` and ``_std``.

  Raises
  ------
  KeyError
    If one of the required columns is missing from ``cus_group``.
  """
  # NOTE(review): the previous revision also computed amt_ratio_*, cnt_ratio_*,
  # in_cnt_* and out_cnt_* statistics but never returned them. They are dropped
  # here as dead code; the returned columns are unchanged.
  value_cols = ['in_amt', 'in_cnt', 'out_amt', 'out_cnt']

  def monthly_totals(rows):
    # Sum only the four value columns: a bare groupby(...).sum() would also try
    # to "sum" strings/ids/dates, which is wasted work and fails or misbehaves
    # depending on the pandas version.
    return rows.groupby('month')[value_cols].sum()

  def safe_ratio(numer, denom):
    # Index-aligned division. Months missing on one side, 0/0 and x/0 all
    # become 0 so that a single bad month cannot turn a feature into NaN/inf.
    return (numer / denom).replace([np.inf, -np.inf], np.nan).fillna(0)

  by_mon = monthly_totals(cus_group)
  n_mon = len(cus_group['month'].unique())
  features = {}

  def add_stats(prefix, monthly):
    # Insertion order of `features` defines the output column order.
    features[prefix + '_avg'] = np.nansum(monthly) / n_months
    features[prefix + '_std'] = np.sqrt(np.nanstd(monthly) ** 2 * n_mon / n_months)

  # Average amount per transaction, then raw monthly amounts.
  add_stats('in_per_cnt', safe_ratio(by_mon['in_amt'], by_mon['in_cnt']))
  add_stats('out_per_cnt', safe_ratio(by_mon['out_amt'], by_mon['out_cnt']))
  add_stats('in_amt', by_mon['in_amt'])
  add_stats('out_amt', by_mon['out_amt'])

  # Share of each transaction type in the monthly totals.
  for trans_type in trans_types:
    by_type = monthly_totals(cus_group[cus_group['trsactn_type'] == trans_type])
    for col in value_cols:
      add_stats('per_%s_%s' % (trans_type, col), safe_ratio(by_type[col], by_mon[col]))

  return pd.DataFrame(features, index=[0])


In [8]:
# Load the transaction table from the parquet file (path is relative to the
# notebook's working directory).
df_trans = pd.read_parquet("bigdata2021data/transaction_train.parquet")

In [9]:
# Group the transactions by the `customer_id_mskd` column; the per-customer
# feature function is applied to these groups further down.
df_trans_by_cus = df_trans.groupby('customer_id_mskd')

In [13]:
# Set up pandarallel with 2 worker processes. This has to run before any
# `parallel_apply` call.
pandarallel.initialize(nb_workers=2)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [14]:
# Run compute_trans_features on every customer group using the pandarallel
# workers. Each call returns a one-row frame indexed [0], so the combined
# result is presumably indexed by (customer_id_mskd, 0) -- TODO confirm.
df_trans_features = df_trans_by_cus.parallel_apply(compute_trans_features)