# **1. Library**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# **2. Utils**

## **2.1. reduce_mem_usage**

In [None]:
%%time
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that holds their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place (columns are reassigned).
    verbose : bool, default True
        When true, print the final memory usage and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenience.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column whose extreme value is exactly the
            # dtype limit (e.g. -128 or 127 for int8) still fits that dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: float16 keeps only about 3 significant decimal digits, so
            # values are rounded even when the column's range fits.
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max are NaN/inf): keep full precision.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero-sized starting frame.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

CPU times: user 1 µs, sys: 3 µs, total: 4 µs
Wall time: 8.11 µs


## **2.2. longest_streak**

In [None]:
def longest_streak(arr):
    """
    Return the length of the longest run of consecutive 1s in ``arr``.

    Any element that is not equal to 1 ends the current run. An empty
    input yields 0.

    Adapted from:
    https://codereview.stackexchange.com/questions/138550/count-consecutive-ones-in-a-binary-list
    """
    best = 0
    current = 0
    for num in arr:
        if num == 1:
            current += 1
            if current > best:
                best = current
        else:
            # Always restart the run here; resetting only when the run was a
            # new record lets shorter runs merge across the gaps between them.
            current = 0
    return best

## **2.3. group_entropy**

In [None]:
def group_entropy(df, group, subgroup, cname, value, df_feats):
    """Add the entropy of the ``subgroup`` distribution within each ``group``.

    For every value of ``group`` the non-null entries of ``value`` are counted
    per subgroup, turned into shares ``p`` of the group's total count, and
    combined as ``sum(-p * log(p))`` (natural logarithm).

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing ``group``, ``subgroup`` and ``value`` columns.
    group : str
        Name of the column that defines the groups and the merge key.
    subgroup : str or list of str
        Column name(s) that split each group into subgroups.
    cname : str
        Name of the entropy column added to the result.
    value : str
        Column whose non-null entries are counted.
    df_feats : pandas.DataFrame
        Feature table holding a ``group`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df_feats`` left-merged with the new ``cname`` column. Rows whose
        group does not occur in ``df`` get NaN.
    """
    if isinstance(subgroup, list):
        full_group = [group, *subgroup]
    else:
        full_group = [group, subgroup]

    subgroup_counts = df.groupby(full_group)[value].count().reset_index()
    subgroup_counts.columns = full_group + ['subgroup_cnt']

    group_counts = df.groupby(group)[value].count().reset_index()
    group_counts.columns = [group, 'cnt']

    counts = group_counts.merge(subgroup_counts, on=group, how='left')

    # A subgroup with zero non-null values gives log(0) = -inf and then
    # inf * 0 = NaN; silence the numpy warnings and treat those terms as 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        neg_log_share = -np.log(counts['subgroup_cnt'] / counts['cnt'])
    # Assign the filled result back rather than calling fillna(inplace=True) on
    # a column selection, which does not update the frame under Copy-on-Write.
    counts['entropy'] = (neg_log_share * counts['subgroup_cnt'] / counts['cnt']).fillna(0)

    entropy_by_group = counts.groupby(group)['entropy'].sum().reset_index()
    entropy_by_group.columns = [group, cname]

    return df_feats.merge(entropy_by_group, on=group, how='left')

# **3. Analysis**

## **3.1. Pre-process**

In [None]:
# Load the existing feature table (one row per card_id in the preview below).
# The path is environment-specific; adjust it to your own data directory.
hist_feats = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/viettel/data/hist_transac.csv') # change here

In [None]:
# Preview the first rows of the loaded feature table.
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count,hist_transac_city_nunique,hist_transac_state_nunique,hist_transac_merchant_category_nunique,hist_transac_subsector_nunique,hist_transac_merchant_nunique,hist_transac_category_1_1_count,hist_transac_category_1_0_count,hist_transac_category_1_1_mean,hist_transac_category_1_1_std,hist_transac_category_2=1_count,hist_transac_category_2=1_mean,hist_transac_category_2=2_count,hist_transac_category_2=2_mean,hist_transac_category_2=3_count,hist_transac_category_2=3_mean,hist_transac_category_2=4_count,hist_transac_category_2=4_mean,hist_transac_category_2=5_count,hist_transac_category_2=5_mean,hist_transac_category_3=0_count,hist_transac_category_3=0_mean,hist_transac_category_3=1_count,hist_transac_category_3=1_mean,hist_transac_category_3=2_count,hist_transac_category_3=2_mean,hist_transac_category_3=3_count,hist_transac_category_3=3_mean,hist_transac_installments_mean,hist_transac_installments_sum,hist_transac_installments_max,hist_transac_installments_min,hist_transac_installments_std,hist_transac_installments_skew,hist_transac_monthlag_count_std,hist_transac_monthlag_count_max,hist_transac_monthlag_streak_max,hist_transac_approved_count,hist_transac_approved_mean,...,hist_transac_purchase_date_diff_hour_std,hist_transac_purchase_date_diff_sec_max,hist_transac_purchase_date_diff_day_max,hist_transac_purchase_date_diff_hour_max,hist_transac_purchase_date_diff_sec_min,hist_transac_purchase_date_diff_day_min,hist_transac_purchase_date_diff_hour_min,hist_purchase_date_last,hist_purchase_date_first,hist_purchase_date_diff_day,hist_purchase_count_ratio,hist_month_diff_mean,hist_month_diff_std,hist_month_diff_min,hist_month_diff_max,hist_transac_duration_mean,hist_transac_amount_month_ratio_mean,hist_transac_duration_std,hist_transac_amount_month_ratio_std,hist_transac_duration_min,hist_transac_amount_month_ratio_min,hist_transac_duration_max,hist_transac_amount_month_ratio_max,hist_transac_duration_skew,hist_transac_amount_month_ratio_ske
w,hist_transac_monthlag_nunique,hist_transac_monthlag_mean,hist_transac_monthlag_std,hist_transac_monthlag_min,hist_transac_monthlag_skew,hist_transac_month_lag=0_count,hist_transac_month_lag=0_mean,hist_transac_month_lag=-1_count,hist_transac_month_lag=-1_mean,hist_transac_month_lag=-2_count,hist_transac_month_lag=-2_mean,hist_transac_monthlag_0_-1_ratio,hist_transac_monthlag_0_-2_ratio,hist_transac_last_3_mon_count,hist_transac_last_3_mon_ratio
0,C_ID_00007093c1,149,4,3,18,13,29,28,121,0.187919,0.391965,28,0.187919,0,0.0,120,0.805369,0,0.0,1,0.006711,0,0.0,125,0.838926,24,0.161074,0,0,1.288591,192.0,6.0,1.0,0.7649,3.312481,5.457629,19.0,13,114,0.765101,...,8.637018,86049.0,33.0,23.0,20.0,0.0,0.0,2018-02-27 05:14:57,2017-02-14 14:00:43,377,0.39418,10.295302,0.457717,10,11,1570.681745,13.741052,1990.672301,18.004936,120.0,1.090909,15000.0,136.363636,3.89813,4.016048,13,-5.852349,3.453114,-12,0.135031,12,0.080537,10,0.067114,12,0.080537,1.090909,0.923077,34,0.226667
1,C_ID_0001238066,123,18,6,29,17,65,2,121,0.01626,0.126992,103,0.837398,0,0.0,0,0.0,0,0.0,20,0.162602,3,0.02439,88,0.715447,32,0.260163,0,0,1.634146,201.0,10.0,0.0,1.450089,3.220606,12.992179,37.0,6,120,0.97561,...,7.364404,85624.0,8.0,23.0,0.0,0.0,0.0,2018-02-27 16:18:59,2017-09-28 22:25:14,151,0.809211,10.162602,0.370511,10,11,1063.659187,9.456149,1267.672061,11.503708,80.0,0.727273,10082.2,91.656364,4.019542,4.082893,6,-1.813008,1.28898,-5,-0.239327,24,0.195122,26,0.211382,37,0.300813,0.888889,0.631579,87,0.701613
2,C_ID_0001506ef0,66,3,2,19,12,28,0,66,0.0,0.0,2,0.030303,0,0.0,64,0.969697,0,0.0,0,0.0,65,0.984848,1,0.015152,0,0.0,0,0,0.015152,1.0,1.0,0.0,0.123091,8.124038,4.564555,18.0,10,62,0.939394,...,8.795295,86273.0,50.0,23.0,25.0,0.0,0.0,2018-02-17 12:33:56,2017-01-14 16:16:01,398,0.165414,10.227273,0.422282,10,11,1512.507424,13.235398,3208.373908,28.120284,46.97,0.355833,14910.0,135.545455,2.664332,2.728336,13,-4.833333,4.2375,-13,-0.723706,6,0.090909,7,0.106061,18,0.272727,0.75,0.315789,31,0.462687
3,C_ID_0001793786,216,10,4,48,24,119,2,214,0.009259,0.096001,125,0.578704,76,0.351852,15,0.069444,0,0.0,0,0.0,211,0.976852,5,0.023148,0,0.0,0,0,0.023148,5.0,1.0,0.0,0.150723,6.386653,15.415847,38.0,10,189,0.875,...,7.692843,86311.0,25.0,23.0,24.0,0.0,0.0,2017-10-31 20:20:18,2017-01-21 10:15:21,283,0.760563,14.175926,0.381642,14,15,5438.681019,25.304234,7858.525843,36.83051,14.0,0.066667,49389.2,235.186667,2.953311,3.046109,10,-3.328704,2.306373,-9,-0.410039,20,0.092593,38,0.175926,34,0.157407,0.512821,0.571429,92,0.423963
4,C_ID_000183fdda,144,9,7,36,21,73,4,140,0.027778,0.164909,11,0.076389,1,0.006944,131,0.909722,0,0.0,1,0.006944,4,0.027778,103,0.715278,37,0.256944,0,0,1.861111,268.0,10.0,0.0,2.08763,2.801324,12.368873,28.0,7,137,0.951389,...,7.533026,85815.0,14.0,23.0,4.0,0.0,0.0,2018-02-25 20:57:08,2017-08-07 09:49:14,202,0.70936,10.159722,0.367627,10,11,1806.194931,16.138059,3522.536676,31.561706,66.0,0.5,23370.0,212.454545,3.821535,3.862588,7,-2.451389,1.895264,-6,-0.235599,28,0.194444,28,0.194444,26,0.180556,0.965517,1.037037,82,0.565517


In [None]:
# Read data
hist = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/viettel/data/historical_transactions.csv') # change here

# Reduce memory usage
hist = reduce_mem_usage(hist)

# Replace inf and the sentinel 'installments' values (-1, 999) by nan values.
# Column-level results are assigned back instead of using inplace=True on a
# column selection, which does not update the frame under pandas Copy-on-Write.
hist.replace([-np.inf, np.inf], np.nan, inplace=True)
hist['installments'] = hist['installments'].replace([-1, 999], np.nan)

# Fill nan values of categorical features with fixed replacement values
# (presumably each column's mode -- verify against the data)
hist['category_2'] = hist['category_2'].fillna(1)
hist['category_3'] = hist['category_3'].fillna('A')
hist['merchant_id'] = hist['merchant_id'].fillna('M_ID_00a6ca8a8a')
hist['installments'] = hist['installments'].fillna(0)

# Encode categorical features as integer codes (the encoder is re-fitted per column)
cols = ['authorized_flag', 'category_1', 'category_3']
lbl_encoder = LabelEncoder()
for c in cols:
    hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

# Transfer data to datetime type and order transactions chronologically
hist['purchase_date'] = pd.to_datetime(hist['purchase_date'])
hist = hist.sort_values('purchase_date')

# One-hot categorical features: month_lag, category_2, category_3
# month_lag indicators for 0, -1, ..., -6
for lag in range(0, -7, -1):
    hist['month_lag={}'.format(lag)] = (hist['month_lag'] == lag).astype(int)

# category_2 indicators for the levels 1.0 .. 5.0
for level in range(1, 6):
    hist['category_2={}'.format(level)] = (hist['category_2'] == float(level)).astype(int)

# category_3 indicators for the label-encoded codes 0 .. 3
for code in range(4):
    hist['category_3={}'.format(code)] = (hist['category_3'] == code).astype(int)

Mem. usage decreased to 1749.11 Mb (43.7% reduction)


In [None]:
# Rescale 'purchase_amount': cast to float64 first (the column was downcast
# earlier), apply the fixed linear transform x / 0.00150265118 + 497.06 and
# round to 2 decimals.
# NOTE(review): the constants presumably undo the dataset's own normalisation -- confirm.
hist['purchase_amount'] = (
    hist['purchase_amount'].astype(np.float64) / 0.00150265118 + 497.06
).round(2)

In [None]:
# Preview the pre-processed transactions (earliest purchase_date first after the sort above).
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_lag=0,month_lag=-1,month_lag=-2,month_lag=-3,month_lag=-4,month_lag=-5,month_lag=-6,category_2=1,category_2=2,category_2=3,category_2=4,category_2=5,category_3=0,category_3=1,category_3=2,category_3=3
7289521,1,C_ID_da2090f28e,69,0,0.0,0,623,M_ID_f001319a61,-11,40.0,2017-01-01 00:00:08,1.0,9,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
18512762,1,C_ID_efced389a0,76,0,1.0,1,842,M_ID_18038b5ae7,-12,120.0,2017-01-01 00:00:59,3.0,2,37,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
14942234,1,C_ID_83561fe74a,233,0,1.0,1,661,M_ID_52d3026407,-13,124.9,2017-01-01 00:01:41,1.0,9,8,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
28659693,1,C_ID_479fd6392a,-1,1,1.0,1,839,M_ID_e5374dabc0,-1,6.0,2017-01-01 00:02:03,1.0,-1,29,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
20004812,1,C_ID_1cf6056088,69,0,0.0,0,278,M_ID_2cf6dc1f6f,-4,500.0,2017-01-01 00:02:12,1.0,9,37,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0


## **3.2. Feature Engineering**

### **3.2.4. merchant**

In [None]:
# Revisited transactions: per-merchant repurchase statistics, aggregated per card.

# Create hist_merchant table: number of transactions per merchant.
# NOTE(review): 'merchant_customer_count' counts transaction rows, not distinct
# customers; the name is kept because the ratios below are existing features.
hist_merchant = hist.groupby(['merchant_id']).size().reset_index()
hist_merchant.columns = ['merchant_id', 'merchant_customer_count']

# Create hist_merchant_card table: number of transactions per (merchant, card) pair
hist_merchant_card = hist.groupby(['merchant_id', 'card_id']).size().reset_index()
hist_merchant_card.columns = ['merchant_id', 'card_id', 'customer_visit_count']

# Keep only returning customers (more than one transaction at the merchant)
hist_merchant_card = hist_merchant_card.loc[hist_merchant_card['customer_visit_count'] > 1]

# Create hist_merchant_repurchase_binary table: returning customers per merchant
hist_merchant_repurchase_binary = hist_merchant_card.groupby(['merchant_id']).size().reset_index()
hist_merchant_repurchase_binary.columns = ['merchant_id', 'revisited_customers']

# Create hist_merchant_repurchase_exact table: transactions made by returning customers
hist_merchant_repurchase_exact = hist_merchant_card.groupby(['merchant_id'])['customer_visit_count'].sum().reset_index()
hist_merchant_repurchase_exact.columns = ['merchant_id', 'revisited_count']

# Processing
hist_merchant = hist_merchant.merge(hist_merchant_repurchase_binary, on=['merchant_id'], how='left')
hist_merchant = hist_merchant.merge(hist_merchant_repurchase_exact, on=['merchant_id'], how='left')

# Merchants without any returning customer are missing from the two repurchase
# tables, so the NaNs only appear after the left merges. Fill them here so those
# merchants get a ratio of 0 instead of NaN.
repurchase_cols = ['revisited_customers', 'revisited_count']
hist_merchant[repurchase_cols] = hist_merchant[repurchase_cols].fillna(0.0)

hist_merchant['repurchase_customer_ratio'] = hist_merchant['revisited_customers'] / hist_merchant['merchant_customer_count']
hist_merchant['repurchase_ratio'] = hist_merchant['revisited_count'] / hist_merchant['merchant_customer_count']

# Drop columns left behind by an earlier run of this cell; otherwise the merge
# would produce '_x'/'_y' duplicates and the aggregation below would fail.
hist = hist.drop(
    columns=['merchant_customer_count', 'revisited_customers', 'revisited_count',
             'repurchase_customer_ratio', 'repurchase_ratio'],
    errors='ignore',
)
hist = hist.merge(hist_merchant, on=['merchant_id'], how='left')

# Aggregate once per card, then align on card_id instead of relying on
# hist_feats having exactly the same cards in the same (sorted) row order.
card_merchant_stats = hist.groupby('card_id')[['repurchase_customer_ratio', 'repurchase_ratio']].agg(['mean', 'std', 'max', 'min'])
for m in ['mean', 'std', 'max', 'min']:
    hist_feats['merchant_repurchase_customer_ratio_{}'.format(m)] = hist_feats['card_id'].map(card_merchant_stats[('repurchase_customer_ratio', m)])
    hist_feats['merchant_repurchase_ratio_{}'.format(m)] = hist_feats['card_id'].map(card_merchant_stats[('repurchase_ratio', m)])

In [None]:
# Release the intermediate merchant tables; the cells that follow do not use them.
del hist_merchant, hist_merchant_card, hist_merchant_repurchase_binary, hist_merchant_repurchase_exact

In [None]:
# Impute the remaining missing feature values with each column's median.
# The filled column is assigned back rather than using fillna(inplace=True) on
# a column selection, which leaves hist_feats unchanged under Copy-on-Write.
for col in hist_feats.columns[hist_feats.isnull().any()]:
    hist_feats[col] = hist_feats[col].fillna(hist_feats[col].median())

In [None]:
# Preview the feature table with the new merchant_repurchase_* columns appended.
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count,hist_transac_city_nunique,hist_transac_state_nunique,hist_transac_merchant_category_nunique,hist_transac_subsector_nunique,hist_transac_merchant_nunique,hist_transac_category_1_1_count,hist_transac_category_1_0_count,hist_transac_category_1_1_mean,hist_transac_category_1_1_std,hist_transac_category_2=1_count,hist_transac_category_2=1_mean,hist_transac_category_2=2_count,hist_transac_category_2=2_mean,hist_transac_category_2=3_count,hist_transac_category_2=3_mean,hist_transac_category_2=4_count,hist_transac_category_2=4_mean,hist_transac_category_2=5_count,hist_transac_category_2=5_mean,hist_transac_category_3=0_count,hist_transac_category_3=0_mean,hist_transac_category_3=1_count,hist_transac_category_3=1_mean,hist_transac_category_3=2_count,hist_transac_category_3=2_mean,hist_transac_category_3=3_count,hist_transac_category_3=3_mean,hist_transac_installments_mean,hist_transac_installments_sum,hist_transac_installments_max,hist_transac_installments_min,hist_transac_installments_std,hist_transac_installments_skew,hist_transac_monthlag_count_std,hist_transac_monthlag_count_max,hist_transac_monthlag_streak_max,hist_transac_approved_count,hist_transac_approved_mean,...,hist_purchase_date_first,hist_purchase_date_diff_day,hist_purchase_count_ratio,hist_month_diff_mean,hist_month_diff_std,hist_month_diff_min,hist_month_diff_max,hist_transac_duration_mean,hist_transac_amount_month_ratio_mean,hist_transac_duration_std,hist_transac_amount_month_ratio_std,hist_transac_duration_min,hist_transac_amount_month_ratio_min,hist_transac_duration_max,hist_transac_amount_month_ratio_max,hist_transac_duration_skew,hist_transac_amount_month_ratio_skew,hist_transac_monthlag_nunique,hist_transac_monthlag_mean,hist_transac_monthlag_std,hist_transac_monthlag_min,hist_transac_monthlag_skew,hist_transac_month_lag=0_count,hist_transac_month_lag=0_mean,hist_transac_month_lag=-1_count,hist_transac_month_lag=-1_mean,hist_transac_month_lag=-2_count,hist_transac_
month_lag=-2_mean,hist_transac_monthlag_0_-1_ratio,hist_transac_monthlag_0_-2_ratio,hist_transac_last_3_mon_count,hist_transac_last_3_mon_ratio,merchant_repurchase_customer_ratio_mean,merchant_repurchase_ratio_mean,merchant_repurchase_customer_ratio_std,merchant_repurchase_ratio_std,merchant_repurchase_customer_ratio_max,merchant_repurchase_ratio_max,merchant_repurchase_customer_ratio_min,merchant_repurchase_ratio_min
0,C_ID_00007093c1,149,4,3,18,13,29,28,121,0.187919,0.391965,28,0.187919,0,0.0,120,0.805369,0,0.0,1,0.006711,0,0.0,125,0.838926,24,0.161074,0,0,1.288591,192.0,6.0,1.0,0.7649,3.312481,5.457629,19.0,13,114,0.765101,...,2017-02-14 14:00:43,377,0.39418,10.295302,0.457717,10,11,1570.681745,13.741052,1990.672301,18.004936,120.0,1.090909,15000.0,136.363636,3.89813,4.016048,13,-5.852349,3.453114,-12,0.135031,12,0.080537,10,0.067114,12,0.080537,1.090909,0.923077,34,0.226667,0.128674,0.811013,0.042918,0.12853,0.254181,0.97123,0.083333,0.3125
1,C_ID_0001238066,123,18,6,29,17,65,2,121,0.01626,0.126992,103,0.837398,0,0.0,0,0.0,0,0.0,20,0.162602,3,0.02439,88,0.715447,32,0.260163,0,0,1.634146,201.0,10.0,0.0,1.450089,3.220606,12.992179,37.0,6,120,0.97561,...,2017-09-28 22:25:14,151,0.809211,10.162602,0.370511,10,11,1063.659187,9.456149,1267.672061,11.503708,80.0,0.727273,10082.2,91.656364,4.019542,4.082893,6,-1.813008,1.28898,-5,-0.239327,24,0.195122,26,0.211382,37,0.300813,0.888889,0.631579,87,0.701613,0.138738,0.750012,0.037288,0.196286,0.3,0.925272,0.0625,0.125
2,C_ID_0001506ef0,66,3,2,19,12,28,0,66,0.0,0.0,2,0.030303,0,0.0,64,0.969697,0,0.0,0,0.0,65,0.984848,1,0.015152,0,0.0,0,0,0.015152,1.0,1.0,0.0,0.123091,8.124038,4.564555,18.0,10,62,0.939394,...,2017-01-14 16:16:01,398,0.165414,10.227273,0.422282,10,11,1512.507424,13.235398,3208.373908,28.120284,46.97,0.355833,14910.0,135.545455,2.664332,2.728336,13,-4.833333,4.2375,-13,-0.723706,6,0.090909,7,0.106061,18,0.272727,0.75,0.315789,31,0.462687,0.144347,0.790385,0.041737,0.159611,0.333333,0.954894,0.090909,0.272727
3,C_ID_0001793786,216,10,4,48,24,119,2,214,0.009259,0.096001,125,0.578704,76,0.351852,15,0.069444,0,0.0,0,0.0,211,0.976852,5,0.023148,0,0.0,0,0,0.023148,5.0,1.0,0.0,0.150723,6.386653,15.415847,38.0,10,189,0.875,...,2017-01-21 10:15:21,283,0.760563,14.175926,0.381642,14,15,5438.681019,25.304234,7858.525843,36.83051,14.0,0.066667,49389.2,235.186667,2.953311,3.046109,10,-3.328704,2.306373,-9,-0.410039,20,0.092593,38,0.175926,34,0.157407,0.512821,0.571429,92,0.423963,0.167546,0.659638,0.051718,0.203681,0.375,0.982902,0.065476,0.181818
4,C_ID_000183fdda,144,9,7,36,21,73,4,140,0.027778,0.164909,11,0.076389,1,0.006944,131,0.909722,0,0.0,1,0.006944,4,0.027778,103,0.715278,37,0.256944,0,0,1.861111,268.0,10.0,0.0,2.08763,2.801324,12.368873,28.0,7,137,0.951389,...,2017-08-07 09:49:14,202,0.70936,10.159722,0.367627,10,11,1806.194931,16.138059,3522.536676,31.561706,66.0,0.5,23370.0,212.454545,3.821535,3.862588,7,-2.451389,1.895264,-6,-0.235599,28,0.194444,28,0.194444,26,0.180556,0.965517,1.037037,82,0.565517,0.178198,0.656061,0.036421,0.173186,0.26,0.954894,0.076923,0.153846


# **4. Save data**

In [None]:
# Persist the extended feature table without the index column.
# NOTE: this is the same path hist_feats was read from, so the input file is overwritten.
hist_feats.to_csv('/content/drive/MyDrive/Colab Notebooks/viettel/data/hist_transac.csv', index=False)