# **1. Library**

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# **2. Utils**

## **2.1. reduce_mem_usage**

In [None]:
%%time
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Columns whose dtype is one of int16/int32/int64/float16/float32/float64 are
    inspected; every other column (strings, datetimes, int8, unsigned ints, ...)
    is left untouched. Integer columns are cast to the first of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). Float columns are cast to float16 or float32 when the
    range fits, otherwise float64.

    Note: the float check only looks at the value *range*; casting to
    float16/float32 can still lose precision.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative reduction.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for call chaining.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 does fit int8.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range does not fit (or min/max are NaN/inf): fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-byte baseline does not produce a nan percentage.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

CPU times: user 2 µs, sys: 5 µs, total: 7 µs
Wall time: 11 µs


## **2.2. longest_streak**

In [None]:
def longest_streak(arr):
    """Return the length of the longest run of consecutive ones in ``arr``.

    Adapted from
    https://codereview.stackexchange.com/questions/138550/count-consecutive-ones-in-a-binary-list

    Parameters
    ----------
    arr : iterable
        Sequence of values (e.g. a list or 1-D numpy array). Elements equal to
        1 extend the current run; any other value ends it.

    Returns
    -------
    int
        Length of the longest run of ones, 0 for an empty input.
    """
    best = 0
    current = 0
    for num in arr:
        if num == 1:
            current += 1
            if current > best:
                best = current
        else:
            # Always restart the run on a break. Resetting only when the run
            # was a new maximum would glue a shorter run onto the next one.
            current = 0
    return best

## **2.3. group_entropy**

In [None]:
def group_entropy(df, group, subgroup, cname, value, df_feats):
    """Add the entropy of ``subgroup`` within each ``group`` as a column of ``df_feats``.

    For every ``group`` key, the non-null ``value`` entries are counted per
    ``subgroup`` key and in total; with p = subgroup count / group count the
    feature is ``sum(-ln(p) * p)`` over the group's subgroups (natural log).

    Parameters
    ----------
    df : pandas.DataFrame
        Row-level data containing ``group``, ``subgroup`` and ``value`` columns.
    group : str
        Column identifying the outer group (also the merge key).
    subgroup : str or list/tuple of str
        Column(s) identifying the categories whose distribution is measured.
    cname : str
        Name of the entropy column added to the result.
    value : str
        Column whose non-null entries are counted.
    df_feats : pandas.DataFrame
        Feature frame with a ``group`` column; it is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df_feats`` left-merged with the new ``cname`` column (NaN for group
        keys absent from ``df``).
    """
    if isinstance(subgroup, (list, tuple)):
        full_group = [group] + list(subgroup)
    else:
        full_group = [group, subgroup]

    # Count of non-null `value` per (group, subgroup) and per group.
    gp_1 = df.groupby(full_group)[value].count().reset_index()
    gp_1.columns = full_group + ['subgroup_cnt']

    gp_2 = df.groupby(group)[value].count().reset_index()
    gp_2.columns = [group, 'cnt']

    gp_3 = gp_2.merge(gp_1, on=group, how='left')

    # log(0) and 0/0 are expected for empty cells; they become NaN and are zeroed below.
    with np.errstate(divide='ignore', invalid='ignore'):
        entropy_terms = -np.log(gp_3['subgroup_cnt'] / gp_3['cnt']) * gp_3['subgroup_cnt'] / gp_3['cnt']
    # Assign the filled result back: a chained `gp_3['entropy'].fillna(..., inplace=True)`
    # works on a temporary under pandas Copy-on-Write and would leave NaNs in place.
    gp_3['entropy'] = entropy_terms.fillna(0)

    gp_4 = gp_3.groupby(group)['entropy'].sum().reset_index()
    gp_4.columns = [group, cname]

    return df_feats.merge(gp_4, on=group, how='left')

# **3. Analysis**

## **3.1. Pre-process**

In [None]:
# Read data
hist = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/viettel/data/historical_transactions.csv') # change here

# Reduce memory usage
hist = reduce_mem_usage(hist)

# Replace inf and the sentinel installment codes (-1, 999) by NaN.
# Column-level results are assigned back instead of using chained
# `hist[col].method(..., inplace=True)`, which operates on a temporary under
# pandas Copy-on-Write and would silently leave `hist` unchanged.
hist.replace([-np.inf, np.inf], np.nan, inplace=True)
hist['installments'] = hist['installments'].replace(-1, np.nan).replace(999, np.nan)

# Fill NaN values of categorical features with fixed replacement values
# (described as the column modes in the original notebook).
hist['category_2'] = hist['category_2'].fillna(1)
hist['category_3'] = hist['category_3'].fillna('A')
hist['merchant_id'] = hist['merchant_id'].fillna('M_ID_00a6ca8a8a')
hist['installments'] = hist['installments'].fillna(0)

# Encode categorical features as integer codes (the encoder is refit per column)
cols = ['authorized_flag', 'category_1', 'category_3']
lbl_encoder = LabelEncoder()
for c in cols:
    hist[c] = lbl_encoder.fit_transform(hist[c].astype(str))

# Convert purchase_date to datetime and order transactions chronologically
hist['purchase_date'] = pd.to_datetime(hist['purchase_date'])
hist = hist.sort_values('purchase_date')

# One-hot categorical features: month_lag (0 .. -6), category_2 (1 .. 5), category_3 (codes 0 .. 3)
for lag in range(0, -7, -1):
    hist['month_lag={}'.format(lag)] = (hist['month_lag'] == lag).astype(int)

for level in range(1, 6):
    hist['category_2={}'.format(level)] = (hist['category_2'] == float(level)).astype(int)

for code in range(4):
    hist['category_3={}'.format(code)] = (hist['category_3'] == code).astype(int)

Mem. usage decreased to 1749.11 Mb (43.7% reduction)


In [None]:
# Normalize the 'purchase_amount' values: cast to float64, apply the fixed
# affine rescaling (divide by 0.00150265118, add 497.06) and round to 2 decimals.
# NOTE(review): presumably this undoes a scaling applied to the raw data — confirm the constants.
amount_f64 = hist['purchase_amount'].astype(np.float64)
hist['purchase_amount'] = np.round(amount_f64 / 0.00150265118 + 497.06, 2)

In [None]:
# One row per card_id (sorted by card_id) with its number of historical transactions.
hist_feats = hist.groupby(['card_id']).size().reset_index(name='hist_transac_count')

In [None]:
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_lag=0,month_lag=-1,month_lag=-2,month_lag=-3,month_lag=-4,month_lag=-5,month_lag=-6,category_2=1,category_2=2,category_2=3,category_2=4,category_2=5,category_3=0,category_3=1,category_3=2,category_3=3
7289521,1,C_ID_da2090f28e,69,0,0.0,0,623,M_ID_f001319a61,-11,40.0,2017-01-01 00:00:08,1.0,9,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
18512762,1,C_ID_efced389a0,76,0,1.0,1,842,M_ID_18038b5ae7,-12,120.0,2017-01-01 00:00:59,3.0,2,37,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
14942234,1,C_ID_83561fe74a,233,0,1.0,1,661,M_ID_52d3026407,-13,124.9,2017-01-01 00:01:41,1.0,9,8,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
28659693,1,C_ID_479fd6392a,-1,1,1.0,1,839,M_ID_e5374dabc0,-1,6.0,2017-01-01 00:02:03,1.0,-1,29,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
20004812,1,C_ID_1cf6056088,69,0,0.0,0,278,M_ID_2cf6dc1f6f,-4,500.0,2017-01-01 00:02:12,1.0,9,37,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0


In [None]:
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count
0,C_ID_00007093c1,149
1,C_ID_0001238066,123
2,C_ID_0001506ef0,66
3,C_ID_0001793786,216
4,C_ID_000183fdda,144


## **3.2. Feature Engineering**

### **3.2.1. info**

In [None]:
# Group once and reuse: `hist` is not modified in this cell, so every per-card
# aggregate can share the same grouping instead of re-grouping for each feature.
# Results are assigned through `.values`, which relies on `hist_feats` rows being
# in the same (sorted card_id) order as the groupby output.
by_card = hist.groupby('card_id')

# Count unique values for city_id, state_id, merchant_category_id, subsector_id, merchant_id
for c in ['city', 'state', 'merchant_category', 'subsector', 'merchant']:
    hist_feats['hist_transac_{}_nunique'.format(c)] = by_card['{}_id'.format(c)].nunique().values

# Encoded category_1 (0/1): sum(), mean(), std()
category_1_by_card = by_card['category_1']
hist_feats['hist_transac_category_1_1_count'] = category_1_by_card.sum().values
hist_feats['hist_transac_category_1_0_count'] = hist_feats['hist_transac_count'].values - hist_feats['hist_transac_category_1_1_count'].values
hist_feats['hist_transac_category_1_1_mean'] = category_1_by_card.mean().values
hist_feats['hist_transac_category_1_1_std'] = category_1_by_card.std().values

# One-hot category_2, category_3: sum(), mean()
for c in ['category_2=1', 'category_2=2', 'category_2=3', 'category_2=4', 'category_2=5',
          'category_3=0', 'category_3=1', 'category_3=2', 'category_3=3']:
    hist_feats['hist_transac_{}_count'.format(c)] = by_card[c].sum().values
    hist_feats['hist_transac_{}_mean'.format(c)] = by_card[c].mean().values

# Installments statistics
installments_by_card = by_card['installments']
for m in ['mean', 'sum', 'max', 'min', 'std', 'skew']:
    hist_feats['hist_transac_installments_{}'.format(m)] = installments_by_card.agg(m).values

# Number of purchases per (card, month_lag); months without purchases count as 0
hist_monthsum_count = hist.groupby(['card_id', 'month_lag'])['purchase_amount'].count().unstack().fillna(0.0).reset_index()
month_counts = hist_monthsum_count.iloc[:, 1:]  # drop the card_id column
hist_feats['hist_transac_monthlag_count_std'] = month_counts.std(axis=1).values
hist_feats['hist_transac_monthlag_count_max'] = month_counts.max(axis=1).values
hist_have_purchase = (month_counts != 0).astype(int).values
# Longest run of consecutive months with at least one purchase
hist_feats['hist_transac_monthlag_streak_max'] = np.apply_along_axis(longest_streak, 1, hist_have_purchase)

# Authorized_flag, we don't consider this in new transaction table
authorized_by_card = by_card['authorized_flag']
hist_feats['hist_transac_approved_count'] = authorized_by_card.sum().values
hist_feats['hist_transac_approved_mean'] = authorized_by_card.mean().values
hist_feats['hist_transac_denied_count'] = hist_feats['hist_transac_count'].values - hist_feats['hist_transac_approved_count'].values

# Group entropy
for c in ['merchant_category_id', 'subsector_id', 'merchant_id', 'city_id', 'state_id', 'category_1', 'category_2', 'category_3', 'month_lag']:
    hist_feats = group_entropy(hist, 'card_id', c, 'hist_transac_{}_entropy'.format(c), 'purchase_amount', hist_feats)

In [None]:
# Impute remaining NaNs with the column median. The filled column is assigned
# back: a chained `hist_feats[c].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write and would leave `hist_feats` unchanged.
for c in hist_feats.columns[hist_feats.isnull().sum() != 0].values:
    hist_feats[c] = hist_feats[c].fillna(hist_feats[c].median())

In [None]:
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_lag=0,month_lag=-1,month_lag=-2,month_lag=-3,month_lag=-4,month_lag=-5,month_lag=-6,category_2=1,category_2=2,category_2=3,category_2=4,category_2=5,category_3=0,category_3=1,category_3=2,category_3=3
7289521,1,C_ID_da2090f28e,69,0,0.0,0,623,M_ID_f001319a61,-11,40.0,2017-01-01 00:00:08,1.0,9,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0
18512762,1,C_ID_efced389a0,76,0,1.0,1,842,M_ID_18038b5ae7,-12,120.0,2017-01-01 00:00:59,3.0,2,37,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
14942234,1,C_ID_83561fe74a,233,0,1.0,1,661,M_ID_52d3026407,-13,124.9,2017-01-01 00:01:41,1.0,9,8,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0
28659693,1,C_ID_479fd6392a,-1,1,1.0,1,839,M_ID_e5374dabc0,-1,6.0,2017-01-01 00:02:03,1.0,-1,29,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
20004812,1,C_ID_1cf6056088,69,0,0.0,0,278,M_ID_2cf6dc1f6f,-4,500.0,2017-01-01 00:02:12,1.0,9,37,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0


In [None]:
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count,hist_transac_city_nunique,hist_transac_state_nunique,hist_transac_merchant_category_nunique,hist_transac_subsector_nunique,hist_transac_merchant_nunique,hist_transac_category_1_1_count,hist_transac_category_1_0_count,hist_transac_category_1_1_mean,hist_transac_category_1_1_std,hist_transac_category_2=1_count,hist_transac_category_2=1_mean,hist_transac_category_2=2_count,hist_transac_category_2=2_mean,hist_transac_category_2=3_count,hist_transac_category_2=3_mean,hist_transac_category_2=4_count,hist_transac_category_2=4_mean,hist_transac_category_2=5_count,hist_transac_category_2=5_mean,hist_transac_category_3=0_count,hist_transac_category_3=0_mean,hist_transac_category_3=1_count,hist_transac_category_3=1_mean,hist_transac_category_3=2_count,hist_transac_category_3=2_mean,hist_transac_category_3=3_count,hist_transac_category_3=3_mean,hist_transac_installments_mean,hist_transac_installments_sum,hist_transac_installments_max,hist_transac_installments_min,hist_transac_installments_std,hist_transac_installments_skew,hist_transac_monthlag_count_std,hist_transac_monthlag_count_max,hist_transac_monthlag_streak_max,hist_transac_approved_count,hist_transac_approved_mean,hist_transac_denied_count,hist_transac_merchant_category_id_entropy,hist_transac_subsector_id_entropy,hist_transac_merchant_id_entropy,hist_transac_city_id_entropy,hist_transac_state_id_entropy,hist_transac_category_1_entropy,hist_transac_category_2_entropy,hist_transac_category_3_entropy,hist_transac_month_lag_entropy
0,C_ID_00007093c1,149,4,3,18,13,29,28,121,0.187919,0.391965,28,0.187919,0,0.0,120,0.805369,0,0.0,1,0.006711,0,0.0,125,0.838926,24,0.161074,0,0,1.288591,192.0,6.0,1.0,0.7649,3.312481,5.457629,19.0,13,114,0.765101,35,2.032974,1.916544,2.607847,0.88493,0.522062,0.483192,0.522062,0.441446,2.48199
1,C_ID_0001238066,123,18,6,29,17,65,2,121,0.01626,0.126992,103,0.837398,0,0.0,0,0.0,0,0.0,20,0.162602,3,0.02439,88,0.715447,32,0.260163,0,0,1.634146,201.0,10.0,0.0,1.450089,3.220606,12.992179,37.0,6,120,0.97561,3,2.489648,2.151944,3.553355,2.173489,0.815597,0.083103,0.443959,0.680436,1.605133
2,C_ID_0001506ef0,66,3,2,19,12,28,0,66,0.0,0.0,2,0.030303,0,0.0,64,0.969697,0,0.0,0,0.0,65,0.984848,1,0.015152,0,0.0,0,0,0.015152,1.0,1.0,0.0,0.123091,8.124038,4.564555,18.0,10,62,0.939394,4,2.30427,1.987715,2.862484,0.21384,0.135794,0.0,0.135794,0.078516,2.267843
3,C_ID_0001793786,216,10,4,48,24,119,2,214,0.009259,0.096001,125,0.578704,76,0.351852,15,0.069444,0,0.0,0,0.0,211,0.976852,5,0.023148,0,0.0,0,0,0.023148,5.0,1.0,0.0,0.150723,6.386653,15.415847,38.0,10,189,0.875,27,3.250401,2.606678,4.415834,1.573236,1.041667,0.052569,0.86928,0.11005,2.072621
4,C_ID_000183fdda,144,9,7,36,21,73,4,140,0.027778,0.164909,11,0.076389,1,0.006944,131,0.909722,0,0.0,1,0.006944,4,0.027778,103,0.715278,37,0.256944,0,0,1.861111,268.0,10.0,0.0,2.08763,2.801324,12.368873,28.0,7,137,0.951389,7,3.070986,2.506768,3.966664,0.67817,0.514169,0.126931,0.351565,0.688381,1.84263


### **3.2.2. amount**

In [None]:
# Purchase_amount statistics per card (group once, aggregate many times)
amount_by_card = hist.groupby('card_id')['purchase_amount']
for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
    hist_feats['hist_transac_amount_{}'.format(m)] = amount_by_card.agg(m).values
hist_feats['hist_transac_amount_diff'] = hist_feats['hist_transac_amount_max'].values - hist_feats['hist_transac_amount_min'].values

# Total purchase amount in recent months: sum over the last i month_lag columns
hist_monthsum_amount = hist.groupby(['card_id', 'month_lag'])['purchase_amount'].sum().unstack(fill_value=0.0).reset_index()
for i in range(1, 7):
    hist_feats['hist_transac_monthlag_last_{}_amount'.format(i)] = hist_monthsum_amount.iloc[:, -i:].sum(axis=1).values
# Ratio (and log2 ratio) between the last-j-months and last-i-months totals, j > i.
# Division by a zero total is expected; infinities are mapped to NaN right after.
with np.errstate(divide='ignore', invalid='ignore'):
    for i in range(1, 6):
        for j in range(i + 1, 7):
            ratio_col = 'hist_transac_monthlag_last_{}_{}_amount_ratio'.format(j, i)
            log_ratio_col = 'hist_transac_monthlag_last_{}_{}_amount_log_ratio'.format(j, i)
            hist_feats[ratio_col] = hist_feats['hist_transac_monthlag_last_{}_amount'.format(j)].values / hist_feats['hist_transac_monthlag_last_{}_amount'.format(i)].values
            hist_feats[ratio_col] = hist_feats[ratio_col].replace([np.inf, -np.inf], np.nan)
            hist_feats[log_ratio_col] = np.log2(hist_feats[ratio_col])

# Purchase_amount decay
# Per-row transaction count of the card. `transform` overwrites the column, so
# re-running this cell is safe; a merge would create `_x`/`_y` duplicates on re-run.
hist['hist_transac_count'] = hist.groupby('card_id')['card_id'].transform('size')
# Keep the fresh 0..n-1 row index that the previous merge-based version produced.
hist = hist.reset_index(drop=True)

# 1-based position of each transaction within its card (hist is sorted by purchase_date)
hist['transac_seq_num'] = hist.groupby('card_id').cumcount() + 1
# NOTE(review): with a 1-based sequence number the extra "- 1" makes the latest
# transaction's exponent -1 (weight 0.8 ** -1 = 1.25) rather than 0 — confirm intended.
hist['transac_seq_num_desc'] = hist['hist_transac_count'] - hist['transac_seq_num'] - 1
hist['transac_decay'] = 0.8 ** hist['transac_seq_num_desc'].values
hist['transac_amount_decay'] = hist['purchase_amount'] * hist['transac_decay']
hist['transac_month_decay'] = 1.2 ** hist['month_lag'] + 1.
hist['transac_amount_month_decay'] = hist['purchase_amount'] * hist['transac_month_decay']

# Re-group after the new columns exist and `hist` has been rebound.
by_card = hist.groupby('card_id')
for m in ['sum', 'mean', 'max', 'min', 'median', 'std', 'skew']:
    hist_feats['hist_transac_amount_decay_{}'.format(m)] = by_card['transac_amount_decay'].agg(m).values
    hist_feats['hist_transac_amount_month_decay_{}'.format(m)] = by_card['transac_amount_month_decay'].agg(m).values
for m in ['mean', 'sum']:
    hist_feats['hist_transac_decay_{}'.format(m)] = by_card['transac_decay'].agg(m).values

# Two-stage aggregation of purchase_amount per card, used with 'category_1', 'category_2', 'category_3',
# 'installments', 'city_id', 'merchant_category_id', 'merchant_id', 'subsector_id'
def successive_aggregates(df, field1, field2):
    """Aggregate ``field2`` per (card_id, field1), then summarise those means per card.

    Stage 1 takes the mean of ``field2`` for every (card_id, ``field1``) pair.
    Stage 2 computes mean/min/max/std of the stage-1 means for each card_id.

    Returns a DataFrame with a ``card_id`` column plus four columns named
    ``hist_transac_<field1>_<field2>_<stat>``.
    """
    pair_means = df.groupby(['card_id', field1])[field2].mean().reset_index()
    card_stats = pair_means.groupby('card_id')[field2].agg(['mean', 'min', 'max', 'std'])
    prefix = 'hist_transac_' + field1 + '_' + field2 + '_'
    card_stats.columns = [prefix + stat for stat in card_stats.columns.values]
    return card_stats.reset_index()

# Compute and merge the two-stage purchase_amount aggregates one field at a time.
# The field order fixes the resulting column order in hist_feats. Looping also
# avoids keeping eight intermediate frames alive at once.
for field in ['category_1', 'installments', 'city_id', 'merchant_category_id',
              'merchant_id', 'subsector_id', 'category_2', 'category_3']:
    field_aggregates = successive_aggregates(hist, field, 'purchase_amount')
    hist_feats = hist_feats.merge(field_aggregates, on=['card_id'], how='left')

  if sys.path[0] == '':
  if sys.path[0] == '':


In [None]:
# Impute remaining NaNs with the column median. The filled column is assigned
# back: a chained `hist_feats[c].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write and would leave `hist_feats` unchanged.
for c in hist_feats.columns[hist_feats.isnull().sum() != 0].values:
    hist_feats[c] = hist_feats[c].fillna(hist_feats[c].median())

In [None]:
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_lag=0,month_lag=-1,month_lag=-2,month_lag=-3,month_lag=-4,month_lag=-5,month_lag=-6,category_2=1,category_2=2,category_2=3,category_2=4,category_2=5,category_3=0,category_3=1,category_3=2,category_3=3,hist_transac_count,transac_seq_num,transac_seq_num_desc,transac_decay,transac_amount_decay,transac_month_decay,transac_amount_month_decay
0,1,C_ID_da2090f28e,69,0,0.0,0,623,M_ID_f001319a61,-11,40.0,2017-01-01 00:00:08,1.0,9,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,288,1,286,1.921924e-28,7.687697e-27,1.134588,45.383519
1,1,C_ID_efced389a0,76,0,1.0,1,842,M_ID_18038b5ae7,-12,120.0,2017-01-01 00:00:59,3.0,2,37,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,136,1,134,1.0329e-13,1.23948e-11,1.112157,133.458799
2,1,C_ID_83561fe74a,233,0,1.0,1,661,M_ID_52d3026407,-13,124.9,2017-01-01 00:01:41,1.0,9,8,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,408,1,406,4.513721e-40,5.637637999999999e-38,1.093464,136.573638
3,1,C_ID_479fd6392a,-1,1,1.0,1,839,M_ID_e5374dabc0,-1,6.0,2017-01-01 00:02:03,1.0,-1,29,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,11,1,9,0.1342177,0.8053064,1.833333,11.0
4,1,C_ID_1cf6056088,69,0,0.0,0,278,M_ID_2cf6dc1f6f,-4,500.0,2017-01-01 00:02:12,1.0,9,37,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,131,1,129,3.15216e-13,1.57608e-10,1.482253,741.126543


In [None]:
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count,hist_transac_city_nunique,hist_transac_state_nunique,hist_transac_merchant_category_nunique,hist_transac_subsector_nunique,hist_transac_merchant_nunique,hist_transac_category_1_1_count,hist_transac_category_1_0_count,hist_transac_category_1_1_mean,hist_transac_category_1_1_std,hist_transac_category_2=1_count,hist_transac_category_2=1_mean,hist_transac_category_2=2_count,hist_transac_category_2=2_mean,hist_transac_category_2=3_count,hist_transac_category_2=3_mean,hist_transac_category_2=4_count,hist_transac_category_2=4_mean,hist_transac_category_2=5_count,hist_transac_category_2=5_mean,hist_transac_category_3=0_count,hist_transac_category_3=0_mean,hist_transac_category_3=1_count,hist_transac_category_3=1_mean,hist_transac_category_3=2_count,hist_transac_category_3=2_mean,hist_transac_category_3=3_count,hist_transac_category_3=3_mean,hist_transac_installments_mean,hist_transac_installments_sum,hist_transac_installments_max,hist_transac_installments_min,hist_transac_installments_std,hist_transac_installments_skew,hist_transac_monthlag_count_std,hist_transac_monthlag_count_max,hist_transac_monthlag_streak_max,hist_transac_approved_count,hist_transac_approved_mean,...,hist_transac_amount_decay_median,hist_transac_amount_month_decay_median,hist_transac_amount_decay_std,hist_transac_amount_month_decay_std,hist_transac_amount_decay_skew,hist_transac_amount_month_decay_skew,hist_transac_decay_mean,hist_transac_decay_sum,hist_transac_category_1_purchase_amount_mean,hist_transac_category_1_purchase_amount_min,hist_transac_category_1_purchase_amount_max,hist_transac_category_1_purchase_amount_std,hist_transac_installments_purchase_amount_mean,hist_transac_installments_purchase_amount_min,hist_transac_installments_purchase_amount_max,hist_transac_installments_purchase_amount_std,hist_transac_city_id_purchase_amount_mean,hist_transac_city_id_purchase_amount_min,hist_transac_city_id_purchase_amount_max,hist_transac_city_id_purchase_amount_std
,hist_transac_merchant_category_id_purchase_amount_mean,hist_transac_merchant_category_id_purchase_amount_min,hist_transac_merchant_category_id_purchase_amount_max,hist_transac_merchant_category_id_purchase_amount_std,hist_transac_merchant_id_purchase_amount_mean,hist_transac_merchant_id_purchase_amount_min,hist_transac_merchant_id_purchase_amount_max,hist_transac_merchant_id_purchase_amount_std,hist_transac_subsector_id_purchase_amount_mean,hist_transac_subsector_id_purchase_amount_min,hist_transac_subsector_id_purchase_amount_max,hist_transac_subsector_id_purchase_amount_std,hist_transac_category_2_purchase_amount_mean,hist_transac_category_2_purchase_amount_min,hist_transac_category_2_purchase_amount_max,hist_transac_category_2_purchase_amount_std,hist_transac_category_3_purchase_amount_mean,hist_transac_category_3_purchase_amount_min,hist_transac_category_3_purchase_amount_max,hist_transac_category_3_purchase_amount_std
0,C_ID_00007093c1,149,4,3,18,13,29,28,121,0.187919,0.391965,28,0.187919,0,0.0,120,0.805369,0,0.0,1,0.006711,0,0.0,125,0.838926,24,0.161074,0,0,1.288591,192.0,6.0,1.0,0.7649,3.312481,5.457629,19.0,13,114,0.765101,...,5.391989e-06,133.489798,18.258366,269.336834,4.543892,3.732748,0.041946,6.25,102.706637,20.781786,184.631488,115.859235,360.138093,114.9688,527.0,192.432773,154.969371,20.781786,236.0,93.12507,212.432666,20.781786,966.666667,229.605789,191.146672,20.0,966.666667,191.006012,179.380754,20.781786,386.615385,117.965525,146.995067,20.781786,236.0,112.330165,235.6344,114.9688,356.3,170.646928
1,C_ID_0001238066,123,18,6,29,17,65,2,121,0.01626,0.126992,103,0.837398,0,0.0,0,0.0,0,0.0,20,0.162602,3,0.02439,88,0.715447,32,0.260163,0,0,1.634146,201.0,10.0,0.0,1.450089,3.220606,12.992179,37.0,6,120,0.97561,...,9.499761e-05,112.0,14.231634,228.514004,5.136044,4.28508,0.050813,6.25,350.328285,96.84157,603.815,358.48435,306.509701,61.920341,1008.22,283.588279,129.121627,28.8,452.0,118.813963,160.945458,10.5,1008.22,221.545848,119.837707,8.0,1008.22,158.673976,207.0488,10.5,1008.22,266.011557,114.448311,100.572621,128.324,19.623188,213.717475,61.920341,381.343333,160.298701
2,C_ID_0001506ef0,66,3,2,19,12,28,0,66,0.0,0.0,2,0.030303,0,0.0,64,0.969697,0,0.0,0,0.0,65,0.984848,1,0.015152,0,0.0,0,0,0.015152,1.0,1.0,0.0,0.123091,8.124038,4.564555,18.0,10,62,0.939394,...,0.0364257,42.864212,97.486259,535.874532,7.249163,3.277542,0.094697,6.249997,148.162727,148.162727,148.162727,61.625087,87.774923,25.5,150.049846,88.070041,61.888836,12.0,154.416508,80.213267,60.842565,4.27,325.4648,73.524223,60.911561,4.27,596.785385,110.988535,62.260658,4.27,325.4648,86.403247,85.720625,19.25,152.19125,94.003659,87.774923,25.5,150.049846,88.070041
3,C_ID_0001793786,216,10,4,48,24,119,2,214,0.009259,0.096001,125,0.578704,76,0.351852,15,0.069444,0,0.0,0,0.0,211,0.976852,5,0.023148,0,0.0,0,0,0.023148,5.0,1.0,0.0,0.150723,6.386653,15.415847,38.0,10,189,0.875,...,2.472838e-08,273.640556,55.06489,887.628662,7.863958,3.337331,0.028935,6.25,194.150047,1.0,387.300093,273.155416,218.528209,45.314,391.742417,244.961883,374.280114,1.0,955.65,297.385775,475.892804,1.0,3527.8,589.993812,393.936227,1.0,3527.8,600.234393,391.916515,1.0,1475.58,364.193844,460.689452,346.51296,641.51,158.388403,218.528209,45.314,391.742417,244.961883
4,C_ID_000183fdda,144,9,7,36,21,73,4,140,0.027778,0.164909,11,0.076389,1,0.006944,131,0.909722,0,0.0,1,0.006944,4,0.027778,103,0.715278,37,0.256944,0,0,1.861111,268.0,10.0,0.0,2.08763,2.801324,12.368873,28.0,7,137,0.951389,...,2.282832e-05,100.0,27.248743,618.040685,8.500531,4.070709,0.043403,6.25,566.182964,156.148429,976.2175,579.876401,470.376915,72.249709,1411.855,469.847025,206.257815,31.0,976.2175,293.488299,255.864103,14.5,2337.0,437.929785,197.792197,14.5,2337.0,377.126694,346.207151,20.946667,2337.0,549.487181,210.981619,89.23,405.351818,136.186482,258.008822,72.249709,470.206757,200.291566


### **3.2.3. time**

In [None]:
# ---------------------------------------------------------------------------
# Step 1: row-level columns on `hist` (same columns and order as before)
# ---------------------------------------------------------------------------

# Calendar parts of the purchase timestamp
purchase_dt = hist['purchase_date'].dt
hist['year'] = purchase_dt.year
hist['month'] = purchase_dt.month
# `Series.dt.weekofyear` is deprecated and removed in pandas 2.x;
# `isocalendar().week` is its replacement (cast back to a plain integer dtype).
hist['woy'] = purchase_dt.isocalendar().week.astype('int64')
hist['doy'] = purchase_dt.dayofyear
hist['wday'] = purchase_dt.dayofweek
hist['day'] = purchase_dt.day
hist['hour'] = purchase_dt.hour
date_part_cols = ['year', 'month', 'woy', 'doy', 'wday', 'day', 'hour']

# Special days in a year: days remaining until the event when it is 1..99 days
# ahead of the purchase, otherwise 0 (vectorised instead of a per-row apply).
special_days = [
    ('ChristmasDay_2017', '2017-12-25'),
    ('FathersDay_2017', '2017-08-13'),
    ('ChildrenDay_2017', '2017-10-12'),
    ('BlackFriday_2017', '2017-11-24'),
    ('ValentineDay_2017', '2017-06-12'),
    ('MothersDay_2018', '2018-05-13'),
]
for day_name, day_date in special_days:
    days_until = (pd.to_datetime(day_date) - hist['purchase_date']).dt.days
    hist[day_name] = days_until.where((days_until > 0) & (days_until < 100), 0).values

# Weekend flag (Saturday/Sunday)
hist['is_weekend'] = (hist['purchase_date'].dt.weekday >= 5).astype(int)

# Gap between 2 consecutive purchases of the same card (hist is sorted by purchase_date)
hist['prev_1_purchase_date'] = hist.groupby('card_id')['purchase_date'].shift(1)
purchase_gap = hist['purchase_date'] - hist['prev_1_purchase_date']
hist['purchase_date_diff_days'] = purchase_gap.dt.days.values
# NOTE(review): `.dt.seconds` is the seconds *component* of the gap (0..86399),
# not the total number of seconds; the hours column below inherits that. Confirm intended.
hist['purchase_date_diff_seconds'] = purchase_gap.dt.seconds.values
hist['purchase_date_diff_hours'] = hist['purchase_date_diff_seconds'].values // 3600

# Months difference relative to the fixed reference date 2018-12-31, shifted by month_lag
hist['month_diff'] = (pd.to_datetime('2018-12-31') - pd.to_datetime(hist['purchase_date'])).dt.days // 30
hist['month_diff'] += hist['month_lag']

hist['duration'] = hist['purchase_amount'].values * hist['month_diff'].values
hist['amount_month_ratio'] = hist['purchase_amount'].values / (1. + hist['month_diff'].values)

# ---------------------------------------------------------------------------
# Step 2: per-card aggregates into `hist_feats` (same columns and order as before).
# The grouping is built once, after all row-level columns exist.
# ---------------------------------------------------------------------------
by_card = hist.groupby('card_id')

# Features of day
for m in ['nunique', 'mean', 'min', 'max']:
    for c in date_part_cols:
        hist_feats['hist_transac_{}_{}'.format(c, m)] = by_card[c].agg(m).values

# Special days in a year
for day_name, _ in special_days:
    hist_feats['hist_transac_{}_mean'.format(day_name)] = by_card[day_name].mean().values

# Weekend
hist_feats['hist_transac_purchase_weekend_count'] = by_card['is_weekend'].sum().values
hist_feats['hist_transac_purchase_weekend_mean'] = by_card['is_weekend'].mean().values

# Gap between 2 consecutive purchase_date values
for m in ['mean', 'std', 'max', 'min']:
    hist_feats['hist_transac_purchase_date_diff_sec_{}'.format(m)] = by_card['purchase_date_diff_seconds'].agg(m).values
    hist_feats['hist_transac_purchase_date_diff_day_{}'.format(m)] = by_card['purchase_date_diff_days'].agg(m).values
    hist_feats['hist_transac_purchase_date_diff_hour_{}'.format(m)] = by_card['purchase_date_diff_hours'].agg(m).values

# Days difference between the first and last purchase_date
hist_feats['hist_purchase_date_last'] = by_card['purchase_date'].max().values
hist_feats['hist_purchase_date_first'] = by_card['purchase_date'].min().values
hist_feats['hist_purchase_date_diff_day'] = (pd.to_datetime(hist_feats['hist_purchase_date_last']) - pd.to_datetime(hist_feats['hist_purchase_date_first'])).dt.days.values
hist_feats['hist_purchase_count_ratio'] = hist_feats['hist_transac_count'].values / (1. + hist_feats['hist_purchase_date_diff_day'].values)

# Months difference
for m in ['mean', 'std', 'min', 'max']:
    hist_feats['hist_month_diff_{}'.format(m)] = by_card['month_diff'].agg(m).values

for m in ['mean', 'std', 'min', 'max', 'skew']:
    hist_feats['hist_transac_duration_{}'.format(m)] = by_card['duration'].agg(m).values
    hist_feats['hist_transac_amount_month_ratio_{}'.format(m)] = by_card['amount_month_ratio'].agg(m).values

# Month_lag
for m in ['nunique', 'mean', 'std', 'min', 'skew']:
    hist_feats['hist_transac_monthlag_{}'.format(m)] = by_card['month_lag'].agg(m).values

for c in ['month_lag=0', 'month_lag=-1', 'month_lag=-2']:
    hist_feats['hist_transac_{}_count'.format(c)] = by_card[c].sum().values
    hist_feats['hist_transac_{}_mean'.format(c)] = by_card[c].mean().values
hist_feats['hist_transac_monthlag_0_-1_ratio'] = hist_feats['hist_transac_month_lag=0_count'].values / (1. + hist_feats['hist_transac_month_lag=-1_count'].values)
hist_feats['hist_transac_monthlag_0_-2_ratio'] = hist_feats['hist_transac_month_lag=0_count'].values / (1. + hist_feats['hist_transac_month_lag=-2_count'].values)
hist_feats['hist_transac_last_3_mon_count'] = hist_feats['hist_transac_month_lag=0_count'].values + hist_feats['hist_transac_month_lag=-1_count'].values + hist_feats['hist_transac_month_lag=-2_count'].values
hist_feats['hist_transac_last_3_mon_ratio'] = hist_feats['hist_transac_last_3_mon_count'].values / (1. + hist_feats['hist_transac_count'].values)

  after removing the cwd from sys.path.


In [None]:
# Impute remaining NaNs with the column median. The filled column is assigned
# back: a chained `hist_feats[c].fillna(..., inplace=True)` acts on a temporary
# under pandas Copy-on-Write and would leave `hist_feats` unchanged.
for c in hist_feats.columns[hist_feats.isnull().sum() != 0].values:
    hist_feats[c] = hist_feats[c].fillna(hist_feats[c].median())

In [None]:
hist.head()

Unnamed: 0,authorized_flag,card_id,city_id,category_1,installments,category_3,merchant_category_id,merchant_id,month_lag,purchase_amount,purchase_date,category_2,state_id,subsector_id,month_lag=0,month_lag=-1,month_lag=-2,month_lag=-3,month_lag=-4,month_lag=-5,month_lag=-6,category_2=1,category_2=2,category_2=3,category_2=4,category_2=5,category_3=0,category_3=1,category_3=2,category_3=3,hist_transac_count,transac_seq_num,transac_seq_num_desc,transac_decay,transac_amount_decay,transac_month_decay,transac_amount_month_decay,year,month,woy,doy,wday,day,hour,ChristmasDay_2017,FathersDay_2017,ChildrenDay_2017,BlackFriday_2017,ValentineDay_2017,MothersDay_2018,is_weekend,prev_1_purchase_date,purchase_date_diff_days,purchase_date_diff_seconds,purchase_date_diff_hours,month_diff,duration,amount_month_ratio
0,1,C_ID_da2090f28e,69,0,0.0,0,623,M_ID_f001319a61,-11,40.0,2017-01-01 00:00:08,1.0,9,4,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,288,1,286,1.921924e-28,7.687697e-27,1.134588,45.383519,2017,1,52,1,6,1,0,0,0,0,0,0,0,1,NaT,,,,13,520.0,2.857143
1,1,C_ID_efced389a0,76,0,1.0,1,842,M_ID_18038b5ae7,-12,120.0,2017-01-01 00:00:59,3.0,2,37,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,136,1,134,1.0329e-13,1.23948e-11,1.112157,133.458799,2017,1,52,1,6,1,0,0,0,0,0,0,0,1,NaT,,,,12,1440.0,9.230769
2,1,C_ID_83561fe74a,233,0,1.0,1,661,M_ID_52d3026407,-13,124.9,2017-01-01 00:01:41,1.0,9,8,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,408,1,406,4.513721e-40,5.637637999999999e-38,1.093464,136.573638,2017,1,52,1,6,1,0,0,0,0,0,0,0,1,NaT,,,,11,1373.9,10.408333
3,1,C_ID_479fd6392a,-1,1,1.0,1,839,M_ID_e5374dabc0,-1,6.0,2017-01-01 00:02:03,1.0,-1,29,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,11,1,9,0.1342177,0.8053064,1.833333,11.0,2017,1,52,1,6,1,0,0,0,0,0,0,0,1,NaT,,,,23,138.0,0.25
4,1,C_ID_1cf6056088,69,0,0.0,0,278,M_ID_2cf6dc1f6f,-4,500.0,2017-01-01 00:02:12,1.0,9,37,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,131,1,129,3.15216e-13,1.57608e-10,1.482253,741.126543,2017,1,52,1,6,1,0,0,0,0,0,0,0,1,NaT,,,,20,10000.0,23.809524


In [None]:
# Display the first rows of the per-card feature frame `hist_feats` as a
# quick visual check before it is written to disk.
hist_feats.head()

Unnamed: 0,card_id,hist_transac_count,hist_transac_city_nunique,hist_transac_state_nunique,hist_transac_merchant_category_nunique,hist_transac_subsector_nunique,hist_transac_merchant_nunique,hist_transac_category_1_1_count,hist_transac_category_1_0_count,hist_transac_category_1_1_mean,hist_transac_category_1_1_std,hist_transac_category_2=1_count,hist_transac_category_2=1_mean,hist_transac_category_2=2_count,hist_transac_category_2=2_mean,hist_transac_category_2=3_count,hist_transac_category_2=3_mean,hist_transac_category_2=4_count,hist_transac_category_2=4_mean,hist_transac_category_2=5_count,hist_transac_category_2=5_mean,hist_transac_category_3=0_count,hist_transac_category_3=0_mean,hist_transac_category_3=1_count,hist_transac_category_3=1_mean,hist_transac_category_3=2_count,hist_transac_category_3=2_mean,hist_transac_category_3=3_count,hist_transac_category_3=3_mean,hist_transac_installments_mean,hist_transac_installments_sum,hist_transac_installments_max,hist_transac_installments_min,hist_transac_installments_std,hist_transac_installments_skew,hist_transac_monthlag_count_std,hist_transac_monthlag_count_max,hist_transac_monthlag_streak_max,hist_transac_approved_count,hist_transac_approved_mean,...,hist_transac_purchase_date_diff_hour_std,hist_transac_purchase_date_diff_sec_max,hist_transac_purchase_date_diff_day_max,hist_transac_purchase_date_diff_hour_max,hist_transac_purchase_date_diff_sec_min,hist_transac_purchase_date_diff_day_min,hist_transac_purchase_date_diff_hour_min,hist_purchase_date_last,hist_purchase_date_first,hist_purchase_date_diff_day,hist_purchase_count_ratio,hist_month_diff_mean,hist_month_diff_std,hist_month_diff_min,hist_month_diff_max,hist_transac_duration_mean,hist_transac_amount_month_ratio_mean,hist_transac_duration_std,hist_transac_amount_month_ratio_std,hist_transac_duration_min,hist_transac_amount_month_ratio_min,hist_transac_duration_max,hist_transac_amount_month_ratio_max,hist_transac_duration_skew,hist_transac_amount_month_ratio_ske
w,hist_transac_monthlag_nunique,hist_transac_monthlag_mean,hist_transac_monthlag_std,hist_transac_monthlag_min,hist_transac_monthlag_skew,hist_transac_month_lag=0_count,hist_transac_month_lag=0_mean,hist_transac_month_lag=-1_count,hist_transac_month_lag=-1_mean,hist_transac_month_lag=-2_count,hist_transac_month_lag=-2_mean,hist_transac_monthlag_0_-1_ratio,hist_transac_monthlag_0_-2_ratio,hist_transac_last_3_mon_count,hist_transac_last_3_mon_ratio
0,C_ID_00007093c1,149,4,3,18,13,29,28,121,0.187919,0.391965,28,0.187919,0,0.0,120,0.805369,0,0.0,1,0.006711,0,0.0,125,0.838926,24,0.161074,0,0,1.288591,192.0,6.0,1.0,0.7649,3.312481,5.457629,19.0,13,114,0.765101,...,8.637018,86049.0,33.0,23.0,20.0,0.0,0.0,2018-02-27 05:14:57,2017-02-14 14:00:43,377,0.39418,10.295302,0.457717,10,11,1570.681745,13.741052,1990.672301,18.004936,120.0,1.090909,15000.0,136.363636,3.89813,4.016048,13,-5.852349,3.453114,-12,0.135031,12,0.080537,10,0.067114,12,0.080537,1.090909,0.923077,34,0.226667
1,C_ID_0001238066,123,18,6,29,17,65,2,121,0.01626,0.126992,103,0.837398,0,0.0,0,0.0,0,0.0,20,0.162602,3,0.02439,88,0.715447,32,0.260163,0,0,1.634146,201.0,10.0,0.0,1.450089,3.220606,12.992179,37.0,6,120,0.97561,...,7.364404,85624.0,8.0,23.0,0.0,0.0,0.0,2018-02-27 16:18:59,2017-09-28 22:25:14,151,0.809211,10.162602,0.370511,10,11,1063.659187,9.456149,1267.672061,11.503708,80.0,0.727273,10082.2,91.656364,4.019542,4.082893,6,-1.813008,1.28898,-5,-0.239327,24,0.195122,26,0.211382,37,0.300813,0.888889,0.631579,87,0.701613
2,C_ID_0001506ef0,66,3,2,19,12,28,0,66,0.0,0.0,2,0.030303,0,0.0,64,0.969697,0,0.0,0,0.0,65,0.984848,1,0.015152,0,0.0,0,0,0.015152,1.0,1.0,0.0,0.123091,8.124038,4.564555,18.0,10,62,0.939394,...,8.795295,86273.0,50.0,23.0,25.0,0.0,0.0,2018-02-17 12:33:56,2017-01-14 16:16:01,398,0.165414,10.227273,0.422282,10,11,1512.507424,13.235398,3208.373908,28.120284,46.97,0.355833,14910.0,135.545455,2.664332,2.728336,13,-4.833333,4.2375,-13,-0.723706,6,0.090909,7,0.106061,18,0.272727,0.75,0.315789,31,0.462687
3,C_ID_0001793786,216,10,4,48,24,119,2,214,0.009259,0.096001,125,0.578704,76,0.351852,15,0.069444,0,0.0,0,0.0,211,0.976852,5,0.023148,0,0.0,0,0,0.023148,5.0,1.0,0.0,0.150723,6.386653,15.415847,38.0,10,189,0.875,...,7.692843,86311.0,25.0,23.0,24.0,0.0,0.0,2017-10-31 20:20:18,2017-01-21 10:15:21,283,0.760563,14.175926,0.381642,14,15,5438.681019,25.304234,7858.525843,36.83051,14.0,0.066667,49389.2,235.186667,2.953311,3.046109,10,-3.328704,2.306373,-9,-0.410039,20,0.092593,38,0.175926,34,0.157407,0.512821,0.571429,92,0.423963
4,C_ID_000183fdda,144,9,7,36,21,73,4,140,0.027778,0.164909,11,0.076389,1,0.006944,131,0.909722,0,0.0,1,0.006944,4,0.027778,103,0.715278,37,0.256944,0,0,1.861111,268.0,10.0,0.0,2.08763,2.801324,12.368873,28.0,7,137,0.951389,...,7.533026,85815.0,14.0,23.0,4.0,0.0,0.0,2018-02-25 20:57:08,2017-08-07 09:49:14,202,0.70936,10.159722,0.367627,10,11,1806.194931,16.138059,3522.536676,31.561706,66.0,0.5,23370.0,212.454545,3.821535,3.862588,7,-2.451389,1.895264,-6,-0.235599,28,0.194444,28,0.194444,26,0.180556,0.965517,1.037037,82,0.565517


# **4. Save data**

In [None]:
# Persist the feature frame as CSV without the row index.
# NOTE(review): the destination is an absolute Google Drive path (Colab mount);
# consider moving it to a configurable data directory.
hist_feats_csv_path = '/content/drive/MyDrive/Colab Notebooks/viettel/data/hist_transac.csv'
hist_feats.to_csv(path_or_buf=hist_feats_csv_path, index=False)