# Making Features and a Validation Set

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from pathlib import Path

%matplotlib inline

In [2]:
from elo_helpers import *

In [3]:
# Load the labelled and unlabelled card tables. `train_dtypes` / `test_dtypes`
# are not defined in this notebook -- presumably they come from the
# `elo_helpers` star import.
train = pd.read_csv('data/train.csv', dtype=train_dtypes)
# Fill gaps only in `first_active_month`. A frame-wide `.fillna('2017-03')`
# would also write the date string into any other column containing NaN.
# NOTE(review): assumes test.csv has the same `first_active_month` column as
# train.csv -- confirm.
test = (
    pd.read_csv('data/test.csv', dtype=test_dtypes)
      .fillna({'first_active_month': '2017-03'})
)

In [4]:
# Merchant table plus the two transaction logs, each read with the dtype
# mapping paired with it below (both transaction files share `trans_dtypes`).
raw_sources = (
    ('data/merchants.csv', merch_dtypes),
    ('data/historical_transactions.csv', trans_dtypes),
    ('data/new_merchant_transactions.csv', trans_dtypes),
)
merch, hist_trans, new_trans = (
    pd.read_csv(csv_path, dtype=dtype_spec) for csv_path, dtype_spec in raw_sources
)

In [5]:
# Cast the four merchant descriptor columns to pandas' `category` dtype in a
# single astype call driven by a column -> dtype mapping.
merch = merch.astype(dict.fromkeys(
    ['category_1', 'category_4', 'most_recent_purchases_range', 'most_recent_sales_range'],
    'category',
))

### Validation Set

A random split looks sufficient for the validation set, but more feature engineering is needed before it can be built. When building it, drop `card_id` and `target` from the feature matrix `X`. Planned split parameters: `test_size=0.35` and `random_state=11`.

### Feature Engineering

In [6]:
# Preview the first rows of the raw training table before engineering features.
train.head()

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target
0,2017-06,C_ID_92a2005557,5,2,1,-0.820283
1,2017-01,C_ID_3d0044924f,4,1,0,0.392913
2,2016-08,C_ID_d639edf6cd,2,2,0,0.688056
3,2017-09,C_ID_186d6a6901,4,3,0,0.142495
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.159749


In [7]:
# One-hot encode the three `feature_*` columns; every other column of `train`
# is carried over into `train_v2` as-is.
card_feature_cols = ['feature_1', 'feature_2', 'feature_3']
train_v2 = pd.get_dummies(data=train, columns=card_feature_cols)

In [14]:
%%timeit
train_v2.first_active_month.str.split('-')[0]
train_v2.first_active_month.str.split('-')[1]

234 ms ± 5.78 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [15]:
%%timeit
# Benchmark: same extraction via datetime parsing. The column is deliberately
# parsed once per statement so the timing covers two full parses, mirroring
# the two separate splits in the string-based benchmark.
pd.to_datetime(train_v2.first_active_month).dt.month
pd.to_datetime(train_v2.first_active_month).dt.year

96.2 ms ± 1.25 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [16]:
# turn these into categories?
# Parse once, and from the untouched `train` frame (train_v2 was built from it
# with get_dummies, so the rows line up). Reading `first_active_month` back
# from train_v2 would make this cell non-idempotent: a second run would feed
# the already-extracted integer months into pd.to_datetime.
first_active = pd.to_datetime(train.first_active_month)
train_v2['first_active_year'] = first_active.dt.year
train_v2['first_active_month'] = first_active.dt.month

In [19]:
# `column_match` is not defined in this notebook (presumably from elo_helpers).
# Judging by the output, it lists the column names the two frames share --
# here only `card_id`, the candidate join key.
column_match(train, hist_trans)

array(['card_id'], dtype=object)

In [18]:
# Same check between historical transactions and the merchant table; the
# output lists several overlapping columns, including `merchant_id`.
column_match(hist_trans, merch)

array(['city_id', 'category_1', 'merchant_category_id', 'merchant_id',
       'category_2', 'state_id', 'subsector_id'], dtype=object)

Tackle `hist_trans` first.

In [22]:
# Show the first three transactions transposed, so each column of the wide
# frame reads as a row.
hist_trans.head(3).transpose()

Unnamed: 0,0,1,2
authorized_flag,Y,Y,Y
card_id,C_ID_4e6213e9bc,C_ID_4e6213e9bc,C_ID_4e6213e9bc
city_id,88,88,88
category_1,N,N,N
installments,0,0,0
category_3,A,A,A
merchant_category_id,80,367,80
merchant_id,M_ID_e020e9b302,M_ID_86ec983688,M_ID_979ed661fc
month_lag,-8,-7,-6
purchase_amount,-0.703331,-0.733128,-0.720386
