# Load Data

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
import os
import pathlib

from inflection import underscore

from trav_lib.data_prep import reduce_memory

# Apply seaborn's default styling to all matplotlib figures in this notebook.
sns.set()
# Wide frames: allow pandas to render up to 500 columns before truncating.
pd.options.display.max_columns = 500

In [2]:
# All locations are derived from the parent of the current working directory,
# so the notebook must be launched from a direct sub-folder of the project.
project_root = pathlib.Path().resolve().parent

# Data folders, one per processing stage.
data_dir = project_root / 'data'
data_raw, data_interim, data_external, data_processed = (
    data_dir / stage for stage in ('raw', 'interim', 'external', 'processed')
)

# Folder for model artefacts, a sibling of the data folder.
model_dir = project_root / 'models'

## Load raw data files

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the listing is
# the same on every platform and on every re-run of the notebook.
sorted(os.listdir(data_raw))

['.gitkeep',
 'sample_submission.csv',
 'test_identity.csv',
 'test_transaction.csv',
 'train_identity.csv',
 'train_transaction.csv']

In [4]:
# Read the raw CSVs, using the first column of each file as the index.
# Training split: transaction table, then identity table.
trans, identity = (
    pd.read_csv(data_raw / csv_name, index_col=0)
    for csv_name in ('train_transaction.csv', 'train_identity.csv')
)

# Test split: same two tables, same order.
test_trans, test_identity = (
    pd.read_csv(data_raw / csv_name, index_col=0)
    for csv_name in ('test_transaction.csv', 'test_identity.csv')
)

In [5]:
# Normalise every column label and the index name to snake_case.
# The frames are modified in place, so the four variables keep their identity.
for frame in (trans, identity, test_trans, test_identity):
    frame.columns = list(map(underscore, frame.columns))
    frame.index.name = underscore(frame.index.name)

### Merge transaction and identity files

In [6]:
# Left-join the identity table onto the transactions by index so that every
# transaction row is kept, whether or not it has identity information.
# validate='one_to_one' makes pandas raise MergeError if either index contains
# duplicate keys; without it, duplicates would silently multiply transaction rows.
train = trans.merge(
    identity,
    how='left',
    left_index=True,
    right_index=True,
    validate='one_to_one',
    indicator='identity_match',
)
# The merge indicator is 'both' only where an identity row was found;
# store that as a 0/1 flag instead of the three-level categorical.
train['identity_match'] = (train['identity_match'] == 'both').astype(int)

In [7]:
# Left-join the test identity table onto the test transactions by index so
# that every transaction row is kept, with or without identity information.
# validate='one_to_one' makes pandas raise MergeError if either index contains
# duplicate keys; without it, duplicates would silently multiply transaction rows.
test = test_trans.merge(
    test_identity,
    how='left',
    left_index=True,
    right_index=True,
    validate='one_to_one',
    indicator='identity_match',
)
# The merge indicator is 'both' only where an identity row was found;
# store that as a 0/1 flag instead of the three-level categorical.
test['identity_match'] = (test['identity_match'] == 'both').astype(int)

### Downcast numeric columns and convert categorical columns

In [8]:
# Label column and timestamp column; neither is treated as a feature group below.
target_col = 'is_fraud'
date_col = 'transaction_dt'

# Transaction-table columns handled as categorical.
trans_cat_cols = [
    'product_cd',
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'addr1', 'addr2',
    'p_emaildomain', 'r_emaildomain',
    'm1', 'm2', 'm3', 'm4', 'm5', 'm6', 'm7', 'm8', 'm9',
]

# Numeric transaction columns: everything up to and including 'v339' (label
# slices are inclusive) minus the target, the date and the categorical columns.
trans_num_cols = (
    train.loc[:, :'v339']
    .columns
    .drop([target_col, date_col, *trans_cat_cols])
    .tolist()
)

# Identity-side columns are split purely by position: 'id_01'..'id_11' are
# numeric, 'id_12' through the trailing 'identity_match' flag are categorical.
ident_num_cols = train.loc[:, 'id_01':'id_11'].columns.tolist()
ident_cat_cols = train.loc[:, 'id_12':'identity_match'].columns.tolist()

# Combined lists, transaction columns first.
cat_cols = [*trans_cat_cols, *ident_cat_cols]
num_cols = [*trans_num_cols, *ident_num_cols]

In [11]:
# Pass each merged frame, together with the combined categorical column list,
# through the project helper and rebind the name to its return value.
# Train is rebound before test is processed, so the original train frame is
# no longer referenced by this name while test is being handled.
# NOTE(review): presumably the helper downcasts numeric dtypes and casts
# cat_cols to 'category' — confirm in trav_lib.data_prep.
train = reduce_memory(train, cat_cols)
test = reduce_memory(test, cat_cols)

before: 2729.2 MB
after: 967.6 MB
decreased by: 64.55 %
before: 2358.2 MB
after: 832.9 MB
decreased by: 64.68 %


### Save as parquet files

In [12]:
# Persist both frames into the interim data folder, train first, then test.
interim_outputs = {'train.parquet': train, 'test.parquet': test}
for filename, frame in interim_outputs.items():
    frame.to_parquet(data_interim / filename)