# Load Data

In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
import seaborn as sns
import os
import pathlib

from inflection import underscore

# Apply seaborn's default plot styling, then raise the pandas display limit
# so frames with up to 500 columns are shown without truncation.
sns.set()
pd.options.display.max_columns = 500

  import pandas.util.testing as tm


In [2]:
# All locations are derived from the parent of the current working directory
# (presumably the project root when the notebook is run from its own folder —
# confirm against the repository layout).
data_dir = pathlib.Path('.').resolve().parent.joinpath('data')

# One sub-directory per data stage, all directly under data_dir.
data_raw, data_interim, data_external, data_processed = (
    data_dir / stage for stage in ('raw', 'interim', 'external', 'processed')
)

# 'models' sits beside 'data' under the same parent directory.
model_dir = data_dir.parent / 'models'

## Load Data

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every machine and every re-run.
sorted(os.listdir(data_raw))

['.gitkeep',
 'sample_submission.csv',
 'test_identity.csv',
 'test_transaction.csv',
 'train_identity.csv',
 'train_transaction.csv']

In [4]:
# Read the four raw CSV files in a fixed order (train transaction, train identity,
# test transaction, test identity). index_col=0 makes the first column of each
# file the frame's index.
trans, identity, test_trans, test_identity = (
    pd.read_csv(data_raw / csv_name, index_col=0)
    for csv_name in (
        'train_transaction.csv',
        'train_identity.csv',
        'test_transaction.csv',
        'test_identity.csv',
    )
)

In [5]:
# Convert the index label and every column label of all four frames to snake_case.
# The frames are modified in place, so the existing variable names keep referring
# to the renamed data.
for df in (trans, identity, test_trans, test_identity):
    df.index.name = underscore(df.index.name)
    df.columns = list(map(underscore, df.columns))

### Save transaction and identity files separately

In [7]:
# Make sure the interim directory exists before writing; without this the first
# to_parquet call fails if the directory is missing (e.g. on a fresh checkout).
data_interim.mkdir(parents=True, exist_ok=True)

# Persist each renamed frame as its own parquet file, keeping transaction and
# identity data separate.
trans.to_parquet(data_interim / 'train_transaction.parquet')
identity.to_parquet(data_interim / 'train_identity.parquet')

test_trans.to_parquet(data_interim / 'test_transaction.parquet')
test_identity.to_parquet(data_interim / 'test_identity.parquet')

### Merge transaction and identity files and then save
#### TODO: Go back and verify that a left join on the index is the appropriate way to combine the transaction and identity dataframes

In [8]:
# Left-join identity onto the transactions by index, so every transaction row is kept
# whether or not an identity record exists for it.
# validate='one_to_one' makes pandas raise a MergeError if either index contains
# duplicate keys, so the join can never silently multiply transaction rows.
train = trans.merge(
    identity,
    how='left',
    left_index=True,
    right_index=True,
    indicator='identity_match',
    validate='one_to_one',
)
# The indicator column reports where each row's key was found; collapse it to a
# 0/1 flag that is 1 only when the key was present in both frames.
train['identity_match'] = (train['identity_match'] == 'both').astype(int)

In [9]:
# Left-join test identity onto the test transactions by index, so every transaction
# row is kept whether or not an identity record exists for it.
# validate='one_to_one' makes pandas raise a MergeError if either index contains
# duplicate keys, so the join can never silently multiply transaction rows.
test = test_trans.merge(
    test_identity,
    how='left',
    left_index=True,
    right_index=True,
    indicator='identity_match',
    validate='one_to_one',
)
# The indicator column reports where each row's key was found; collapse it to a
# 0/1 flag that is 1 only when the key was present in both frames.
test['identity_match'] = (test['identity_match'] == 'both').astype(int)

In [14]:
# Write the merged train and test frames to the interim directory as parquet.
train.to_parquet(data_interim.joinpath('train.parquet'))
test.to_parquet(data_interim.joinpath('test.parquet'))