In [2]:
import sys
from pathlib import Path

# Make the project's `src/` directory importable from this notebook.
# The project root is taken to be the parent of the current working directory,
# so adjust `project_root` if the notebook is launched from somewhere else.
project_root = Path().resolve().parent
src_path = project_root.joinpath('src')

# Register the directory only once so re-running the cell does not add duplicates.
already_registered = str(src_path) in sys.path
if not already_registered:
    sys.path.append(str(src_path))

In [3]:
import pandas as pd
import numpy as np
from data_loader import DataLoader
import eda
import seaborn as sns
import matplotlib.pyplot as plt

In [34]:
# Wrap each of the four CSV files (identity/transaction x train/test) in a DataLoader.
train_id = DataLoader("../data/train_identity.csv")
train_trans = DataLoader("../data/train_transaction.csv")
test_id = DataLoader("../data/test_identity.csv")
test_trans = DataLoader("../data/test_transaction.csv")

# Report (rows, columns) for every frame: train identity, train transaction,
# test identity, test transaction - one shape per line.
print(
    train_id.dataset.shape,
    train_trans.dataset.shape,
    test_id.dataset.shape,
    test_trans.dataset.shape,
    sep="\n",
)

(144233, 41)
(590540, 394)
(141907, 41)
(506691, 393)


## Feature removal/selection

In [36]:
# Pick out the columns pandas stores with a numeric dtype in each training frame
# and print how many there are (identity first, then transaction).
num_cols_id = train_id.dataset.select_dtypes(include=['number']).columns
num_cols_trans = train_trans.dataset.select_dtypes(include=['number']).columns
print(len(num_cols_id), len(num_cols_trans), sep="\n")

# Plan for the cells below: missing categorical values get the <UNKWN> token,
# missing numerical values get an integer sentinel (-999).

24
380


In [None]:
# Separate categorical & numerical columns
def get_sequential_features(starting_letter, num_start, num_end):
    """Build consecutively numbered column names sharing one prefix.

    Args:
        starting_letter: Prefix placed in front of every number (e.g. 'card').
        num_start: First number in the sequence (inclusive).
        num_end: Last number in the sequence (inclusive).

    Returns:
        A list such as ['card1', 'card2', ...]; empty when num_end < num_start.
    """
    names = []
    for number in range(num_start, num_end + 1):
        names.append("{}{}".format(starting_letter, number))
    return names

# Columns treated as categorical in the transaction table.
trans_categorical_cols = (
    ['ProductCD']
    + get_sequential_features('card', 1, 6)
    + get_sequential_features('addr', 1, 2)
    + ['P_emaildomain', 'R_emaildomain']
    + get_sequential_features('M', 1, 9)
)
# Columns treated as categorical in the identity table.
id_categorical_cols = ['DeviceType', 'DeviceInfo'] + get_sequential_features('id_', 12, 38)
print(f"Total # of categorical variables in transaction data: {len(trans_categorical_cols)}")
print(f"Total # of categorical variables in id data: {len(id_categorical_cols)}")

# Every remaining column is treated as numerical; original column order is kept.
# NOTE(review): this also sweeps in any identifier/target columns present in the
# frames - confirm that is intended before using these lists for modelling.
trans_num_cols = [col for col in train_trans.dataset.columns if col not in trans_categorical_cols]
id_num_cols = [col for col in train_id.dataset.columns if col not in id_categorical_cols]
print(f"Total # of numerical variables in transaction data: {len(trans_num_cols)}")
print(f"Total # of numerical variables in id data: {len(id_num_cols)}")

# Sanity check: categorical + numerical counts should add up to each frame's column count.
print(f"Total # of columns in transaction data: {len(trans_categorical_cols) + len(trans_num_cols)}")
print(f"Total # of columns in id data: {len(id_categorical_cols) + len(id_num_cols)}")

Total # of categorical variables in transaction data: 20
Total # of categorical variables in id data: 29
Total # of numerical variables in transaction data: 374
Total # of numerical variables in id data: 12
Total # of columns in transaction data: 394
Total # of columns in id data: 41


In [39]:
# Impute -999 for every missing value in the numerical columns of the training frames.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame[c]`: that form mutates an intermediate
# Series (chained assignment), which pandas 2.x warns about and which leaves the
# frame untouched once Copy-on-Write is active.
# Only the training frames are handled in this cell.
for c in id_num_cols:
    train_id.dataset[c] = train_id.dataset[c].fillna(-999)

for c in trans_num_cols:
    train_trans.dataset[c] = train_trans.dataset[c].fillna(-999)

In [40]:
# Impute the <UNKWN> token for every missing value in the categorical columns
# of the training frames.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on `frame[c]`: that form mutates an intermediate
# Series (chained assignment), which pandas 2.x warns about and which leaves the
# frame untouched once Copy-on-Write is active.
# NOTE(review): any of these columns that are stored as numbers will hold a mix
# of numbers and the '<UNKWN>' string after the fill - confirm downstream
# encoders expect that.
for c in id_categorical_cols:
    train_id.dataset[c] = train_id.dataset[c].fillna('<UNKWN>')

for c in trans_categorical_cols:
    train_trans.dataset[c] = train_trans.dataset[c].fillna('<UNKWN>')

In [44]:
# Drop a hand-picked list of features (described by the author as > 90% missing).
# The lists are hard-coded rather than computed from the data.
features_to_rem_id = ['id_07', 'id_08']
features_to_rem_id += [f'id_{i}' for i in range(21, 28)]  # id_21 .. id_27

features_to_rem_trans = ['dist1', 'D11']
features_to_rem_trans += [f'M{i}' for i in range(1, 10)]  # M1 .. M9
features_to_rem_trans += [f'V{i}' for i in range(1, 12)]  # V1 .. V11

# `drop` returns new frames; the loaders' own datasets are left as they are.
train_id_v2 = train_id.dataset.drop(columns=features_to_rem_id)
train_trans_v2 = train_trans.dataset.drop(columns=features_to_rem_trans)

print(train_id_v2.shape, train_trans_v2.shape, sep="\n")

(144233, 32)
(590540, 372)


In [31]:
# Categorical Features (transaction data):
# ProductCD
# card1 - card6
# addr1, addr2
# P_emaildomain
# R_emaildomain
# M1 - M9

# Categorical Features (identity data):
# DeviceType
# DeviceInfo
# id_12 - id_38