# Preprocessing & Training Data

In [1]:
# mount Google Drive inside the Colab runtime so the data files can be read
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
# import necessary packages
import pandas as pd
import numpy as np


# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# models
from sklearn.linear_model import LogisticRegression

In [3]:
# create path variables
# Both CSVs live in the same folder on the mounted Drive; the folder is kept in
# one constant so it only has to be edited once if the data moves.
DATA_DIR = '/content/drive/MyDrive/Springboard/capstones/Capstone Project/data'
path_test = f'{DATA_DIR}/merged_test_df.csv'
path_train = f'{DATA_DIR}/merged_train_df.csv'

In [4]:
# load the merged test and train CSVs into data frames (test first, then train)
df_test, df_train = (pd.read_csv(csv_path) for csv_path in (path_test, path_train))

In [5]:
# preview the first five rows of the raw training frame
df_train.head(n=5)

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,86400,68.5,W,13926,-1.0,150.0,discover,142.0,credit,...,0.0,0.0,0.0,0.0,0.0,117.0,0.0,0.0,0.0,0.0
1,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,...,135.0,0.0,0.0,0.0,50.0,1404.0,790.0,0.0,0.0,0.0
4,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# summarize the training frame: index range, column count, dtype mix and memory usage
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 590540 entries, 0 to 590539
Columns: 225 entries, isFraud to V321
dtypes: float64(209), int64(3), object(13)
memory usage: 1013.7+ MB


In [7]:
# keep only the object-dtype columns; these are the ones to be one-hot encoded
cat_feats = df_train.select_dtypes(include='object')
# their names as a plain Python list
cat_feat_list = cat_feats.columns.tolist()

In [8]:
# one-hot encode the categorical columns with get_dummies
# This cell overwrites df_train, so on a second run the columns in
# cat_feat_list are already gone and get_dummies would raise a KeyError.
# Encoding only the columns that are still present makes a re-run a no-op.
cols_to_encode = [col for col in cat_feat_list if col in df_train.columns]
if cols_to_encode:
    # Pin the dummy dtype: pandas < 2.0 defaults to uint8 and pandas >= 2.0 to
    # bool, so without it the 0/1 columns depend on the installed version.
    df_train = pd.get_dummies(df_train, columns=cols_to_encode, dtype=np.uint8)

In [9]:
# inspect the first five rows to confirm the one-hot encoded columns were added
df_train.head(n=5)

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,dist1,...,M6_T,M7_-1,M7_F,M7_T,M8_-1,M8_F,M8_T,M9_-1,M9_F,M9_T
0,0,86400,68.5,13926,-1.0,150.0,142.0,315.0,87.0,19.0,...,1,1,0,0,1,0,0,1,0,0
1,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,-1.0,...,1,1,0,0,1,0,0,1,0,0
2,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,287.0,...,0,0,1,0,0,1,0,0,1,0
3,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,-1.0,...,0,1,0,0,1,0,0,1,0,0
4,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,-1.0,...,0,1,0,0,1,0,0,1,0,0


In [10]:
# split the data frame into features (X) and target (y)
# The target is selected and dropped by name rather than by position, so the
# split does not rely on 'isFraud' being the first column; with positional
# slicing, any change in column order would silently leak the target into X.
y = df_train['isFraud']
X = df_train.drop(columns=['isFraud'])

In [11]:
# hold out 30% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [12]:
# standardize
# fit the scaler on the training split only, then apply that same fitted
# scaler to both the training and the hold-out split
scaler = StandardScaler()
scaler.fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [15]:
# candidate numbers of principal components to evaluate: 1 through 10
n_components_range = list(range(1, 11))

In [24]:
# cross validation for n components
# The search is expensive on the full training split, so it is opt-in: set the
# flag to True to re-run it. It was previously left as commented-out code; the
# recorded "Best n_components: 6" output comes from that earlier version, which
# ranked candidates by default accuracy with PCA fitted before cross-validation.
# A re-run with the corrected procedure below may select a different value.
RUN_N_COMPONENTS_SEARCH = False

if RUN_N_COMPONENTS_SEARCH:
    best_score = -np.inf  # lower than any achievable score
    best_n_components = None

    for candidate_k in n_components_range:
        # PCA sits inside the pipeline so it is re-fitted on the training part
        # of every fold. Fitting it once on all of X_train_scaled and then
        # cross-validating lets each validation fold influence the projection.
        candidate_model = Pipeline([
            ('pca', PCA(n_components=candidate_k, random_state=42)),
            ('clf', LogisticRegression()),
        ])

        # 5-fold CV scored with ROC AUC: the target is heavily imbalanced, so
        # the default accuracy score barely separates the candidates.
        scores = cross_val_score(
            candidate_model, X_train_scaled, y_train, cv=5, scoring='roc_auc'
        )
        mean_score = scores.mean()

        # keep the candidate with the highest mean score (first one wins ties)
        if mean_score > best_score:
            best_score = mean_score
            best_n_components = candidate_k

    print("Best n_components:", best_n_components)


Best n_components: 6


In [25]:
# apply PCA to reduce dimensionality
# 6 is the value reported by the n_components search ("Best n_components: 6").
n_components = 6
# random_state is fixed so the projection is reproducible when the randomized
# (or arpack) SVD solver is selected; it matches the seed used for the
# train/test split and for SMOTE.
pca = PCA(n_components=n_components, random_state=42)
# fit on the training split only, then project the hold-out split with the
# same fitted components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

SMOTE is applied only to the training split, after the train/test split, so the synthetic examples it generates never end up in the test set. This keeps the evaluation honest: the model cannot "cheat" by being scored on synthetic points derived from the same training examples it learned from.

In [27]:
# class imbalance: oversample the PCA-reduced training split with SMOTE (fixed seed)
smote = SMOTE(random_state=42)
resampled_pair = smote.fit_resample(X_train_pca, y_train)
X_train_resampled, y_train_resampled = resampled_pair

In [30]:
# report the shapes of the resampled features and labels
for label, resampled in (
    ("X_resampled shape:", X_train_resampled),
    ("y_resampled shape:", y_train_resampled),
):
    print(label, resampled.shape)

X_resampled shape: (797828, 6)
y_resampled shape: (797828,)
