In [1]:
# Import Necessary Libraries 
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [6]:
# Load the merged dataset and preview it
# Read the CSV (path is relative to the notebook's working directory)
data = pd.read_csv('data/final_merged_data.csv')

# Show the first five rows
data.head(n=5)

Unnamed: 0,User,Card,Month,Day,Time,Use Chip,Merchant City,MCC,Errors?,Is Fraud?,...,Num Credit Cards,Age Group,CARD INDEX,Card Brand,Card Type,Has Chip,Cards Issued,Credit Limit,Year PIN last Changed,Card on Dark Web
0,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,...,5,46-60,0,Visa,Debit,YES,2,$24295,2008,No
1,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,...,5,46-60,1,Visa,Debit,YES,2,$21968,2014,No
2,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,...,5,46-60,2,Visa,Debit,YES,2,$46414,2004,No
3,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,...,5,46-60,3,Visa,Credit,NO,1,$12400,2012,No
4,0,0,1,5,06:02,Chip Transaction,Princeton,5411,Insufficient Balance,No,...,5,46-60,4,Mastercard,Debit (Prepaid),YES,1,$28,2009,No


In [None]:
# Preview the first five rows of the loaded frame
data.head(n=5)

In [None]:
# Print the column names, dtypes and non-null counts of the loaded frame
data.info()

In [None]:
# Summary statistics of the loaded frame (pandas' default column selection)
data.describe()

In [None]:
# Count the missing values in each column
data.isna().sum()

In [None]:
# (rows, columns) of the raw frame, before any rows are dropped
data.shape

In [None]:
# Keep only the rows that have no missing value in any column;
# `data` itself is left untouched.
data_dropna = data.dropna(axis=0, how='any')

In [None]:
# (rows, columns) after dropping incomplete rows — compare with data.shape
data_dropna.shape

In [None]:
# Name of the label column the models predict
target = 'Is Fraud?'

# Input columns fed to the models, in the order they are selected
features = [
    'User',
    'Card',
    'Month',
    'Day',
    'Time',
    'Use Chip',
    'Merchant City',
    'MCC',
    'Errors?',
    'City',
    'Zipcode',
    'Per Capita Income - Zipcode',
    'Yearly Income - Person',
    'Total Debt',
    'FICO Score',
    'Num Credit Cards',
    'Age Group',
    'CARD INDEX',
    'Card Brand',
    'Card Type',
    'Has Chip',
    'Cards Issued',
    'Credit Limit',
    'Year PIN last Changed',
    'Card on Dark Web',
]

In [None]:
# Independent copies of the model inputs and the label, taken from the
# NaN-free frame so later edits to X do not touch data_dropna.
X = data_dropna.loc[:, features].copy()
y = data_dropna.loc[:, target].copy()

In [None]:
# Strip '$' and ',' from every currency-formatted feature and store it as float.
# 'Credit Limit' shows up as e.g. '$24295' in the data preview and is later scaled
# as a numeric feature, so it needs the same cleaning as the two income columns.
# NOTE(review): 'Total Debt' is assumed to use the same '$' formatting — confirm;
# if it is already numeric the replace below is a harmless no-op.
currency_cols = [
    'Per Capita Income - Zipcode',
    'Yearly Income - Person',
    'Total Debt',
    'Credit Limit',
]
for col in currency_cols:
    # Raw-string pattern avoids the invalid '\$' escape warning. Plain column
    # assignment replaces the column, so it takes the float dtype. The cell is
    # idempotent: re-running it on already-converted columns changes nothing.
    X[col] = X[col].replace(r'[\$,]', '', regex=True).astype(float)


In [None]:
# Columns passed to the scaler in the preprocessing step
numerical_cols = [
    'MCC',
    'Zipcode',
    'Per Capita Income - Zipcode',
    'Yearly Income - Person',
    'Total Debt',
    'FICO Score',
    'Num Credit Cards',
    'CARD INDEX',
    'Cards Issued',
    'Credit Limit',
    'Year PIN last Changed',
]

# Columns passed to the one-hot encoder in the preprocessing step
categorical_cols = [
    'User',
    'Card',
    'Month',
    'Day',
    'Time',
    'Use Chip',
    'Merchant City',
    'Errors?',
    'City',
    'Age Group',
    'Card Brand',
    'Card Type',
    'Has Chip',
    'Card on Dark Web',
]

In [None]:
# Preprocessing: standardise the numeric columns and one-hot encode the
# categorical ones. Categories unseen during fit are ignored at transform
# time instead of raising.
numeric_step = ('num', StandardScaler(), numerical_cols)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Define the model - option 1: logistic regression on the preprocessed features.
# LogisticRegression comes from sklearn.linear_model and must be imported in the
# imports cell at the top of the notebook, otherwise this cell raises NameError.
# Each "option" cell rebinds `model`; the last one executed is the one trained.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=1000, random_state=42)),
])

In [None]:
# Define the model - option 2: random forest on the preprocessed features.
# max_features='sqrt' replaces the former 'auto', which meant sqrt(n_features)
# for classifiers and was removed in scikit-learn 1.3 (fit raises on 'auto').
# Each "option" cell rebinds `model`; the last one executed is the one trained.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=100,
        max_features='sqrt',
        max_depth=10,
        criterion='gini',
        random_state=42,
    )),
])

In [None]:
# Define the model - option 3: XGBoost classifier on the preprocessed features.
# NOTE(review): `xgb` is not imported in any cell visible here, so this cell raises
# NameError on a fresh kernel — add `import xgboost as xgb` to the imports cell
# if the package is available in the environment.
# NOTE(review): recent XGBoost releases presumably require integer-encoded class
# labels, while the target appears to hold strings such as 'No' — confirm before use.
# Each "option" cell rebinds `model`; the last one executed is the one trained.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, random_state=42))
])

In [None]:
# Split the data into training (70%) and testing (30%) sets.
# stratify=y keeps the class proportions of the target the same in both parts;
# fraud labels are typically rare, and an unstratified split can leave the test
# set with too few positive cases to evaluate on.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Train whichever pipeline is currently bound to `model`
# (preprocessing is fitted on the training part only)
model.fit(X_train, y_train)

# Predict labels for the held-out rows
y_pred = model.predict(X_test)

# Model evaluation: confusion matrix, per-class precision/recall/F1, and accuracy.
# NOTE(review): with an imbalanced target, accuracy alone can look high even when
# few frauds are caught — read it together with the per-class report.
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')