In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Data Collection

In [None]:
# Load the transactions table from a CSV file located in the working directory.
df = pd.read_csv('transactions.csv')

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Print the column dtypes and non-null counts of the frame.
df.info()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Remove the `transaction_time` column, which none of the later visible cells use.
# The frame is reassigned instead of mutated with `inplace=True`, and
# `errors='ignore'` makes the cell safe to re-run: dropping a column that has
# already been removed would otherwise raise a KeyError.
df = df.drop(columns=['transaction_time'], errors='ignore')

In [None]:
# Exploratory Data Analysis (EDA)

In [None]:
# Features plotted as count plots in the EDA section.
categorical_columns = [
    'country',
    'bin_country',
    'channel',
    'merchant_category',
    'promo_used',
    'avs_match',
    'cvv_result',
    'three_ds_flag',
]

# Features plotted as histograms and box plots in the EDA section.
numerical_columns = [
    'account_age_days',
    'total_transactions_user',
]

In [None]:
# Apply seaborn's 'whitegrid' style to every figure drawn after this cell.
sns.set(style='whitegrid')

In [None]:
# One count plot per categorical feature, placed on a 4x2 grid of subplots.
fig = plt.figure(figsize=(12, 14))

for position, column in enumerate(categorical_columns, start=1):
    ax = fig.add_subplot(4, 2, position)
    sns.countplot(x=df[column], ax=ax)
    ax.set_title(f'Countplot of {column}')

fig.tight_layout()
plt.show()

In [None]:
# Histogram with a KDE overlay for each numerical feature, side by side.
fig = plt.figure(figsize=(15, 8))

for position, column in enumerate(numerical_columns, start=1):
    ax = fig.add_subplot(1, 2, position)
    sns.histplot(df[column], kde=True, ax=ax)
    ax.set_title(f'Distribution of {column}')

fig.tight_layout()
plt.show()

In [None]:
# Box plot for each numerical feature, side by side.
fig = plt.figure(figsize=(15, 8))

for position, column in enumerate(numerical_columns, start=1):
    ax = fig.add_subplot(1, 2, position)
    # NOTE(review): the series is passed positionally; which seaborn parameter it
    # binds to (and hence the box orientation) may depend on the seaborn version.
    sns.boxplot(df[column], ax=ax)
    ax.set_title(f'Distribution of {column}')

fig.tight_layout()
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between the numeric columns.
fig, ax = plt.subplots(figsize=(15, 8))
correlations = df.corr(numeric_only=True)
sns.heatmap(correlations, annot=True, ax=ax)
plt.show()

In [None]:
# Data Preprocessing

In [None]:
# Remove the identifier columns before modelling.
# The frame is reassigned instead of mutated with `inplace=True`, and
# `errors='ignore'` makes the cell safe to re-run: dropping columns that have
# already been removed would otherwise raise a KeyError.
df = df.drop(columns=['transaction_id', 'user_id'], errors='ignore')

In [None]:
# One-hot encode the listed columns; every level keeps its own indicator
# column because drop_first is False.
one_hot_columns = ['country', 'bin_country', 'channel', 'merchant_category']
df = pd.get_dummies(df, columns=one_hot_columns, drop_first=False)

In [None]:
# Cast every column selected by the 'boolean' dtype filter to integers.
boolean_columns = df.select_dtypes('boolean')

df = df.astype({column: int for column in boolean_columns.columns})

In [None]:
# Separate the target label from the feature matrix.
y = df['is_fraud']
X = df.drop(columns='is_fraud')

In [None]:
# Mild oversampling of the fraud class (label 1) to a 10:1 non-fraud-to-fraud ratio

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Target size for class 1: the number of label-0 rows divided by target_ratio,
# truncated to an integer.
target_ratio = 10
majority_count = (y == 0).sum()
desired_minority_count = int(majority_count / target_ratio)

print(desired_minority_count)

In [None]:
from collections import Counter

In [None]:
# Class frequencies of the original target.
class_counts_before = Counter(y)
print(f'Count before over sampling: {class_counts_before}')

In [None]:
# Configure SMOTE to bring class 1 up to `desired_minority_count` rows.
# NOTE(review): the resampling is fitted on the full X / y, before any
# train/test split, and X_resampled / y_resampled are only printed afterwards;
# no later visible cell trains on them. Confirm this is intended.
smote = SMOTE(random_state=42, sampling_strategy={1: desired_minority_count})

resampled = smote.fit_resample(X, y)
X_resampled, y_resampled = resampled

In [None]:
# Class frequencies of the resampled target.
class_counts_after = Counter(y_resampled)
print(f'Count after over sampling: {class_counts_after}')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out the test set first, stratified on the label so that the train and
# test folds keep the same class proportions as the full data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42, stratify=y
)

# Oversample class 1 in the TRAINING fold only. Splitting the raw X / y and
# training on it would silently discard the oversampling, while resampling
# before the split would let synthetic points derived from test rows leak into
# training. The test fold is left untouched so the scores reflect real data.
# max(...) keeps the target at or above the current class-1 count, because
# SMOTE rejects a target smaller than the number of samples already present.
train_minority_target = max(
    int((y_train == 0).sum() / target_ratio),
    int((y_train == 1).sum()),
)
X_train, y_train = SMOTE(
    sampling_strategy={1: train_minority_target},
    random_state=42,
).fit_resample(X_train, y_train)

In [None]:
# Model Selection

In [None]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score, confusion_matrix

In [None]:
# Without class weight parameter

In [None]:
# Baseline comparison: both classifiers with default hyper-parameters and no
# class weighting. A fixed random_state makes the scores reproducible across
# runs and keeps the comparison with the class-weighted models of this
# notebook (which use random_state=42) like for like.
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGB': XGBClassifier(random_state=42)
}

for name, model in models.items():
    print(f'Model: {name}')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
    # Rows are true labels and columns are predicted labels.
    print(f'Confusion Matrix:\n {confusion_matrix(y_test, y_pred)}')
    print('-'*30)

In [None]:
# With class weight parameter

In [None]:
# Same two classifiers, this time with class 1 weighted ten times more than class 0.
weighted_forest = RandomForestClassifier(class_weight={0: 1, 1: 10}, random_state=42)
weighted_xgb = XGBClassifier(scale_pos_weight=10, random_state=42)

models = {'Random Forest': weighted_forest, 'XGB': weighted_xgb}

for name in models:
    model = models[name]
    print(f'Model: {name}')
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    matrix = confusion_matrix(y_test, y_pred)
    print(f'Accuracy: {accuracy}')
    print(f'Confusion Matrix:\n {matrix}')
    print('-' * 30)

In [None]:
# Model Training

In [None]:
# Final XGBoost configuration, grouped by purpose.
params = {
    # Learning task and the evaluation metric.
    'objective': 'binary:logistic',
    'eval_metric': 'aucpr',
    # NOTE(review): this flag is believed to be deprecated in recent XGBoost
    # releases — confirm against the installed version.
    'use_label_encoder': False,
    # Weight applied to the positive (label 1) samples.
    'scale_pos_weight': 10,
    # Number of trees, step size and tree depth.
    'n_estimators': 800,
    'learning_rate': 0.02,
    'max_depth': 3,
    # Row and column subsampling.
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    # L1 / L2 regularisation strengths.
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    # Fixed seed for reproducibility.
    'random_state': 42,
}

model = XGBClassifier(**params)

model.fit(X_train, y_train)

In [None]:
# Model Evaluation

In [None]:
from sklearn.metrics import average_precision_score, classification_report

# Hard predictions at the model's default decision threshold.
y_pred = model.predict(X_test)
# Predicted probability of class 1 (second column), used for the
# threshold-free PR-AUC below. Assumes the labels are 0 / 1.
y_score = model.predict_proba(X_test)[:, 1]

print(f'Accuracy: {accuracy_score(y_test, y_pred)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_pred)}')
# Accuracy alone is dominated by the majority class on an imbalanced target,
# so also report the area under the precision-recall curve (the metric the
# model is configured with via eval_metric='aucpr') and per-class
# precision / recall / F1.
print(f'PR-AUC (average precision): {average_precision_score(y_test, y_score)}')
print(f'Classification Report: \n{classification_report(y_test, y_pred, digits=4)}')

In [None]:
# Saving the trained model and the feature column names

In [None]:
import joblib

# Persist the fitted estimator to 'model.joblib' in the working directory.
joblib.dump(model, 'model.joblib')
# Persist the feature column labels of X (a pandas Index) next to the model.
# NOTE(review): presumably used at inference time to rebuild the same
# one-hot column layout — confirm against the consumer.
joblib.dump(X.columns, 'columns.joblib')