#                 ---Financial Fraud Detection---

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

In [None]:
# Read the raw bank-transaction CSV export into a DataFrame.
# NOTE: the location below is an absolute path on the author's machine.
csv_path = 'C:\\Users\\user\\Downloads\\Bank_Transaction_Fraud_Detection.csv'
data = pd.read_csv(csv_path)
# Preview the first rows to confirm the file parsed as expected.
data.head()

### Data Preprocessing

In [None]:
# Print a summary of the freshly loaded frame (columns, non-null counts, dtypes)
# so columns needing type conversion or cleanup can be spotted.
data.info()

In [None]:
# Parse the separate date and time-of-day columns.
# The time column must match HH:MM:SS; only its time component is kept.
parsed_dates = pd.to_datetime(data['Transaction_Date'])
parsed_times = pd.to_datetime(data['Transaction_Time'], format='%H:%M:%S').dt.time

# Join both parts as "<date> <time>" text and parse that into a single
# timestamp column.
timestamp_text = parsed_dates.astype(str) + ' ' + parsed_times.astype(str)
data['Transaction_Timestamp'] = pd.to_datetime(timestamp_text)

# The combined column supersedes the two source columns, so remove them.
data.drop(columns=['Transaction_Date', 'Transaction_Time'], inplace=True)

# Print the column summary to confirm the new datetime column.
data.info()

In [None]:
# Count rows that repeat an earlier row on the identifying fields below
# (the first occurrence of each combination is not counted as a duplicate).
duplicate_keys = ['Transaction_ID', 'Customer_ID', 'Transaction_Amount', 'Transaction_Timestamp']
duplicate_mask = data.duplicated(subset=duplicate_keys)
duplicate_mask.sum()

In [None]:
# Remove the customer e-mail and contact columns from the frame in place,
# then print the updated column summary.
contact_columns = ['Customer_Email', 'Customer_Contact']
data.drop(contact_columns, axis='columns', inplace=True)
data.info()

In [None]:
# Convert the timestamp to US-style text: MM/DD/YYYY with a 12-hour clock
# and AM/PM marker. After this step the column holds strings, not datetimes.
us_timestamp_format = '%m/%d/%Y %I:%M:%S %p'
formatted_timestamps = data['Transaction_Timestamp'].dt.strftime(us_timestamp_format)
data['Transaction_Timestamp'] = formatted_timestamps
# Preview the reformatted values.
data['Transaction_Timestamp'].head()

In [None]:
# Normalise every column name: trim surrounding whitespace, replace inner
# spaces with underscores and lower-case the result.
data.rename(
    columns=lambda name: name.strip().replace(' ', '_').lower(),
    inplace=True,
)
# Preview the frame with its new column names.
data.head()

## Creating Database

In [None]:
# Persist the cleaned frame to a local SQLite file as the table `transactions`.
# An existing table with that name is replaced, and the DataFrame index is
# not written as a column.
from contextlib import closing

# closing() guarantees the connection is released even if to_sql raises;
# a sqlite3 connection used directly in `with` only manages the transaction
# and does not close itself.
with closing(sqlite3.connect('bank_transactions.db')) as conn:
    data.to_sql('transactions', conn, if_exists='replace', index=False)

## Load Data from Database 

In [None]:
# Reload the stored table from the SQLite file to verify the round trip;
# `data` is rebound to the frame read back from the database.
from contextlib import closing

# closing() releases the connection even if the query fails; the sqlite3
# connection's own context manager would not close it.
with closing(sqlite3.connect('bank_transactions.db')) as conn:
    data = pd.read_sql('SELECT * FROM transactions', conn)

# Preview the reloaded rows.
data.head()

## Identify Numeric & Categorical Columns

In [None]:
# Partition the column names by dtype: 64-bit integer/float columns versus
# object columns.
numeric_frame = data.select_dtypes(include=['int64', 'float64'])
object_frame = data.select_dtypes(include=['object'])
numerical_cols = list(numeric_frame.columns)
categorical_cols = list(object_frame.columns)
# Show both lists as the cell output.
numerical_cols, categorical_cols

## Encode Categorical and Numeric Columns

In [None]:
# Replace every object-typed column with integer codes, using one
# LabelEncoder per column. The fitted encoders are kept, keyed by column
# name, so the same mapping can be reused later.
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

label_encoders = {col: LabelEncoder() for col in categorical_cols}
for col, encoder in label_encoders.items():
    # Fit on the column's values and overwrite the column with its codes.
    data[col] = encoder.fit_transform(data[col])

# Preview the encoded frame.
data.head()


## Split Features and Target


In [None]:
# Separate the target column from the features, then hold out 20% of the
# rows for testing. The split is seeded and stratified on the target, so
# both partitions keep the same class proportions.
from sklearn.model_selection import train_test_split

target_column = 'is_fraud'
y = data[target_column]
X = data.drop(columns=[target_column])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

## Scale Data

In [None]:
# # scale data
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)


## Train Multiple Models

In [None]:
# Train the configured classifiers and report their performance on the
# held-out test split.
# import all necessary model libraries
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
# import evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score

# Candidate models keyed by display name. Only the decision tree is active.
# Every entry, including the disabled ones, ends with a comma so a model can
# be enabled by uncommenting its line without causing a syntax error.
# The tree is seeded so that repeated runs produce the same fitted model and
# metrics, in line with the seeded train/test split.
models = {
    # 'Logistic Regression': LogisticRegression(max_iter=100),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    # 'Random Forest': RandomForestClassifier(),
    # 'Gradient Boosting': GradientBoostingClassifier(),
    # 'Support Vector Machine': SVC(probability=True),
    # 'XGBoost': XGBClassifier(),
}

# Scalar metrics, in the order they are printed.
scalar_metric_names = ('Accuracy', 'Precision', 'Recall', 'F1 Score', 'ROC AUC')

# train and evaluate each model; `results` maps model name -> metric dict
results = {}
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the second class (column 1), used for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]
    # Compute the confusion matrix once and reuse it for the report and plot.
    cm = confusion_matrix(y_test, y_pred)

    # zero_division=0 yields the same 0.0 score scikit-learn would return
    # when a class is never predicted, without emitting a warning.
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
        'Classification Report': classification_report(y_test, y_pred, zero_division=0),
        'Confusion Matrix': cm,
        'ROC AUC': roc_auc_score(y_test, y_proba),
    }

    # display results for each model
    print(f"Results for {model_name}:")
    for metric_name in scalar_metric_names:
        print(f"{metric_name}: {results[model_name][metric_name]:.4f}")
    print("Classification Report:")
    print(results[model_name]['Classification Report'])
    print("Confusion Matrix:")
    print(results[model_name]['Confusion Matrix'])

    # plot confusion matrix for each model
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title(f'Confusion Matrix for {model_name}')
    plt.show()




## Save the model

In [None]:
# Persist the fitted decision tree to disk so it can be reloaded later
# without retraining.
import joblib

trained_tree = models['Decision Tree']
joblib.dump(trained_tree, 'decision_tree_model.pkl')


## Save label encoder

In [None]:
# Persist the per-column label encoders (dict keyed by column name) so new
# data can be encoded with the same mappings the model was trained on.
joblib.dump(value=label_encoders, filename='label_encoders.pkl')
