In [26]:
import sys
import subprocess
import pkg_resources

def install_requirements(file='../requirements.txt'):
    try:
        with open(file) as f:
            packages = [line.strip() for line in f if line.strip() and not line.startswith('#')]
    except FileNotFoundError:
        print(f"{file} not found.")
        return

    for package in packages:
        # Extract package name without version specifier for import
        pkg_name = package.split('==')[0].split('>=')[0].split('<=')[0]

        try:
            # Check if the package is installed
            dist = pkg_resources.get_distribution(pkg_name)
            installed_version = dist.version

            # Check if the installed version satisfies the requirement
            requirement = pkg_resources.Requirement.parse(package)
            if installed_version not in requirement:
                print(f"Upgrading {package} (installed version: {installed_version})...")
                subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", package])
            else:
                print(f"{package} already installed and up to date (version {installed_version}).")

        except pkg_resources.DistributionNotFound:
            print(f"Installing {package}...")
            subprocess.check_call([sys.executable, "-m", "pip", "install", package])

install_requirements()


pandas already installed and up to date (version 1.3.4).
numpy already installed and up to date (version 1.20.3).
matplotlib already installed and up to date (version 3.4.3).
seaborn already installed and up to date (version 0.11.2).
scikit-learn already installed and up to date (version 1.6.1).
lightgbm already installed and up to date (version 4.6.0).
imblearn already installed and up to date (version 0.0).


In [27]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from datetime import timedelta

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Processed Data Load

In [28]:
df = pd.read_csv("../Data/Processed/loan_transactions_features.csv")

# Data preprocessing for model

In [29]:
# 1. Define target variables
y_fraud = df['fraud_flag']
y_loan_status = df['loan_status']

In [30]:
# 2. Define the feature set X by dropping irrelevant columns
X = df.drop(columns=['fraud_flag', 'loan_status', 'fraud_type', 'application_id', 'customer_id', 'application_date'])

In [31]:
# 3. Identify numerical and categorical features
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

print("\033[1mNumerical features:\033[0m", numerical_features)
print('-'*70)
print("\033[1mCategorical features:\033[0m", categorical_features)

[1mNumerical features:[0m ['application_year', 'application_month', 'application_day_of_week', 'applicant_age', 'number_of_dependents', 'monthly_income', 'cibil_score', 'loan_amount_requested', 'loan_tenure_months', 'interest_rate_offered', 'loan_amount_to_income_ratio', 'existing_emis_monthly', 'existing_emi_to_income_ratio', 'debt_to_income_ratio', 'transaction_count', 'total_transaction_amount', 'avg_transaction_amount', 'max_transaction_amount', 'fraud_count', 'fraud_rate', 'international_txn_rate', 'unique_devices_used', 'transaction_count_pre', 'total_amount_pre', 'avg_amount_pre', 'fraud_count_pre', 'fraud_rate_pre']
----------------------------------------------------------------------
[1mCategorical features:[0m ['gender', 'residential_address', 'property_ownership_status', 'employment_status', 'loan_type', 'purpose_of_loan']


In [32]:
# 4. Create a ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough' # Keep other columns that are not transformed
)

In [33]:
# 5. Fit and transform the feature set X
X_processed = preprocessor.fit_transform(X)

print("\nShape of original features X:", X.shape)
print("Shape of processed features X_processed:", X_processed.shape)


Shape of original features X: (50000, 33)
Shape of processed features X_processed: (50000, 18365)
