### 01 EDA

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
import statsmodels.formula.api as smf
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor

In [2]:
transactions = pd.read_csv(r'C:\Users\sunsu\OneDrive\Desktop\Winter ML Project 25-26\loan pred ipynb new\data\raw\train_transaction.csv')
identity = pd.read_csv(r'C:\Users\sunsu\OneDrive\Desktop\Winter ML Project 25-26\loan pred ipynb new\data\raw\train_identity.csv')


df = pd.merge(transactions, identity, on='TransactionID', how='left')
#df.to_csv("output.csv", index=False)
df

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.50,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,3577535,0,15811047,49.00,W,6550,,150.0,visa,226.0,...,,,,,,,,,,
590536,3577536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,...,,,,,,,,,,
590537,3577537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,...,,,,,,,,,,
590538,3577538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,...,,,,,,,,,,


In [3]:
df = df.sort_values(by = 'TransactionDT').reset_index(drop = True)

cutoff_index = int(0.8 * len(df))

df_train = df.iloc[:cutoff_index]
df_test = df.iloc[cutoff_index:]

df_train.shape[0]

472432

In [4]:
feature_train = df_train.drop(columns = "isFraud")
feature_test = df_test.drop(columns = "isFraud")

target_train = df_train["isFraud"]
target_test = df_test["isFraud"]

In [5]:
categorical_cols = feature_train.select_dtypes(include = ['object']).columns.tolist()
numerical_cols = feature_train.select_dtypes(exclude = ['object']).columns.tolist()

In [6]:
from sklearn.impute import SimpleImputer

ni = SimpleImputer(strategy = 'median')

feature_train[numerical_cols] = ni.fit_transform(feature_train[numerical_cols])
feature_test[numerical_cols] = ni.transform(feature_test[numerical_cols])

In [7]:
ci = SimpleImputer(strategy = 'constant', fill_value = 'missing')

feature_train[categorical_cols] = ci.fit_transform(feature_train[categorical_cols])
feature_test[categorical_cols] = ci.transform(feature_test[categorical_cols])        

In [8]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
feature_train[categorical_cols] = enc.fit_transform(feature_train[categorical_cols])
feature_test[categorical_cols] = enc.transform(feature_test[categorical_cols])

In [9]:
print(feature_train.shape, feature_test.shape)
print(feature_train.isna().sum().sum())  # Should be 0
print(feature_test.isna().sum().sum())   # Should be 0

(472432, 433) (118108, 433)
0
0


In [10]:
import lightgbm as lgb
from sklearn.metrics import average_precision_score

#Weights to have a ratio of 1:10 for fraud to non-fraud cases
num_fraud = target_train.sum()
num_nonFraud = len(target_train) - num_fraud
fraud_ratio = num_nonFraud / num_fraud

#Convertin data into LightGBM dataset format
train_data = lgb.Dataset(feature_train, label = target_train)
val_data = lgb.Dataset(feature_test, label = target_test, reference = train_data)

#params define what should the model have in terms of objective, metric, boosting type, etc.
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'metric': 'average_precision',
    'learning_rate': 0.1,
    'num_leaves': 31,
    'max_depth': -1,
    'scale_pos_weight' : fraud_ratio,
    'verbose': -1,
    'n_jobs': -1,
}

#model defines how the model should actual run for training
model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, val_data],
    valid_names=['train', 'valid'],
    num_boost_round = 200,
    callbacks=[
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=10)
    ]
    #verbose_eval = 10
)

y_pred = model.predict(feature_test, num_iteration=model.best_iteration)
AUC = average_precision_score(target_test, y_pred)
AUC

Training until validation scores don't improve for 20 rounds
[10]	train's average_precision: 0.439973	valid's average_precision: 0.358607
[20]	train's average_precision: 0.503574	valid's average_precision: 0.425116
[30]	train's average_precision: 0.546739	valid's average_precision: 0.443067
[40]	train's average_precision: 0.574376	valid's average_precision: 0.454916
[50]	train's average_precision: 0.598946	valid's average_precision: 0.459033
[60]	train's average_precision: 0.616264	valid's average_precision: 0.469988
[70]	train's average_precision: 0.630788	valid's average_precision: 0.476033
[80]	train's average_precision: 0.643935	valid's average_precision: 0.48165
[90]	train's average_precision: 0.655369	valid's average_precision: 0.48588
[100]	train's average_precision: 0.666299	valid's average_precision: 0.492512
[110]	train's average_precision: 0.677207	valid's average_precision: 0.495919
[120]	train's average_precision: 0.684827	valid's average_precision: 0.496954
[130]	train's 

np.float64(0.5187289522586421)

In [11]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix
import numpy as np 

thresholds = [0.5,0.6,0.7,0.8,0.9]

for thresh in thresholds:
    y_label = (y_pred > thresh).astype(int)
    precision = precision_score(target_test, y_label)
    recall = recall_score(target_test, y_label)
    f1 = f1_score(target_test, y_label)
    cm = confusion_matrix(target_test, y_label)
    tn, fp, fn, tp = cm.ravel()
    print(f"Threshold: {thresh:.2f} | Precision: {precision:.3f} | Recall: {recall:.3f} | F1: {f1:.3f} | Flags: {y_label.sum()} | TP: {tp} | FP: {fp}")

Threshold: 0.50 | Precision: 0.209 | Recall: 0.734 | F1: 0.325 | Flags: 14280 | TP: 2983 | FP: 11297
Threshold: 0.60 | Precision: 0.267 | Recall: 0.676 | F1: 0.383 | Flags: 10285 | TP: 2746 | FP: 7539
Threshold: 0.70 | Precision: 0.351 | Recall: 0.610 | F1: 0.445 | Flags: 7068 | TP: 2479 | FP: 4589
Threshold: 0.80 | Precision: 0.481 | Recall: 0.500 | F1: 0.490 | Flags: 4233 | TP: 2034 | FP: 2199
Threshold: 0.90 | Precision: 0.701 | Recall: 0.374 | F1: 0.487 | Flags: 2165 | TP: 1518 | FP: 647


In [12]:
preprocessor = {
    "num_imputer": ni,
    "cat_imputer": ci,
    "encoder": enc,
    "numerical_cols": numerical_cols,
    "categorical_cols": categorical_cols
}

In [13]:
import joblib

joblib.dump(model, "fraud_model.pkl")
joblib.dump(preprocessor, "preprocessor.pkl")

['preprocessor.pkl']

In [14]:
def prob_result(prob):
    if prob >= 0.9:
        return "AUTO BLOCK"
    elif prob >= 0.7:
        return "HUMAN REVIEW"
    else:
        return "AUTO ALLOW"

In [15]:
df_results = feature_test.copy()

df_results["isFraud_Prob"] = y_pred
df_results["Decision"] = df_results["isFraud_Prob"].apply(prob_result)


In [16]:
df_results["Decision"].value_counts(normalize = True)

Decision
AUTO ALLOW      0.940156
HUMAN REVIEW    0.041513
AUTO BLOCK      0.018331
Name: proportion, dtype: float64

In [17]:
import numpy, pandas, lightgbm, sklearn
print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("lightgbm:", lgb.__version__)
print("sklearn:", sklearn.__version__)  

numpy: 2.1.3
pandas: 2.3.3
lightgbm: 4.6.0
sklearn: 1.6.1
