In [13]:
import pandas as pd
import numpy as np
from haversine import haversine, Unit
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV

seed = 65

# Loading data
data = pd.read_csv("cct_train.csv")

# Distance calculation function
def caldistance(row):
    ulocation = (row['lat'], row['long'])
    mlocation = (row['merch_lat'], row['merch_long'])
    return haversine(ulocation, mlocation, unit=Unit.KILOMETERS)

# Adding column to dataset
data['distance'] = data.apply(caldistance, axis=1)

# Dropping less important features
X = data.drop(columns=['is_fraud', 'street', 'city', 'state', 'zip', 'lat', 'long', 'dob', 'trans_num', 'trans_date', 'merch_lat', 'merch_long'])
y = data['is_fraud']


# Train-test split
xtrain, xtemp, ytrain, ytemp = train_test_split(X, y, test_size=0.4, random_state=seed)
xval, xtest, yval, ytest = train_test_split(xtemp, ytemp, test_size=0.5, random_state=seed)

# Feature Encoding
numfeatures = ['amt', 'city_pop', 'cc_num', 'acct_num', 'unix_time', 'distance']
catfeatures = ['first', 'last', 'category', 'job', 'profile', 'merchant', 'ssn']
numtransformer = Pipeline(steps=[('scaler', StandardScaler())])
cattransformer = Pipeline(steps=[('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numtransformer, numfeatures),
        ('cat', cattransformer, catfeatures)
    ])

# Transforming Data
xtrain = preprocessor.fit_transform(xtrain)
xval = preprocessor.transform(xval)
xtest = preprocessor.transform(xtest)

# Hyperparameter tuning with GridSearchCV
xgbparameter = {
    'scale_pos_weight': [7],
    'learning_rate': [0.1],
    'n_estimators': [400],
    'max_depth': [5],
    'min_child_weight': [5],
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'gamma': [0.6]
}
xgbgrid = GridSearchCV(
    estimator=xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss'),
    param_grid=xgbparameter,
    scoring='roc_auc',
    cv=3,
    verbose=1,
    n_jobs=-1
)
xgbgrid.fit(xtrain, ytrain)
print("Best Hyperparameters:", xgbgrid.best_params_)
best = xgbgrid.best_estimator_

# Best model
ypred = best.predict(xtest)
acc = accuracy_score(ytest, ypred)
print(f"Test Accuracy (Best Model): {acc:.4f}")
print("Confusion Matrix (Best Model):\n", confusion_matrix(ytest, ypred))
print("\nClassification Report (Best Model):\n", classification_report(ytest, ypred))

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Hyperparameters: {'colsample_bytree': 0.7, 'gamma': 0.6, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 5, 'n_estimators': 400, 'scale_pos_weight': 7, 'subsample': 0.7}
Test Accuracy (Best Model): 0.9989
Confusion Matrix (Best Model):
 [[139386     46]
 [   105    625]]

Classification Report (Best Model):
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    139432
           1       0.93      0.86      0.89       730

    accuracy                           1.00    140162
   macro avg       0.97      0.93      0.95    140162
weighted avg       1.00      1.00      1.00    140162



In [14]:
import joblib

# Saving the trained model
joblib.dump(best, 'T1.joblib')

['T1.joblib']