In [1]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix


In [2]:
# Read the train and test splits. Each CSV carries an 'Unnamed: 0' column
# (presumably a saved row index -- TODO confirm) that is discarded right away.
train_data, test_data = (
    pd.read_csv(csv_path).drop(columns=['Unnamed: 0'])
    for csv_path in ('fraudTrain.csv', 'fraudTest.csv')
)

Training Data Preprocessing - Same as Simon's

In [None]:
# Build the model-ready training frame. Working on a copy keeps the raw
# `train_data` untouched, so this cell can be re-run on its own.
transformed_train_data = train_data.copy()

# Distance between merchant and customer.
# NOTE(review): this is a Euclidean distance in raw latitude/longitude degrees,
# not a distance in km -- confirm that a relative measure is what is intended.
transformed_train_data['distance'] = np.sqrt(
    (transformed_train_data['lat'] - transformed_train_data['merch_lat']) ** 2
    + (transformed_train_data['long'] - transformed_train_data['merch_long']) ** 2
)

# Transforming dob to age (years, approximated as whole days / 365).
# NOTE(review): age is measured against the wall-clock time of the run, so the
# feature shifts every time the notebook is executed. The test preprocessing
# uses the same reference; any change here has to be mirrored there.
transformed_train_data['dob'] = pd.to_datetime(transformed_train_data['dob'])
transformed_train_data['age'] = (datetime.now() - transformed_train_data['dob']).dt.days / 365
transformed_train_data = transformed_train_data.drop(columns=['dob'])

# Transforming the transaction timestamp to hour of day, day of week and month.
# Parsed with a single vectorised call rather than a per-row datetime.strptime.
date = pd.to_datetime(
    transformed_train_data['trans_date_trans_time'],
    format='%Y-%m-%d %H:%M:%S',
)
transformed_train_data['hour'] = date.dt.hour
transformed_train_data['day'] = date.dt.dayofweek
transformed_train_data['month'] = date.dt.month
transformed_train_data = transformed_train_data.drop(columns=['trans_date_trans_time'])

# Log-transform skewed data (log1p keeps zero amounts finite)
transformed_train_data['amt'] = np.log1p(transformed_train_data['amt'])

# Drop identifiers, raw coordinates (replaced by 'distance') and other columns
# that are not used as model features.
transformed_train_data = transformed_train_data.drop(
    columns=[
        'first', 'last', 'street', 'lat', 'long', 'merch_lat', 'merch_long',
        'cc_num', 'trans_num', 'unix_time', 'job',
    ]
)

# Transforming categorical data (Label encoding).
# NOTE: one encoder instance is reused for every entry of `label_vars`, so after
# the loop it only remembers the fit for the last variable. That is fine while
# `label_vars` has a single entry; keep one encoder per column if it grows.
label_encoder = LabelEncoder()
label_vars = ['gender']

for var in label_vars:
    transformed_train_data[var] = label_encoder.fit_transform(transformed_train_data[var])

# Fit frequency encoding on training data: each category is replaced by its
# relative frequency in the training set. The mappings are kept in
# `frequency_encodings` so the identical encoding can be applied to other data.
frequency_vars = ['merchant', 'city', 'state', 'category', 'zip']
frequency_encodings = {}

for col in frequency_vars:
    freq_encoding = transformed_train_data[col].value_counts(normalize=True).to_dict()
    frequency_encodings[col] = freq_encoding
    transformed_train_data[col] = transformed_train_data[col].map(freq_encoding)

Test Data Preprocessing - Slight tweak to Simon's. I made sure to apply the same label and frequency encoding to the test data as he did in the training data.

In [None]:
# Build the model-ready test frame with the same steps as the training data.
# Working on a copy keeps the raw `test_data` untouched.
transformed_test_data = test_data.copy()

# Distance between merchant and customer.
# NOTE(review): Euclidean distance in raw latitude/longitude degrees, not km.
transformed_test_data['distance'] = np.sqrt(
    (transformed_test_data['lat'] - transformed_test_data['merch_lat']) ** 2
    + (transformed_test_data['long'] - transformed_test_data['merch_long']) ** 2
)

# Transforming dob to age (years, approximated as whole days / 365).
# NOTE(review): age is measured against the wall-clock time of the run, matching
# the training preprocessing; change both together or the feature will drift.
transformed_test_data['dob'] = pd.to_datetime(transformed_test_data['dob'])
transformed_test_data['age'] = (datetime.now() - transformed_test_data['dob']).dt.days / 365
transformed_test_data = transformed_test_data.drop(columns=['dob'])

# Transforming the transaction timestamp to hour of day, day of week and month.
# Parsed with a single vectorised call rather than a per-row datetime.strptime.
date = pd.to_datetime(
    transformed_test_data['trans_date_trans_time'],
    format='%Y-%m-%d %H:%M:%S',
)
transformed_test_data['hour'] = date.dt.hour
transformed_test_data['day'] = date.dt.dayofweek
transformed_test_data['month'] = date.dt.month
transformed_test_data = transformed_test_data.drop(columns=['trans_date_trans_time'])

# Log-transform skewed data (log1p keeps zero amounts finite)
transformed_test_data['amt'] = np.log1p(transformed_test_data['amt'])

# Drop identifiers, raw coordinates (replaced by 'distance') and other columns
# that are not used as model features.
transformed_test_data = transformed_test_data.drop(
    columns=[
        'first', 'last', 'street', 'lat', 'long', 'merch_lat', 'merch_long',
        'cc_num', 'trans_num', 'unix_time', 'job',
    ]
)

# Apply the label encoding that was fitted on the training data.
# `transform` (not `fit_transform`) is used on purpose: re-fitting on the test
# set would learn a fresh mapping that may disagree with the training one.
# A label never seen in training raises ValueError instead of being silently
# mis-encoded.
# NOTE: `label_encoder` holds the fit of the last entry in `label_vars` only;
# this is correct while `label_vars` contains a single column.
for var in label_vars:
    transformed_test_data[var] = label_encoder.transform(transformed_test_data[var])

# Apply the same frequency encoding (learned on the training data) to the test data.
# Categories absent from the training data map to NaN and are filled with 0,
# i.e. "never observed in training".
for col in frequency_vars:
    transformed_test_data[col] = transformed_test_data[col].map(frequency_encodings[col]).fillna(0)

I tried applying SMOTE and actually got a lower utility, so I moved forward with a very simple model below. I haven't changed any of the parameters in the XGBClassifier, so I am sure we could tweak them to make the model even better.

In [None]:
# Separate the 'is_fraud' label from the feature columns for each split.
X_train = transformed_train_data.drop(columns=['is_fraud'])
y_train = transformed_train_data['is_fraud']

X_test = transformed_test_data.drop(columns=['is_fraud'])
y_test = transformed_test_data['is_fraud']

# --- Disabled experiment, kept for reference only (not executed) ---
# Apply SMOTE to balance the training data
#smote = SMOTE(random_state=42)
#X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Feature Scaling
#scaler = StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train_res)
#X_test_scaled = scaler.transform(X_test)

In [None]:
# Balance the classes via scale_pos_weight: the ratio of negative (label 0) to
# positive (label 1) training examples, which XGBoost uses to weight the
# positive class. value_counts() is indexed by label here, not by position.
class_counts = y_train.value_counts()
scale_pos_weight = class_counts[0] / class_counts[1]

# Initialize XGBoost model.
# `use_label_encoder` is intentionally not passed: current XGBoost ignores it
# and only emits a "Parameters: { "use_label_encoder" } are not used." warning.
model = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=scale_pos_weight,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc'
)

model.fit(X_train, y_train)

# Predict on the test set: hard labels plus the probability of the fraud class
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))
# Pin the label order so the matrix is always 2x2 with rows = true class and
# columns = predicted class, even if one class is missing from the predictions.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print(cm)

TP = cm[1, 1]  # fraud correctly flagged
FN = cm[1, 0]  # fraud that was missed
FP = cm[0, 1]  # legitimate transaction flagged as fraud

# Utility weights, as used in the formula below: gain per true positive and
# cost per false negative / false positive.
# NOTE(review): the values and their units presumably come from the project
# brief -- confirm before changing them.
S_minus_L = 50
C = 100
P = 5
utility = TP * S_minus_L - FN * C - FP * P
print(f'Total Utility: {utility}')


Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

           0       1.00      0.99      0.99    553574
           1       0.23      0.96      0.37      2145

    accuracy                           0.99    555719
   macro avg       0.62      0.97      0.68    555719
weighted avg       1.00      0.99      0.99    555719

[[546775   6799]
 [    90   2055]]
Total Utility: 59755
