<h2> Libraries

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score, make_scorer, roc_auc_score, confusion_matrix, roc_curve
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
import seaborn as sns

<h2> Loading the train and test dataset

In [3]:
# Load train data 
training_data = pd.read_csv("data/train.csv")
training_data.shape

(14254, 22)

In [4]:
# Load Test data
test_data = pd.read_csv("data/test.csv")
test_data.shape

(3564, 22)

<h2> Prepare Features and Target 

In [5]:
# Prepare X_train (features), y_train (target)
X_train = training_data.drop(['fraud'], axis = 1)
y_train = training_data['fraud']

# Prepare X_test (predictors), y_test (target)
X_test = test_data.drop(['fraud'], axis = 1)
y_test = test_data['fraud']

In [6]:
X_train.columns

Index(['marital_status', 'high_education_ind', 'address_change_ind',
       'past_num_of_claims', 'witness_present_ind', 'liab_prct',
       'policy_report_filed_ind', 'age', 'safety_grade',
       'annual_income_category', 'part_of_month', 'weekday',
       'accident_site_Highway', 'accident_site_Local',
       'accident_site_Parking Lot', 'channel_Broker', 'channel_Online',
       'channel_Phone', 'claim_est_payout_category', 'age_of_vehicle_category',
       'vehicle_price_category'],
      dtype='object')

<h2> Baseline SVM Model

In [15]:
# Train an SVM classifier 
svc = SVC(kernel='linear',probability=True, random_state = 0)  

# Fit the model on training data
svc.fit(X_train, y_train)


# Calculate and print the accuracy score on the training data
train_accuracy = svc.score(X_train, y_train)
print("Training Accuracy:", train_accuracy)

# Calculate and print the accuracy score on the test data
test_accuracy = svc.score(X_test, y_test)
print("Test Accuracy:", test_accuracy)

y_svc_pred = svc.predict_proba(X_test)
y_pred = svc.predict(X_test)

linear_report = classification_report(y_test  , y_pred)


# print classification report
print(linear_report)

Training Accuracy: 0.8427108180159955
Test Accuracy: 0.8451178451178452
              precision    recall  f1-score   support

           0       0.85      1.00      0.92      3012
           1       0.00      0.00      0.00       552

    accuracy                           0.85      3564
   macro avg       0.42      0.50      0.46      3564
weighted avg       0.71      0.85      0.77      3564



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


We see that there is a mass imbalance in classes between Fraudulent and Non Fraudulent cases in our training data, resulting in the model struggling to correctly predict fraudulent (class 1) transactions. Hence, Synthetic Minority Over Sampling Technique (SMOTE) can be used to balance the class distribution through data augmentation.

<h2> Weighting classes

In [23]:
# Train an SVM classifier 
svc_2 = SVC(kernel='linear',probability=True, random_state = 0, class_weight='balanced')  

# Fit the model on training data
svc_2.fit(X_train, y_train)


# Calculate and print the accuracy score on the training data
train_accuracy_2 = svc_2.score(X_train, y_train)
print("Training Accuracy after SMOTE:", train_accuracy_2)

# Calculate and print the accuracy score on the test data
test_accuracy_2 = svc_2.score(X_test, y_test)
print("Test Accuracy after SMOTE:", test_accuracy_2)

y_svc_pred_2 = svc_2.predict_proba(X_test)
y_pred_2 = svc_2.predict(X_test)

weighted_report = classification_report(y_test, y_pred_2)

# print classification report
print(weighted_report)

Training Accuracy after SMOTE: 0.6311912445629297
Test Accuracy after SMOTE: 0.6262626262626263
              precision    recall  f1-score   support

           0       0.91      0.62      0.74      3012
           1       0.24      0.65      0.35       552

    accuracy                           0.63      3564
   macro avg       0.57      0.64      0.54      3564
weighted avg       0.80      0.63      0.68      3564



<h2> SMOTE

In [24]:
# Create a DataFrame with class counts
class_counts_df = pd.DataFrame({'Class': [0, 1], 'Count': pd.Series(y_train).value_counts().sort_index()})

# Display the DataFrame
print(class_counts_df)

   Class  Count
0      0  12012
1      1   2242


In [16]:
# Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [17]:
X_train_resampled.shape

(24024, 21)

In [18]:
y_train_resampled.shape

(24024,)

In [19]:
# Create a DataFrame with class counts
class_counts_df = pd.DataFrame({'Class': [0, 1], 'Count': pd.Series(y_train_resampled).value_counts().sort_index()})

# Display the DataFrame
print(class_counts_df)

   Class  Count
0      0  12012
1      1  12012


In [20]:
# Create a DataFrame with class counts for test
class_counts_test = pd.DataFrame({'Class': [0, 1], 'Count': pd.Series(y_test).value_counts().sort_index()})

# Display the DataFrame
print(class_counts_test)

   Class  Count
0      0   3012
1      1    552


In [21]:
# Apply resampled data on SVM
# Train an SVM classifier 
svc_SMOTE = SVC(kernel='linear',probability=True, random_state = 0)  

# Fit the model on training data
svc_SMOTE.fit(X_train_resampled, y_train_resampled)


# Calculate and print the accuracy score on the training data
train_accuracy_resampled = svc_SMOTE.score(X_train_resampled, y_train_resampled)
print("Training Accuracy after SMOTE:", train_accuracy_resampled)

# Calculate and print the accuracy score on the test data
test_accuracy_resampled = svc_SMOTE.score(X_test, y_test)
print("Test Accuracy after SMOTE:", test_accuracy_resampled)

y_svc_SMOTE_pred = svc_SMOTE.predict_proba(X_test)
y_SMOTE_pred = svc_SMOTE.predict(X_test)

SMOTE_linear_report = classification_report(y_test, y_SMOTE_pred)


# print classification report
print(SMOTE_linear_report)

Training Accuracy after SMOTE: 0.7592823842823843
Test Accuracy after SMOTE: 0.7283950617283951
              precision    recall  f1-score   support

           0       0.88      0.79      0.83      3012
           1       0.26      0.39      0.31       552

    accuracy                           0.73      3564
   macro avg       0.57      0.59      0.57      3564
weighted avg       0.78      0.73      0.75      3564



We see a definite improvement in our precision (0.26), recall (0.39) and f1-score (0.31) after SMOTE Resampling as compared to the baseline model.
1. Precision (for Class 1):
Precision measures the accuracy of the positive predictions. In the context of fraud detection, precision tells you the proportion of predicted fraud cases that are actually fraud. In our model, it's 26%, meaning that out of the instances predicted as fraud, 26% are actually fraud.

2. Recall (for Class 1):
Recall (or sensitivity) measures the ability of the model to capture all the positive instances. In fraud detection, recall tells you the proportion of actual fraud cases that the model has identified. In our model, it's 39%, meaning that the model has captured 39% of the actual fraud cases.

3. F1-Score (for Class 1):
F1-score is the harmonic mean of precision and recall. It provides a balance between precision and recall. A higher F1-score indicates a better trade-off between precision and recall. In our model, it's 31%, suggesting a moderate balance between precision and recall.

<H2> Combining SMOTE and weighting

In [31]:
from sklearn.utils.class_weight import compute_class_weight

# Assuming y_train is your target variable
class_labels = np.unique(y_train)
class_weights = compute_class_weight(class_weight = 'balanced', classes = class_labels, y = y_train)

# Print the calculated class weights
for label, weight in zip(class_labels, class_weights):
    print(f"Class {label}: {weight}")

Class 0: 0.5933233433233434
Class 1: 3.17885816235504


In [33]:
# Apply resampled data on SVM
# Train an SVM classifier 
svc_SMOTE = SVC(kernel='linear',probability=True, random_state = 0, class_weight = {0: 1, 1: 2})  

# Fit the model on training data
svc_SMOTE.fit(X_train_resampled, y_train_resampled)


# Calculate and print the accuracy score on the training data
train_accuracy_resampled = svc_SMOTE.score(X_train_resampled, y_train_resampled)
print("Training Accuracy after SMOTE:", train_accuracy_resampled)

# Calculate and print the accuracy score on the test data
test_accuracy_resampled = svc_SMOTE.score(X_test, y_test)
print("Test Accuracy after SMOTE:", test_accuracy_resampled)

y_svc_SMOTE_pred = svc_SMOTE.predict_proba(X_test)
y_SMOTE_pred = svc_SMOTE.predict(X_test)

SMOTE_linear_report = classification_report(y_test, y_SMOTE_pred)


# print classification report
print(SMOTE_linear_report)

Training Accuracy after SMOTE: 0.7291042291042291
Test Accuracy after SMOTE: 0.5892255892255892
              precision    recall  f1-score   support

           0       0.91      0.57      0.70      3012
           1       0.23      0.69      0.34       552

    accuracy                           0.59      3564
   macro avg       0.57      0.63      0.52      3564
weighted avg       0.80      0.59      0.65      3564



We see relative improvements in the recall (0.69) and F1-score (0.34) after setting class weights to {0:1, 1:2}. We will proceed with hyper parameter tuning to find the ideal class weights.

<h2> Hyper Parameter Tuning

In [42]:
# Define the hyperparameter grid
param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf'], #Poly omitted due to time constraints
    'class_weight': ['balanced', {0: 1, 1: 2}, {0: 1, 1: 1.5}]
}

# Create an SVM classifier
svc_tune = SVC(probability=True, random_state=0)

# Instantiate GridSearchCV
grid_search = GridSearchCV(svc_tune, param_grid, scoring= 'recall', cv=3, verbose = 2)

# Fit the grid search to the data
grid_search.fit(X_train_resampled, y_train_resampled)

# Print the best parameters
print("Best Hyperparameters:", grid_search.best_params_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ........C=0.1, class_weight=balanced, kernel=linear; total time=  20.6s
[CV] END ........C=0.1, class_weight=balanced, kernel=linear; total time=  25.6s
[CV] END ........C=0.1, class_weight=balanced, kernel=linear; total time=  26.2s
[CV] END ...........C=0.1, class_weight=balanced, kernel=rbf; total time=  44.7s
[CV] END ...........C=0.1, class_weight=balanced, kernel=rbf; total time=  49.5s
[CV] END ...........C=0.1, class_weight=balanced, kernel=rbf; total time=  49.4s
[CV] END ....C=0.1, class_weight={0: 1, 1: 2}, kernel=linear; total time=  21.7s
[CV] END ....C=0.1, class_weight={0: 1, 1: 2}, kernel=linear; total time=  27.5s
[CV] END ....C=0.1, class_weight={0: 1, 1: 2}, kernel=linear; total time=  31.8s
[CV] END .......C=0.1, class_weight={0: 1, 1: 2}, kernel=rbf; total time=  44.9s
[CV] END .......C=0.1, class_weight={0: 1, 1: 2}, kernel=rbf; total time=  48.4s
[CV] END .......C=0.1, class_weight={0: 1, 1: 2}

In [43]:
# Use the best model for predictions
tuned_svc = grid_search.best_estimator_
y_pred_tune = tuned_svc.predict(X_test)
y_pred_prob_tune = tuned_svc.predict_proba(X_test)

# Calculate and print the accuracy score on the training data
train_accuracy_tune = tuned_svc.score(X_train_resampled, y_train_resampled)
print("Training Accuracy after tuning:", train_accuracy_tune)

# Calculate and print the accuracy score on the test data
test_accuracy_tune = tuned_svc.score(X_test, y_test)
print("Test Accuracy after tuning:", test_accuracy_tune)

# View Evaluation Metrics
precision = precision_score(y_test, y_pred_tune, pos_label=1)
recall = recall_score(y_test, y_pred_tune, pos_label=1)
f1 = f1_score(y_test, y_pred_tune, pos_label=1)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Training Accuracy after tuning: 0.7252331002331003
Test Accuracy after tuning: 0.5468574635241302
Precision: 0.21893178212585934
Recall: 0.75
F1-Score: 0.3389275480966025


The tuned model has the hyper paramters of C = 0.1, Class_weight of 0:1, 1:2 and kernel of rbf and has the best recall while having a relatively similar precision and F1-score as the previous models. 

<h2> Threshold Tuning

In [46]:
#Youdens Index
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob_tune[:,1])
j_scores = tpr - fpr
optimal_idx = np.argmax(j_scores)
optimal_threshold = thresholds[optimal_idx]
print("Optimal Threshold from Youden's Index: ", optimal_threshold)

Optimal Threshold from Youden's Index:  0.2730460791686107


In [47]:
# Get evaluation with optimal threshold
preds = np.where(y_pred_prob_tune[:,1] > optimal_threshold, 1, 0)
# View Evaluation Metrics
precision = precision_score(y_test, preds, pos_label=1)
recall = recall_score(y_test, preds, pos_label=1)
f1 = f1_score(y_test, preds, pos_label=1)

print("Precision:", precision)
print("Recall:", recall)
print("F1-Score:", f1)

Precision: 0.22204301075268817
Recall: 0.7481884057971014
F1-Score: 0.34245439469320066


To balance out the precision and recall slightly, we can find the optimal threshold from Youden's Index, to get this final evaluation metrics.

| Precision | Recall | F1 - score |
| --------- | ------ | ---------- | 
|   0.22    |  0.75  |    0.34   |