# 0.) Import the Credit Card Fraud Data From CCLE

In [1]:
import pandas as pd
# from google.colab import drive
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# drive.mount('/content/gdrive/', force_remount = True)

In [3]:
df = pd.read_csv("fraudTest.csv")

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2020-06-21 12:14:25,2291163933867244,fraud_Kirlin and Sons,personal_care,2.86,Jeff,Elliott,M,351 Darlene Green,...,33.9659,-80.9355,333497,Mechanical engineer,1968-03-19,2da90c7d74bd46a0caf3777415b3ebd3,1371816865,33.986391,-81.200714,0
1,1,2020-06-21 12:14:33,3573030041201292,fraud_Sporer-Keebler,personal_care,29.84,Joanne,Williams,F,3638 Marsh Union,...,40.3207,-110.436,302,"Sales professional, IT",1990-01-17,324cc204407e99f51b0d6ca0055005e7,1371816873,39.450498,-109.960431,0
2,2,2020-06-21 12:14:53,3598215285024754,"fraud_Swaniawski, Nitzsche and Welch",health_fitness,41.28,Ashley,Lopez,F,9333 Valentine Point,...,40.6729,-73.5365,34496,"Librarian, public",1970-10-21,c81755dbbbea9d5c77f094348a7579be,1371816893,40.49581,-74.196111,0
3,3,2020-06-21 12:15:15,3591919803438423,fraud_Haley Group,misc_pos,60.05,Brian,Williams,M,32941 Krystal Mill Apt. 552,...,28.5697,-80.8191,54767,Set designer,1987-07-25,2159175b9efe66dc301f149d3d5abf8c,1371816915,28.812398,-80.883061,0
4,4,2020-06-21 12:15:17,3526826139003047,fraud_Johnston-Casper,travel,3.19,Nathan,Massey,M,5783 Evan Roads Apt. 465,...,44.2529,-85.017,1126,Furniture designer,1955-07-06,57ff021bd3f328f8738bb535c302a31b,1371816917,44.959148,-85.884734,0


In [5]:
df_select = df[["trans_date_trans_time", "category", "amt", "city_pop", "is_fraud"]]

df_select["trans_date_trans_time"] = pd.to_datetime(df_select["trans_date_trans_time"])
df_select["time_var"] = [i.second for i in df_select["trans_date_trans_time"]]

X = pd.get_dummies(df_select, ["category"]).drop(["trans_date_trans_time", "is_fraud"], axis = 1)
y = df["is_fraud"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["trans_date_trans_time"] = pd.to_datetime(df_select["trans_date_trans_time"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_select["time_var"] = [i.second for i in df_select["trans_date_trans_time"]]


# 1.) Use scikit learn preprocessing to split the data into 70/30 in out of sample

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3)

In [8]:
X_test, X_holdout, y_test, y_holdout = train_test_split(X_test, y_test, test_size = .5)

In [9]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_holdout = scaler.transform(X_holdout)

# 2.) Make three sets of training data (Oversample, Undersample and SMOTE)

In [10]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [11]:
ros = RandomOverSampler()
over_X, over_y = ros.fit_resample(X_train, y_train)

rus = RandomUnderSampler()
under_X, under_y = rus.fit_resample(X_train, y_train)

smote = SMOTE()
smote_X, smote_y = smote.fit_resample(X_train, y_train)

# 3.) Train three logistic regression models

In [12]:
from sklearn.linear_model import LogisticRegression

In [13]:
over_log = LogisticRegression().fit(over_X, over_y)

under_log = LogisticRegression().fit(under_X, under_y)

smote_log = LogisticRegression().fit(smote_X, smote_y)

# 4.) Test the three models

In [14]:
over_log.score(X_test, y_test)

0.9329278533554068

In [15]:
under_log.score(X_test, y_test)

0.9166966577892943

In [16]:
smote_log.score(X_test, y_test)

0.9326279421291298

In [17]:
# We see SMOTE performing with higher accuracy but is ACCURACY really the best measure?

# 5.) Which performed best in Out of Sample metrics?

In [18]:
# Sensitivity here in credit fraud is more important as seen from last class

In [19]:
from sklearn.metrics import confusion_matrix

In [20]:
y_true = y_test

In [21]:
y_pred = over_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[77529,  5502],
       [   89,   238]])

In [22]:
print("Over Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

Over Sample Sensitivity :  0.72782874617737


In [23]:
y_pred = under_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[76175,  6856],
       [   88,   239]])

In [24]:
print("Under Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

Under Sample Sensitivity :  0.7308868501529052


In [25]:
y_pred = smote_log.predict(X_test)
cm = confusion_matrix(y_true, y_pred)
cm

array([[77504,  5527],
       [   89,   238]])

In [26]:
print("SMOTE Sample Sensitivity : ", cm[1,1] /( cm[1,0] + cm[1,1]))

SMOTE Sample Sensitivity :  0.72782874617737


# 6.) Pick two features and plot the two classes before and after SMOTE.

In [27]:
X_train = pd.DataFrame(X_train)
y_train = pd.DataFrame(y_train)
raw_temp = pd.concat([X_train, y_train], axis =1)

In [28]:
raw_temp.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,is_fraud
0,0.07683,-0.288409,1.302759,-0.279462,-0.277039,-0.335731,-0.191139,-0.323052,-0.265399,-0.322047,3.234997,-0.22764,-0.257571,-0.276263,-0.285295,-0.312863,-0.180136,
1,-0.168211,-0.270055,1.129332,-0.279462,-0.277039,-0.335731,5.231787,-0.323052,-0.265399,-0.322047,-0.309119,-0.22764,-0.257571,-0.276263,-0.285295,-0.312863,-0.180136,0.0
2,-0.056799,-0.282218,-0.489321,-0.279462,-0.277039,-0.335731,-0.191139,-0.323052,-0.265399,-0.322047,3.234997,-0.22764,-0.257571,-0.276263,-0.285295,-0.312863,-0.180136,0.0
3,0.020153,-0.282433,-0.489321,3.578299,-0.277039,-0.335731,-0.191139,-0.323052,-0.265399,-0.322047,-0.309119,-0.22764,-0.257571,-0.276263,-0.285295,-0.312863,-0.180136,0.0
4,-0.091777,1.343629,0.898095,-0.279462,-0.277039,-0.335731,-0.191139,-0.323052,-0.265399,3.105136,-0.309119,-0.22764,-0.257571,-0.276263,-0.285295,-0.312863,-0.180136,


In [29]:
#plt.scatter(raw_temp[raw_temp["is_fraud"] == 0]["amt"], raw_temp[raw_temp["is_fraud"] == 0]["city_pop"])

plt.scatter(raw_temp[raw_temp["is_fraud"] == 1]["amt"], raw_temp[raw_temp["is_fraud"] == 1]["city_pop"])
plt.legend(["Fraud", "Not Fraud"])
plt.xlabel("Amount")
plt.ylabel("Population")

plt.show()

KeyError: 'amt'

In [None]:

raw_temp = pd.concat([smote_X, smote_y], axis =1)


In [None]:
#plt.scatter(raw_temp[raw_temp["is_fraud"] == 0]["amt"], raw_temp[raw_temp["is_fraud"] == 0]["city_pop"])

plt.scatter(raw_temp[raw_temp["is_fraud"] == 1]["amt"], raw_temp[raw_temp["is_fraud"] == 1]["city_pop"])
plt.legend([ "Not Fraud", "Fraud"])
plt.xlabel("Amount")
plt.ylabel("Population")

plt.show()

# 7.) We want to compare oversampling, Undersampling and SMOTE across our 3 models (Logistic Regression, Logistic Regression Lasso and Decision Trees).

# Make a dataframe that has a dual index and 9 Rows.
# Calculate: Sensitivity, Specificity, Precision, Recall and F1 score. for out of sample data.
# Notice any patterns across perfomance for this model. Does one totally out perform the others IE. over/under/smote or does a model perform better DT, Lasso, LR?
# Choose what you think is the best model and why. test on Holdout

In [30]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
import pandas as pd

In [34]:
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
resampling_methods = {
    'over' : RandomOverSampler(),
    'under': RandomUnderSampler(),
    'smote':SMOTE()
}
modelconfigs = {
    'Log':LogisticRegression(),
    'Lasso':LogisticRegression(penalty = 'l1',C = 0.5, solver = "liblinear" ) ,
    'DecisionTree':DecisionTreeClassifier()
}

trained_models = {}
results_df = pd.DataFrame(columns=['Sensitivity', 'Specificity', 'Precision', 'Recall', 'F1 Score'])

for resample_key,resampler in resampling_methods.items():
    resample_X, resample_y = resampler.fit_resample(X_train, y_train)

    for model_name, model in modelconfigs.items():
        combined_key = f"{resample_key}_{model_name}"
        trained_models[combined_key] = model.fit(resample_X,resample_y)
    
        y_pred = trained_models[combined_key].predict(X_test)

        tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
        sensitivity = tp / (tp + fn)
        specificity = tn / (tn + fp)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)

        # Store results in the DataFrame
        results_df.loc[combined_key] = [sensitivity, specificity, precision, recall, f1]
                                        
print(results_df)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


                    Sensitivity  Specificity  Precision    Recall  F1 Score
over_Log               0.727829     0.934073   0.041667  0.727829  0.078821
over_Lasso             0.727829     0.934097   0.041681  0.727829  0.078847
over_DecisionTree      0.550459     0.998458   0.584416  0.550459  0.566929
under_Log              0.727829     0.929942   0.039306  0.727829  0.074585
under_Lasso            0.727829     0.932543   0.040760  0.727829  0.077198
under_DecisionTree     0.954128     0.948875   0.068466  0.954128  0.127764
smote_Log              0.727829     0.933712   0.041449  0.727829  0.078431
smote_Lasso            0.727829     0.933760   0.041478  0.727829  0.078483
smote_DecisionTree     0.730887     0.992533   0.278231  0.730887  0.403035


In [33]:
y_pred = trained_models['over_DecisionTree'].predict(X_holdout)
tn, fp, fn, tp = confusion_matrix(y_test, y_holdout).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
precision = precision_score(y_test, y_holdout)
recall = recall_score(y_test, y_holdout)
f1 = f1_score(y_test, y_holdout)
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Sensitivity: 0.0122
Specificity: 0.9962
Precision: 0.0124
Recall: 0.0122
F1 Score: 0.0123


From the provided results, it appears that the Decision Tree model with over resampling consistently outperforms other combinations in terms of F1 score, offering a good balance between correctly identifying positive cases and avoiding false positives. Logistic Regression and Lasso Regression models show similar performance patterns across different resampling methods, with relatively lower sensitivity and precision. 