In [None]:
!pip install imbalanced-learn



In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Relative path of the CSV file that holds the transactions.
file_name = 'creditcard.csv'

try:
    df = pd.read_csv(file_name)
except Exception:
    # Every later cell needs `df`, so report the failure and re-raise instead
    # of swallowing it. The traceback then shows the real cause (missing file,
    # malformed CSV, ...) rather than a NameError in a later cell.
    print("error loading dataset.")
    raise
else:
    # Only reached when the read succeeded: confirm and preview the first rows.
    print("dataset loaded successfully!")
    print(df.head())

dataset loaded successfully!
   Time        V1        V2        V3        V4        V5        V6        V7  \
0     0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1     0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2     1 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3     1 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4     2 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

   

In [None]:
# Structural overview of the frame (df.info() prints its own report).
print("\ndataset info")
df.info()

# Distribution of the target labels, first as raw row counts per label ...
print("\nclass distribution")
class_counts = df['Class'].value_counts()
print(class_counts)

# ... then as relative frequencies scaled from fractions to percent.
print("\nas a percentage:")
class_percentages = df['Class'].value_counts(normalize=True) * 100
print(class_percentages)


dataset info
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39999 entries, 0 to 39998
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    39999 non-null  int64  
 1   V1      39999 non-null  float64
 2   V2      39999 non-null  float64
 3   V3      39999 non-null  float64
 4   V4      39999 non-null  float64
 5   V5      39999 non-null  float64
 6   V6      39999 non-null  float64
 7   V7      39999 non-null  float64
 8   V8      39999 non-null  float64
 9   V9      39999 non-null  float64
 10  V10     39999 non-null  float64
 11  V11     39999 non-null  float64
 12  V12     39999 non-null  float64
 13  V13     39999 non-null  float64
 14  V14     39999 non-null  float64
 15  V15     39999 non-null  float64
 16  V16     39999 non-null  float64
 17  V17     39999 non-null  float64
 18  V18     39999 non-null  float64
 19  V19     39999 non-null  float64
 20  V20     39999 non-null  float64
 21  V21     39999 non-nul

In [None]:
# Model inputs (X): every column except the label 'Class' and 'Time'.
X = df.drop(columns=['Class', 'Time'])

# Prediction target (y): the 'Class' column on its own.
y = df['Class']

# Sanity check: X and y should report the same number of rows.
print(f"shape of features (X): {X.shape}")
print(f"shape of target (y): {y.shape}")

shape of features (X): (39999, 29)
shape of target (y): (39999,)


In [None]:
# 80/20 train/test split.
#   test_size=0.2   -> 20% of the rows are held out for testing
#   stratify=y      -> the class distribution is preserved in both partitions
#   random_state=42 -> the same split is produced on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Show the label proportions of each partition so the stratification can be
# checked by eye.
print("training set distribution")
print(y_train.value_counts(normalize=True))

print("\ntest set distribution")
print(y_test.value_counts(normalize=True))

training set distribution
Class
0    0.997406
1    0.002594
Name: proportion, dtype: float64

test set distribution
Class
0    0.997375
1    0.002625
Name: proportion, dtype: float64


In [None]:
print("training baseline rf model")

# Baseline: a random forest with library-default settings apart from the fixed
# seed. fit() returns the fitted estimator itself, so construction and training
# are chained into one expression.
baseline_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

print("baseline model training complete!! woo")

training baseline rf model
baseline model training complete!! woo


In [None]:
print("we evaluating baseline model!")

# Display names for the report rows, given in label order (0 first, then 1).
class_names = ['Class 0 (Non-Fraud)', 'Class 1 (Fraud)']

# Predicted labels for the held-out test rows.
y_pred_baseline = baseline_model.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
print("\nbaseline model conf matrix:")
baseline_matrix = confusion_matrix(y_test, y_pred_baseline)
print(baseline_matrix)

# Per-class precision / recall / f1 plus the aggregate averages.
print("\nbaseline model classification report:")
baseline_report = classification_report(
    y_test,
    y_pred_baseline,
    target_names=class_names,
)
print(baseline_report)

we evaluating baseline model!

baseline model conf matrix:
[[7978    1]
 [   5   16]]

baseline model classification report:
                     precision    recall  f1-score   support

Class 0 (Non-Fraud)       1.00      1.00      1.00      7979
    Class 1 (Fraud)       0.94      0.76      0.84        21

           accuracy                           1.00      8000
          macro avg       0.97      0.88      0.92      8000
       weighted avg       1.00      1.00      1.00      8000



In [None]:
print("training balanced rf model using class_weight='balanced'")

# Same random forest and seed as the baseline; the only difference is
# class_weight='balanced', which reweights the classes during training.
balanced_model = RandomForestClassifier(
    class_weight='balanced',
    random_state=42,
)

# Fit on the same training partition that the baseline model used.
balanced_model.fit(X_train, y_train)

print("balanced model training complete! yay!!")

training balanced rf model using class_weight='balanced'
balanced model training complete! yay!!


In [None]:
# Evaluate the class-weighted model on the held-out test set.
# The banner previously said "baseline" (copy-paste slip) although this cell
# scores `balanced_model`; it now names the model that is actually evaluated.
print("evaluating balanced model time!")

# Predicted labels for the test rows.
y_pred_balanced = balanced_model.predict(X_test)

# Confusion matrix: rows are the true labels, columns the predicted labels.
print("\nbalanced model confusion matrix:")
print(confusion_matrix(y_test, y_pred_balanced))

# Per-class precision / recall / f1; target_names are given in label order
# (0 first, then 1).
print("\nbalanced model classification report:")
print(classification_report(y_test, y_pred_balanced, target_names=['Class 0 (Non-Fraud)', 'Class 1 (Fraud)']))

evaluating baseline model time!

balanced model confusion matrix:
[[7978    1]
 [   3   18]]

balanced model classification report:
                     precision    recall  f1-score   support

Class 0 (Non-Fraud)       1.00      1.00      1.00      7979
    Class 1 (Fraud)       0.95      0.86      0.90        21

           accuracy                           1.00      8000
          macro avg       0.97      0.93      0.95      8000
       weighted avg       1.00      1.00      1.00      8000

