In [4]:
!pip install imbalanced-learn



In [35]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score

In [7]:
data = pd.read_csv('Data-PR-As2\creditcard.csv')

X = data.drop(columns = ['Time','Amount','Class'])
y = data['Class']

print(f"Original class distribution: {Counter(y)}")


Original class distribution: Counter({0: 284315, 1: 492})


In [21]:
smote = SMOTE(sampling_strategy = 'auto', k_neighbors = 5, random_state = 12) 
X_balanced, y_balanced = smote.fit_resample(X, y)

print(f"Shape of X_balanced: {X_balanced.shape}")
print(f"Length of y_balanced: {len(y_balanced)}")

print(f"Balanced class distribution: {Counter(y_balanced)}")

Shape of X_balanced: (568630, 28)
Length of y_balanced: 568630
Balanced class distribution: Counter({0: 284315, 1: 284315})


In [30]:
# Split into train / test (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size = 0.2, stratify = y_balanced, random_state = 12)

num_items_X_train = X_train.shape[0]
print(num_items_X_train)

# Split train set into labled / unlabeled (30-70)
X_train_lab, X_train_unlab, y_train_lab, _ = train_test_split(X_train, y_train, test_size=0.7, stratify=y_train, random_state=12)

print(f"Train set class distribution: {Counter(y_train)}")
print(f"Test set class distribution: {Counter(y_test)}")

print(f"Labeled training set shape: {X_train_lab.shape}")
print(f"Unlabeled training set shape: {X_train_unlab.shape}")
print(f"Labeled training set class distribution: {Counter(y_train_lab)}")

454904
Train set class distribution: Counter({0: 227452, 1: 227452})
Test set class distribution: Counter({1: 56863, 0: 56863})
Labeled training set shape: (136471, 28)
Unlabeled training set shape: (318433, 28)
Labeled training set class distribution: Counter({0: 68236, 1: 68235})


In [32]:
# Random forest model

rf = RandomForestClassifier()

param_grid = { 
    'n_estimators': [200, 500],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
GS_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)

GS_rf.fit(X_train, y_train)
GS_rf.best_params_

rf.fit(X_train, y_train)

KeyboardInterrupt: 

In [34]:
rf = RandomForestClassifier()

param_grid = { 
    'n_estimators': [50, 100],      
    'max_depth': [4, 6, 8],          
    'criterion': ['gini']          
}
GS_rf = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 3)

GS_rf.fit(X_train, y_train)
GS_rf.best_params_

rf.fit(X_train, y_train)

In [37]:
print(GS_rf.best_params_)

y_pred = rf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

{'criterion': 'gini', 'max_depth': 8, 'n_estimators': 100}
Accuracy: 0.9998329317834093


In [None]:
with open('semi_supervised_rf.pkl', 'wb') as file: 
    pickle.dump(rf, file) 