In [4]:
!pip install imbalanced-learn



In [4]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.semi_supervised import LabelPropagation

In [5]:
# Load the credit-card transactions. A forward slash is used as the path
# separator: it works on every OS and avoids the invalid "\c" escape sequence
# that the backslash form put inside a normal string literal.
data = pd.read_csv('Data-PR-As2/creditcard.csv')

# Features are every column except 'Time', 'Amount' and the 'Class' target.
X = data.drop(columns=['Time', 'Amount', 'Class'])
y = data['Class']

print(f"Original class distribution: {Counter(y)}")


Original class distribution: Counter({0: 284315, 1: 492})


In [6]:
# Oversample with SMOTE (5 nearest neighbours, fixed seed) and report the
# resulting shapes and class counts.
# NOTE(review): the train/test split in the next cell is drawn from X_balanced,
# so the held-out set will contain SMOTE-synthesised rows. Consider splitting
# first and resampling only the training portion -- confirm this is intended.
smote = SMOTE(
    sampling_strategy='auto',
    k_neighbors=5,
    random_state=12,
)
X_balanced, y_balanced = smote.fit_resample(X, y)

print(f"Shape of X_balanced: {X_balanced.shape}")
print(f"Length of y_balanced: {len(y_balanced)}")
print(f"Balanced class distribution: {Counter(y_balanced)}")

Shape of X_balanced: (568630, 28)
Length of y_balanced: 568630
Balanced class distribution: Counter({0: 284315, 1: 284315})


In [7]:
# Hold out 20% of the balanced data as the test set, stratified on the label.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    test_size=0.2,
    stratify=y_balanced,
    random_state=12,
)

num_items_X_train = X_train.shape[0]
print(num_items_X_train)

# Split the training portion again: 30% stays labeled, 70% becomes the
# unlabeled pool (stratified on the label, same seed).
X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = train_test_split(
    X_train,
    y_train,
    test_size=0.7,
    stratify=y_train,
    random_state=12,
)

# Class balance of the outer split.
print(f"Train set class distribution: {Counter(y_train)}")
print(f"Test set class distribution: {Counter(y_test)}")

# Sizes of the labeled / unlabeled subsets and balance of the labeled one.
print(f"Labeled training set shape: {X_train_lab.shape}")
print(f"Unlabeled training set shape: {X_train_unlab.shape}")
print(f"Labeled training set class distribution: {Counter(y_train_lab)}")

454904
Train set class distribution: Counter({0: 227452, 1: 227452})
Test set class distribution: Counter({1: 56863, 0: 56863})
Labeled training set shape: (136471, 28)
Unlabeled training set shape: (318433, 28)
Labeled training set class distribution: Counter({0: 68236, 1: 68235})


In [8]:
# Baseline: a supervised random forest fitted on the labeled subset only.
# random_state is pinned to 12 (the seed used for SMOTE and both splits) so the
# forest, and the metrics reported for it, are reproducible from run to run.
rf = RandomForestClassifier(n_estimators=100, random_state=12)
rf.fit(X_train_lab, y_train_lab)

In [9]:

# Score the baseline forest on the held-out test split.
y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Accuracy:", accuracy)
print("F1 Score:", f1)

Accuracy: 0.999621898246663
F1 Score: 0.9996220212195529


In [10]:
# Persist the fitted forest `rf` to disk with pickle.
# NOTE(review): the file name says "semi_supervised", but `rf` is the forest
# fitted in the baseline cell -- confirm the name is intended.
with open('semi_supervised_rf.pkl', 'wb') as model_file:
    pickle.dump(rf, model_file)

In [11]:
# Semi-supervised model: LabelPropagation fitted on labeled + unlabeled rows.

# Mark every sample of the unlabeled pool with -1 (scikit-learn's marker for
# "no label").
# NOTE(review): this overwrites y_train_unlab in place, so the true labels of
# that subset are lost until the split cell is re-run.
y_train_unlab[:] = -1

# Stack labeled rows first, then unlabeled rows; the label vector is built in
# the same order so rows and labels stay aligned.
X_train_semi_sup = np.vstack((X_train_lab, X_train_unlab))
y_train_semi_sup = np.concatenate((y_train_lab, y_train_unlab))

semi_sup_model = LabelPropagation(kernel='knn', n_neighbors=5)
semi_sup_model.fit(X_train_semi_sup, y_train_semi_sup)

# Evaluate on the same held-out test split used for the baseline.
y_pred = semi_sup_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# f1_score is called without an `average=` argument, i.e. with its default.
f1 = f1_score(y_test, y_pred)
print("F1 Score:", f1)




Accuracy: 0.992288482844732
F1 Score: 0.992247787923521


  probabilities /= normalizer


In [14]:
# Tally the labels of the combined training vector; -1 entries are the rows
# that were marked as unlabeled.
class_distribution = Counter(
    y_train_semi_sup.flatten()
)
print(
    "Class distribution in semi-supervised training set:",
    class_distribution,
)


Class distribution in semi-supervised training set: Counter({-1: 318433, 0: 68236, 1: 68235})


array([[-0.73123684,  0.85611357,  1.19792821, ..., -0.51056082,
         0.03167188,  0.0630186 ],
       [ 2.00928416, -2.17890035,  0.1816708 , ..., -0.09103516,
         0.04981473, -0.01081575],
       [-4.04133407, -0.82254664, -3.76910332, ..., -0.16122144,
         0.98661017,  0.43301891],
       ...,
       [-0.70658837,  0.43599596,  2.67738832, ..., -0.81429746,
        -0.06350236, -0.15405208],
       [-0.11452743,  1.08790915, -0.66134505, ...,  0.12877101,
         0.32532032,  0.13039883],
       [ 1.20473729, -0.20808131, -0.15291472, ..., -0.43926668,
         0.00766419,  0.01655758]])