In [2]:
!pip install imbalanced-learn
# https://machinelearningmastery.com/semi-supervised-learning-with-label-propagation/ 



In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
import pickle
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import classification_report, accuracy_score, f1_score, roc_curve, auc
from sklearn.semi_supervised import LabelPropagation

In [4]:
data = pd.read_csv('Data-PR-As2\creditcard.csv')

X = data.drop(columns = ['Time','Amount','Class'])
y = data['Class']

print(f"Original class distribution: {Counter(y)}")


Original class distribution: Counter({0: 284315, 1: 492})


In [5]:
# https://www.turing.com/kb/smote-for-an-imbalanced-dataset 

smote = SMOTE(sampling_strategy = 'auto', k_neighbors = 5, random_state = 12) 
X_balanced, y_balanced = smote.fit_resample(X, y)

print(f"Balanced class distribution: {Counter(y_balanced)}")

Shape of X_balanced: (568630, 28)
Length of y_balanced: 568630
Balanced class distribution: Counter({0: 284315, 1: 284315})


In [6]:
# Split into train / test (80-20)
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size = 0.2, stratify = y_balanced, random_state = 12)

num_items_X_train = X_train.shape[0]
print(num_items_X_train)

# Split train set into labled / unlabeled (30-70)
X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = train_test_split(X_train, y_train, test_size=0.7, stratify=y_train, random_state=12)

print(f"Train set class distribution: {Counter(y_train)}")
print(f"Test set class distribution: {Counter(y_test)}")

print(f"Labeled training set shape: {X_train_lab.shape}")
print(f"Unlabeled training set shape: {X_train_unlab.shape}")
print(f"Labeled training set class distribution: {Counter(y_train_lab)}")

454904
Train set class distribution: Counter({0: 227452, 1: 227452})
Test set class distribution: Counter({1: 56863, 0: 56863})
Labeled training set shape: (136471, 28)
Unlabeled training set shape: (318433, 28)
Labeled training set class distribution: Counter({0: 68236, 1: 68235})


In [7]:
# Baseline RF classifier

rf = RandomForestClassifier(n_estimators=100, random_state=12)
rf.fit(X_train_lab, y_train_lab)

In [8]:

y_pred = rf.predict(X_test)
accuracy_baseline = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_baseline)

f1_baseline = f1_score(y_test, y_pred)
print("F1 Score:", f1_baseline)

Accuracy: 0.9996306913106942
F1 Score: 0.9996308146689639


In [9]:
with open('semi_supervised_rf.pkl', 'wb') as file: 
    pickle.dump(rf, file) 

In [10]:
# semi-supervised model - step 3
# https://medium.com/geekculture/semi-supervised-learning-label-propagation-for-classification-1963439531cb

y_train_unlab[:] = -1

X_train_semi_sup = np.vstack((X_train_lab, X_train_unlab))
y_train_semi_sup = np.concatenate((y_train_lab, y_train_unlab))

semi_sup_model = LabelPropagation(kernel='knn')
semi_sup_model.fit(X_train_semi_sup, y_train_semi_sup)

y_pred = semi_sup_model.predict(X_test)


accuracy_semi_sup = accuracy_score(y_test, y_pred)
f1_semi_sup = f1_score(y_test, y_pred)  

print("Accuracy:", accuracy_semi_sup)
print("F1 Score:", f1_semi_sup)




Accuracy: 0.9978456993123824
F1 Score: 0.9978481779776386


  probabilities /= normalizer


In [11]:
class_distribution = Counter(y_train_semi_sup.flatten())
print("Class distribution in semi-supervised training set:", class_distribution)


Class distribution in semi-supervised training set: Counter({-1: 318433, 0: 68236, 1: 68235})


In [12]:
# step 4

retrieved_labels = semi_sup_model.transduction_

retrained_baseline = RandomForestClassifier(n_estimators=100, random_state=12)
retrained_baseline.fit(X_train, retrieved_labels)

y_pred_retrained = retrained_baseline.predict(X_test)

accuracy_retrained_rf = accuracy_score(y_test, y_pred_retrained)
f1_retrained_rf = f1_score(y_test, y_pred_retrained)

print("New Random Forest Accuracy:", accuracy_retrained_rf)
print("New Random Forest F1 Score:", f1_retrained_rf)


New Random Forest Accuracy: 0.5046251516803545
New Random Forest F1 Score: 0.5010671744232387


In [16]:
# 1. Supervised baseline
y_prob_baseline = rf.predict_proba(X_test)[:, 1]  # Probability of positive class

# 2. Semi-supervised model
y_prob_semi_sup = semi_sup_model.predict_proba(X_test)[:, 1]  # Probability of positive class

# 3. Retrained baseline with transduced labels
y_prob_retrained = retrained_baseline.predict_proba(X_test)[:, 1]  # Probability of positive class

# Calculate ROC curves
fpr_baseline, tpr_baseline, _ = roc_curve(y_test, y_prob_baseline)
fpr_semi_sup, tpr_semi_sup, _ = roc_curve(y_test, y_prob_semi_sup)
fpr_retrained, tpr_retrained, _ = roc_curve(y_test, y_prob_retrained)

# Calculate AUCs
auc_baseline = auc(fpr_baseline, tpr_baseline)
auc_semi_sup = auc(fpr_semi_sup, tpr_semi_sup)
auc_retrained = auc(fpr_retrained, tpr_retrained)

# Plotting ROC Curves
plt.figure(figsize=(10, 8))
plt.plot(fpr_baseline, tpr_baseline, label=f'Supervised Baseline (AUC = {auc_baseline:.2f})', color='blue')
plt.plot(fpr_semi_sup, tpr_semi_sup, label=f'Semi-Supervised Model (AUC = {auc_semi_sup:.2f})', color='orange')
plt.plot(fpr_retrained, tpr_retrained, label=f'Retrained Baseline (AUC = {auc_retrained:.2f})', color='green')

# Plot settings
plt.plot([0, 1], [0, 1], 'k--')  # Diagonal line
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves for Supervised Baseline, Semi-Supervised Model, and Retrained Baseline')
plt.legend(loc="lower right")
plt.show()


  probabilities /= normalizer


ValueError: Input contains NaN.