In [22]:
import torch
import numpy as np
import random

random.seed(42)  
np.random.seed(42) 
torch.manual_seed(42)  
torch.cuda.manual_seed_all(42)  


In [23]:
import os
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, roc_auc_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)


Using device: cpu


In [24]:
df = pd.read_csv("/Users/tasosbliagkos/Documents/KD_project/creditcard.csv")

print("Shape:", df.shape)
df.head()


Shape: (284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [25]:
print(df.info())

print("\nClass value counts:")
print(df["Class"].value_counts())
print("\nClass distribution (%):")
print(df["Class"].value_counts(normalize=True) * 100)

print("\nDescribe numeric columns:")
df.describe()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


31 rows opou to prwto einai to time, o xronos se deuterolepta metaksu ths prwths sunallaghs sto dataset. meta exoume tis V1-V28 opou einai oi sthles pou proekupsan apo PCA kai einai ta features twn sunallagwn. kai telos exoume to class pou einai eite 1 eite 0. to 0 shmainei legit sunallagh enw to 1 fraud.

In [26]:
print("NaN in Class:", df["Class"].isna().sum())

print("Unique values in Class:", df["Class"].unique())


NaN in Class: 0
Unique values in Class: [0 1]


In [27]:
df = df[df["Class"].notna()]

df["Class"] = df["Class"].astype(int)

print("After cleaning:")
print("NaN in Class:", df["Class"].isna().sum())
print(df["Class"].value_counts())


After cleaning:
NaN in Class: 0
Class
0    284315
1       492
Name: count, dtype: int64


In [28]:
print(df.shape)


(284807, 31)


Εδώ κόβουμε το dataset σε τρία κομμάτια. Το train είναι αυτό με το οποίο θα εκπαιδεύσουμε τον teacher. Το validation το χρησιμοποιούμε για να παρακολουθούμε τι κάνει κατά τη διάρκεια του training. Το test το κρατάμε για το τέλος, για τελική αξιολόγηση. Το stratify=y κρατάει παρόμοιο fraud ratio σε όλα τα split. Δηλαδη: πόσες συναλλαγές είναι fraud / πόσες συνολικά; Στο dataset μας ειναι 0.17% Χωρις το stratification θα ειχαμε π.χ. 0.3% στο train, 0.1% στο validation, 0.5 sto test που ειναι απαγορευτικο και θα μας εβγαζε λαθος αφου το μοντέλο εκπαιδεύεται σε άλλη κατανομή, αξιολογείται σε άλλη, τα metrics δεν είναι συγκρίσιμα. Συγκεκριμενα βλεπουμε απο το output οτι τα δικα μας δεδομενα ειναι:

Fraud ratio train: 0.0017254870488152324


Fraud ratio val: 0.0017321691907960957


Fraud ratio test: 0.0017321286456626562

In [29]:
X = df.drop(columns=["Class"]).values
y = df["Class"].values

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)


X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp
)

print("Train:", X_train.shape)
print("Val:", X_val.shape)
print("Test:", X_test.shape)

print("Fraud ratio train:", y_train.mean())
print("Fraud ratio val:", y_val.mean())
print("Fraud ratio test:", y_test.mean())


Train: (199364, 30)
Val: (42721, 30)
Test: (42722, 30)
Fraud ratio train: 0.0017254870488152324
Fraud ratio val: 0.0017321691907960957
Fraud ratio test: 0.0017321286456626562


Εδώ κανονικοποιούμε όλα τα features ώστε να έχουν παρόμοια κλίμακα. Αυτό βοηθάει πολύ τα νευρωνικά να εκπαιδευτούν σταθερά, ειδικά επειδή το Amount και το Time μπορεί να είναι σε άλλη κλίμακα από τα PCA components.


Χωρίς scaling:

το Amount και το Time θα “σκεπάζουν” τα PCA features

το μοντέλο θα βασίζεται υπερβολικά σε αυτά

το KD αργότερα θα αποστάζει λάθος patterns

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_val   = scaler.transform(X_val)
X_test  = scaler.transform(X_test)

edw sthn ousia allazw ta numpy arrays se batches kai tensors wste na xrhsimopoihthoun sto neurwniko mou diktuo. ftiaxnoume nea datasets ta opoia mesa periexoun ta features kai ta labels. Ta dataloaders ta opoia xrhsimopoioume gia na paroume ta dedomena se batches twn 512 kai 1024 gia ta val kai test modes.

In [31]:
import torch
from torch.utils.data import Dataset, DataLoader

class FraudDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


train_ds = FraudDataset(X_train, y_train)
val_ds   = FraudDataset(X_val, y_val)
test_ds  = FraudDataset(X_test, y_test)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=False)
val_loader   = DataLoader(val_ds, batch_size=1024, shuffle=False)
test_loader  = DataLoader(test_ds, batch_size=1024, shuffle=False)

len(train_ds), len(val_ds), len(test_ds)


(199364, 42721, 42722)

Edw exoume ftiaksei to basiko neurwna tou teacher model mas opou to input (30) tha pernaei apo 256 neurwnes sthn arxh, 128 meta kai 64 sto telos kai kathe fora tha efarmozetai RELU

In [32]:
import torch.nn as nn
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

input_dim = X_train.shape[1] #30 inputs, sthn ousia kathe sunallagh monh ths.
print("Input dim:", input_dim)

class MLP(nn.Module):
    def __init__(self, input_dim, hidden_dims, dropout_rate=0.5):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers.append(nn.Linear(prev, h))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))  # evala dropout gia na mhn kanei overfit to modelo, apenergopoiei tuxaia neurwnes prokeimenou na mhn katalhksoun kapoioi neurwnes ta basizontai polu se allous.
            prev = h
        layers.append(nn.Linear(prev, 2))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


#phga apo 256-128-64 se 128-64-32
teacher = MLP(input_dim, hidden_dims=[128, 64, 32], dropout_rate=0.5).to(device)

teacher


Using device: cpu
Input dim: 30


MLP(
  (net): Sequential(
    (0): Linear(in_features=30, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Dropout(p=0.5, inplace=False)
    (9): Linear(in_features=32, out_features=2, bias=True)
  )
)

εδω εχουμε τον κωδικα του teacher που περιεχει class weights οπου το μοντελο στην ουσια η κλαση με τα λιγοτερα δειγματα fraud ειναι αυτη που τιμωρει περισσοτερο το μοντελο σε περιπτωση λαθους 

εχουμε υλοποιησει εναν adam optimizer o opoios einai algorithmos pou prosarmozei to vhma mathishs kai bohtha sth sugklisi twn dedomenwn. 

kai L2 regularization pou vohthaei sto overfitting 



In [33]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

classes = np.array([0, 1])
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=classes,
    y=y_train
)
class_weights = torch.tensor(class_weights, dtype=torch.float32).to(device)
print("Class weights:", class_weights)


def train_teacher(model, train_loader, val_loader, epochs=5, lr=1e-4, patience=3):
   
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4, weight_decay=1e-5)  # Προσθήκη weight decay για L2 regularization
    criterion = nn.CrossEntropyLoss(weight=class_weights)
    best_val_loss = float('inf')
    epochs_without_improvement = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)

            optimizer.zero_grad()
            logits = model(xb)
            loss = criterion(logits, yb)
            loss.backward()
            optimizer.step()

            total_loss += loss.item() * xb.size(0)

        avg_train_loss = total_loss / len(train_loader.dataset)

        model.eval()
        total_val_loss = 0.0
        correct, total = 0, 0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(device), yb.to(device)
                logits = model(xb)
                loss = criterion(logits, yb)
                total_val_loss += loss.item() * xb.size(0)

                preds = logits.argmax(dim=1)
                correct += (preds == yb).sum().item()
                total += yb.size(0)

        avg_val_loss = total_val_loss / len(val_loader.dataset)
        val_acc = correct / total if total > 0 else 0.0

        print(f"Epoch {epoch+1}: train_loss={avg_train_loss:.4f}, val_loss={avg_val_loss:.4f}, val_acc={val_acc:.4f}")
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            epochs_without_improvement = 0  
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                print("Early stopping triggered!")
                break 


Class weights: tensor([  0.5009, 289.7733])


parathroume ooti to baros sto class 0 (non fraud) einai 0.5009 enw sto class 1 (fraud) einai 289.7733 to opoio einai thetiko shmadi kathws to modelo mas dinei megaluterh varutita stis fraud sunallages ap oti stis legit.

Εδώ κάνουμε δύο πράγματα. Πρώτον, υπολογίζουμε class_weights για να δώσουμε πολύ μεγαλύτερο βάρος στα fraud samples, επειδή είναι πολύ λίγα. Αυτό μπαίνει μέσα στο CrossEntropyLoss ώστε το loss να “πονάει” περισσότερο όταν κάνουμε λάθος στην fraud κλάση. Δεύτερον, γράφουμε τον training loop του teacher: για κάθε epoch, περνάει όλα τα batches, κάνει forward → loss → backward → optimizer step και στο τέλος υπολογίζει validation accuracy για να δούμε αν βελτιώνεται.
Adam optimizer

In [34]:
train_teacher(teacher, train_loader, val_loader, epochs=15, lr=1e-4)


Epoch 1: train_loss=0.5977, val_loss=0.4849, val_acc=0.9990
Epoch 2: train_loss=0.4524, val_loss=0.3382, val_acc=0.9989
Epoch 3: train_loss=0.3028, val_loss=0.2602, val_acc=0.9975
Epoch 4: train_loss=0.2445, val_loss=0.2360, val_acc=0.9945
Epoch 5: train_loss=0.2095, val_loss=0.2292, val_acc=0.9906
Epoch 6: train_loss=0.1775, val_loss=0.2323, val_acc=0.9902
Epoch 7: train_loss=0.1961, val_loss=0.2282, val_acc=0.9891
Epoch 8: train_loss=0.1593, val_loss=0.2328, val_acc=0.9898
Epoch 9: train_loss=0.1581, val_loss=0.2324, val_acc=0.9899
Epoch 10: train_loss=0.1711, val_loss=0.2260, val_acc=0.9893
Epoch 11: train_loss=0.1506, val_loss=0.2269, val_acc=0.9894
Epoch 12: train_loss=0.1545, val_loss=0.2254, val_acc=0.9890
Epoch 13: train_loss=0.1332, val_loss=0.2237, val_acc=0.9881
Epoch 14: train_loss=0.1293, val_loss=0.2246, val_acc=0.9881
Epoch 15: train_loss=0.1361, val_loss=0.2242, val_acc=0.9889


Μετράει πόσο λάθος έκανε ο Δάσκαλος πάνω στα δεδομένα που χρησιμοποιεί για να εκπαιδευτεί.

Μετράει το λάθος του μοντέλου σε δεδομένα που δεν έχει ξαναδεί ποτέ (Validation Set). Είναι σαν ένα "τεστ προσομοίωσης".

Το ποσοστό των σωστών προβλέψεων (Legit ως Legit και Fraud ως Fraud) πάνω στο Validation Set.

Η σταθερή μείωση δείχνει ότι το μοντέλο εκπαιδεύεται ομαλά και ότι δεν αντιμετωπίζει προβλήματα όπως vanishing gradients ή overfitting.

Η validation accuracy δείχνει πόσο καλά το μοντέλο γενικεύει στα δεδομένα που δεν έχει ξαναδεί.

Αναγκαστικά άλλαξα τα layers απο 256-128-64 σε 128-64-32 για να μην εχουμε overfitting στο μοντελο πραγμα που φαινεται απο το val_loss το οποιο παραμενει κατα βαση σταθερο.

In [35]:
from sklearn.metrics import classification_report, roc_auc_score

def evaluate(model, loader):
    model.eval()
    all_y = []
    all_p = []
    with torch.no_grad():
        for xb, yb in loader:
            xb = xb.to(device)
            logits = model(xb)
            probs = F.softmax(logits, dim=1)[:, 1].cpu().numpy()  # Αποθήκευση των probabilities για την κλάση fraud
            all_p.extend(probs)
            all_y.extend(yb.numpy())

    preds = [1 if p > 0.95 else 0 for p in all_p]
    print(classification_report(all_y, preds, digits=4))
    try:
        print("ROC AUC:", roc_auc_score(all_y, all_p))
    except ValueError:
        print("ROC AUC: cannot compute")

print("Teacher performance on test set:")
evaluate(teacher, test_loader)


Teacher performance on test set:
              precision    recall  f1-score   support

           0     0.9997    0.9996    0.9996     42648
           1     0.7692    0.8108    0.7895        74

    accuracy                         0.9993     42722
   macro avg     0.8845    0.9052    0.8945     42722
weighted avg     0.9993    0.9993    0.9993     42722

ROC AUC: 0.9752020943284307


NA UPOLOGISW POSO PERISSORERO XRONO KATANALWNEI OTAN TO THRESHOLD EINAI STO 0.95

Από όλες τις συναλλαγές που το μοντέλο βάφτισε ως απάτη, πόσες ήταν πραγματικά απάτες;

Από όλες τις πραγματικές απάτες που υπήρχαν στο dataset, πόσες κατάφερε το μοντέλο να βρει

Ποια είναι η συνολική ισορροπία μεταξύ Precision και Recall

Τον πραγματικό αριθμό των δειγμάτων (συναλλαγών) που ανήκουν σε κάθε κλάση στο συγκεκριμένο test set.


Το ROC AUC λέει πόσο καλή είναι η ποιότητα του διαχωρισμού που κάνει το μοντέλο σου συνολικά. Αν επιλέξω τυχαία μία πραγματική απάτη και μία νόμιμη συναλλαγή από το dataset, υπάρχει 97.21% πιθανότητα ο Teacher να δώσει υψηλότερο "σκορ απάτης" στην πραγματική απάτη.

ΠΡΟΧΩΡΑΩ ΣΤΟ STUDENT MODEL

Εδω εχουμε το student model που αποτελειται απο ενα νευρωνικο δικτυο με 30 εισοδους (προφανως) και εχει 2 κρυφα επιπεδα των 32 νευρωνων το πρωτο το οποιο συρικνωνεται σε 16 αργοτερα. Student είναι δηλαδή σημαντικά μικρότερος (λιγότερα layers και πολύ λιγότεροι νευρώνες ανά επίπεδο).

Θετω το dropout rate του student model στο 0.2 αντιθετα με το teacher model που ειναι στο 0.5. Αυτο συμβαίνει επειδη στο teacher ειναι πολυ μεγαλυτερο το μοντελο και εχει πολλους περισσοτερους νευρωνες με αποτελεσμα πολλοι απο αυτους πολλες φορες να σταματανε να μαθαινουν και να επηρεαζονται απο αλλους συγκεκριμενους νευρωνες και να προκαλουν overfitting.

In [36]:
student = MLP(input_dim, hidden_dims=[32, 16], dropout_rate=0.2).to(device)

print("Student Architecture:")
print(student)

Student Architecture:
MLP(
  (net): Sequential(
    (0): Linear(in_features=30, out_features=32, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.2, inplace=False)
    (3): Linear(in_features=32, out_features=16, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.2, inplace=False)
    (6): Linear(in_features=16, out_features=2, bias=True)
  )
)


Στη συνεχεια εχουμε την καρδια του KD οπου για αρχη μετραμε το loss που συγκρινει τις προβλεψεις του μαθητη με τις πραγματικες τιμες που ειναι αποθηκευμενες στα labels. Χρησιμοποιω τα class weights για να δινεται περισσοτερη βαση στις απατες πολυ περισσοτερο απ τις κανονικες συναλλαγες.

Στη συνεχεια εχω το soft loss. Το κανω αυτο επειδη θελω ο μαθητης να μαθαινει πραγματικα. ΔΗΛΑΔΗ, οσο εχω το Τ=5 αν ο καθηγητης εμπεριεχει τιμες που ειναι ακραιες οπως 0.99 για legit και 0.01 για fraud το μοντελο του μαθητη δεν θα μπορουσε να μαθει, ενω με το Τ=5 οι ακραιες αυτες τιμες γινονται πιο ομαλες, πχ στο 0.73 και στο 0.27.

Το KL Divergence ειναι ενα metric που χρησιμοποιω για να μετρησω ποσο πολυ διαφερουν οι αποψεις του μαθητη σε σχεση με τις αποψεις του καθηγητη, θελω ο μαθητης να πλησιαζει τις πιθανοτητες του καθηγητη

Με λιγα λογια σε αυτη τη συναρτηση βαζουμε τα θεμελια προκειμενου το μοντελο μαθητη να μη μαθαινει απλα ειναι απατη-ειναι legit και τελος. Μπαινει στη διαδικασια να δει για ποιο λογο εχει παρθει η αποφαση.

In [37]:
def loss_kd(outputs, labels, teacher_outputs, T=5.0, alpha=0.7):
    hard_loss = F.cross_entropy(outputs, labels, weight=class_weights)
    soft_targets = F.log_softmax(outputs / T, dim=1)
    soft_labels = F.softmax(teacher_outputs / T, dim=1)

    distillation_loss = F.kl_div(soft_targets, soft_labels, reduction='batchmean') * (T ** 2)
    return alpha * distillation_loss + (1. - alpha) * hard_loss

Πριν ξεκινήσει οτιδήποτε, θέτουμε τον Δάσκαλο σε eval() mode. Αυτό απενεργοποιεί το Dropout. Λέμε στην PyTorch να μην υπολογίζει κλίσεις για τον Δάσκαλο, εξοικονομώντας μνήμη, αφού ο Δάσκαλος δεν θα αλλάξει πια

In [38]:
def train_student(student_model, teacher_model, train_loader, val_loader, epochs=10, lr=1e-3):
    optimizer = torch.optim.Adam(student_model.parameters(), lr=lr)
    teacher_model.eval()
    
    for epoch in range(epochs):
        student_model.train()
        train_loss = 0.0
        
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            with torch.no_grad():
                teacher_logits = teacher_model(xb)
            
            student_logits = student_model(xb) #edw o mathitis koitazei ta 30 features kai prospathei na mantepsei an prokeitai gia apath. Sthn arxh oi apanthseis tou einai random.
            
            #edw ginetai to distillation. Lambanei shma apo ta dedomena (kata 30%) oti ekanes lathos px kai apo ton daskalo kata 70% prokeimenou na elegksei tis pithanothtes pou tou dinontai.
            loss = loss_kd(student_logits, yb, teacher_logits, T=5.0, alpha=0.7)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            train_loss += loss.item() * xb.size(0)
            
        avg_loss = train_loss / len(train_loader.dataset)
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")
train_student(student, teacher, train_loader, val_loader, epochs=15)

Epoch 1/15, Loss: 0.4779
Epoch 2/15, Loss: 0.1165
Epoch 3/15, Loss: 0.1035
Epoch 4/15, Loss: 0.0944
Epoch 5/15, Loss: 0.0891
Epoch 6/15, Loss: 0.0824
Epoch 7/15, Loss: 0.0775
Epoch 8/15, Loss: 0.0737
Epoch 9/15, Loss: 0.0694
Epoch 10/15, Loss: 0.0660
Epoch 11/15, Loss: 0.0631
Epoch 12/15, Loss: 0.0621
Epoch 13/15, Loss: 0.0592
Epoch 14/15, Loss: 0.0583
Epoch 15/15, Loss: 0.0535


In [39]:
print("Student performance on test set (Threshold 0.95):")
evaluate(student, test_loader)

Student performance on test set (Threshold 0.95):
              precision    recall  f1-score   support

           0     0.9996    0.9995    0.9996     42648
           1     0.7468    0.7973    0.7712        74

    accuracy                         0.9992     42722
   macro avg     0.8732    0.8984    0.8854     42722
weighted avg     0.9992    0.9992    0.9992     42722

ROC AUC: 0.9756623358023189


In [40]:
import time
import torch

X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

teacher.eval()
student.eval()

start_teacher = time.time()
with torch.no_grad():
    _ = teacher(X_test_tensor)
end_teacher = time.time()
teacher_latency = end_teacher - start_teacher

start_student = time.time()
with torch.no_grad():
    _ = student(X_test_tensor)
end_student = time.time()
student_latency = end_student - start_student

print(f"Teacher Inference Time: {teacher_latency:.4f}s")
print(f"Student Inference Time: {student_latency:.4f}s")
print(f"Speedup: {teacher_latency / student_latency:.2f}x")

Teacher Inference Time: 0.0123s
Student Inference Time: 0.0033s
Speedup: 3.72x


In [41]:
import numpy as np

def stable_benchmark(model, input_data, name, iterations=50):
    model.eval()
    with torch.no_grad():
        for _ in range(10):
            _ = model(input_data)
    
    times = []
    with torch.no_grad():
        for _ in range(iterations):
            start = time.time()
            _ = model(input_data)
            times.append(time.time() - start)
    
    avg_time = np.mean(times)
    std_time = np.std(times)
    
    print(f"{name} - Μέσος Χρόνος: {avg_time:.4f}s (+/- {std_time:.4f}s)")
    return avg_time

avg_teacher = stable_benchmark(teacher, X_test_tensor, "Teacher")
avg_student = stable_benchmark(student, X_test_tensor, "Student")

print(f"\n Average speedup: {avg_teacher / avg_student:.2f}")

Teacher - Μέσος Χρόνος: 0.0093s (+/- 0.0008s)
Student - Μέσος Χρόνος: 0.0027s (+/- 0.0002s)

 Average speedup: 3.45


In [45]:
import time
import torch
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score

scale_weight = (len(y_train) - sum(y_train)) / sum(y_train)

print("Training XGBoost...")
xgb_model = XGBClassifier(
    scale_pos_weight=scale_weight, 
    random_state=42, 
    use_label_encoder=False, 
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)

print("Training Random Forest...")
rf_model = RandomForestClassifier(
    class_weight='balanced', 
    random_state=42, 
    n_jobs=-1
)
rf_model.fit(X_train, y_train)


def full_benchmark(model, X_test, y_test, name, is_torch=False, threshold=0.95):
   
    start = time.time()
    
    if is_torch:
        model.eval()
        with torch.no_grad():
            
            inputs = torch.tensor(X_test, dtype=torch.float32).to(device)
            logits = model(inputs)
            
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    else:
       
        probs = model.predict_proba(X_test)[:, 1]
    
    end = time.time()
    latency = end - start
    

    preds = [1 if p > threshold else 0 for p in probs]
    

    precision, recall, f1, _ = precision_recall_fscore_support(y_test, preds, average='binary')
    

    auc = roc_auc_score(y_test, probs)
    
    return {
        "Model": name,
        "ROC AUC": auc,
        "F1-Score": f1,
        "Recall": recall,
        "Precision": precision,
        "Latency (s)": latency
    }
results = []

results.append(full_benchmark(teacher, X_test, y_test, "Teacher (Master)", is_torch=True))
results.append(full_benchmark(student, X_test, y_test, "Student (Distilled)", is_torch=True))

results.append(full_benchmark(xgb_model, X_test, y_test, "XGBoost", is_torch=False))
results.append(full_benchmark(rf_model, X_test, y_test, "Random Forest", is_torch=False, threshold=0.5))

df_final = pd.DataFrame(results)

print("\nFinal Comparison Table (Threshold 0.95 except for Random forest which uses 0.5):")
display(df_final.sort_values(by="F1-Score", ascending=False))

xgb_latency = df_final.loc[df_final['Model'] == 'XGBoost', 'Latency (s)'].values[0]
student_latency = df_final.loc[df_final['Model'] == 'Student (Distilled)', 'Latency (s)'].values[0]
print(f"\nStudent is {xgb_latency / student_latency:.2f}x faster than XGBoost")

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Training Random Forest...

Final Comparison Table (Threshold 0.95 except for Random forest which uses 0.5):


Unnamed: 0,Model,ROC AUC,F1-Score,Recall,Precision,Latency (s)
2,XGBoost,0.972354,0.867647,0.797297,0.951613,0.023554
3,Random Forest,0.930895,0.8125,0.702703,0.962963,0.044272
0,Teacher (Master),0.975202,0.789474,0.810811,0.769231,0.026468
1,Student (Distilled),0.975662,0.771242,0.797297,0.746835,0.003773



Student is 6.24x faster than XGBoost


In [46]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#neo paysim dataset 
df = pd.read_csv('paysim dataset.csv')
df = pd.get_dummies(df, columns=['type'], drop_first=True)

X = df.drop(['isFraud', 'isFlaggedFraud', 'nameOrig', 'nameDest'], axis=1)
y = df['isFraud']
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)
input_dim = X_train.shape[1]
class TeacherNetwork(nn.Module):
    def __init__(self, input_dim):
        super(TeacherNetwork, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 2)
        )
    def forward(self, x): return self.net(x)

class StudentNetwork(nn.Module):
    def __init__(self, input_dim):
        super(StudentNetwork, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 16),
            nn.ReLU(),
            nn.Linear(16, 2)
        )
    def forward(self, x): return self.net(x)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
teacher = TeacherNetwork(input_dim).to(device)
student = StudentNetwork(input_dim).to(device)

print(f"New PaySim Input Dimension: {input_dim} features")

New PaySim Input Dimension: 10 features


In [53]:
T = 2.0  
alpha = 0.5  
learning_rate = 1e-3
fraud_ratio = y_train.mean()
weights = torch.tensor([1.0, np.sqrt(1.0 / fraud_ratio)], dtype=torch.float32).to(device)
criterion_hard = nn.CrossEntropyLoss(weight=weights)

optimizer_teacher = optim.Adam(teacher.parameters(), lr=learning_rate)
optimizer_student = optim.Adam(student.parameters(), lr=learning_rate)
train_teacher(epochs=10)
train_student_kd(epochs=10)

Teacher Epoch 1, Avg Loss: 0.0375
Teacher Epoch 2, Avg Loss: 0.0333
Teacher Epoch 3, Avg Loss: 0.0319
Teacher Epoch 4, Avg Loss: 0.0321
Teacher Epoch 5, Avg Loss: 0.0323
Teacher Epoch 6, Avg Loss: 0.0309
Teacher Epoch 7, Avg Loss: 0.0308
Teacher Epoch 8, Avg Loss: 0.0298
Teacher Epoch 9, Avg Loss: 0.0292
Teacher Epoch 10, Avg Loss: 0.0289
Student KD Epoch 1, Avg Loss: 0.0199
Student KD Epoch 2, Avg Loss: 0.0165
Student KD Epoch 3, Avg Loss: 0.0157
Student KD Epoch 4, Avg Loss: 0.0152
Student KD Epoch 5, Avg Loss: 0.0151
Student KD Epoch 6, Avg Loss: 0.0146
Student KD Epoch 7, Avg Loss: 0.0146
Student KD Epoch 8, Avg Loss: 0.0150
Student KD Epoch 9, Avg Loss: 0.0145
Student KD Epoch 10, Avg Loss: 0.0143


In [59]:
from sklearn.metrics import precision_recall_curve
import numpy as np
import torch

def find_best_threshold(model, X_test, y_test, is_torch=True):
   
    if is_torch:
        model.eval() 
        with torch.no_grad():
            inputs = torch.tensor(X_test, dtype=torch.float32).to(device)
            logits = model(inputs)
            probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    else:
        probs = model.predict_proba(X_test)[:, 1]
    
 
    precisions, recalls, thresholds = precision_recall_curve(y_test, probs)
    
  
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_idx = np.argmax(f1_scores)
    

    return thresholds[best_idx], f1_scores[best_idx]


best_t_teacher, _ = find_best_threshold(teacher, X_test, y_test, is_torch=True)
best_t_student, _ = find_best_threshold(student, X_test, y_test, is_torch=True)
best_t_xgb, _ = find_best_threshold(xgb_model, X_test, y_test, is_torch=False)

print(f"Best Threshold for Teacher: {best_t_teacher:.4f}")
print(f"Best Threshold for Student: {best_t_student:.4f}")
print(f"Best Threshold for XGBoost: {best_t_xgb:.4f}")

Best Threshold for Teacher: 0.9315
Best Threshold for Student: 0.7835
Best Threshold for XGBoost: 0.8403


In [60]:
import numpy as np
from xgboost import XGBClassifier
scale_weight_tuned = np.sqrt((len(y_train) - sum(y_train)) / sum(y_train))

print(f"Retraining XGBoost with Tuned Scale Weight: {scale_weight_tuned:.2f}")

xgb_model = XGBClassifier(
    scale_pos_weight=scale_weight_tuned, 
    random_state=42, 
    use_label_encoder=False, 
    eval_metric='logloss',
    n_estimators=100, 
    max_depth=6
)

xgb_model.fit(X_train, y_train)

Retraining XGBoost with Tuned Scale Weight: 27.82


Parameters: { "use_label_encoder" } are not used.



In [61]:
best_t_xgb, _ = find_best_threshold(xgb_model, X_test, y_test, is_torch=False)

print(f"Best Threshold for XGBoost on PaySim: {best_t_xgb:.4f}")

Best Threshold for XGBoost on PaySim: 0.8403


In [65]:
results_paysim_final = []

results_paysim_final.append(full_benchmark(
    teacher, X_test, y_test, "Teacher (Master)", is_torch=True, threshold=best_t_teacher
))

results_paysim_final.append(full_benchmark(
    student, X_test, y_test, "Student (Distilled)", is_torch=True, threshold=best_t_student
))

results_paysim_final.append(full_benchmark(
    xgb_model, X_test, y_test, "XGBoost (Tuned)", is_torch=False, threshold=best_t_xgb
))

results_paysim_final.append(full_benchmark(
    rf_model, X_test, y_test, "Random Forest", is_torch=False, threshold=0.5
))

df_paysim_res = pd.DataFrame(results_paysim_final)

print("\n Final comparison: PaySim Dataset (10 Features)")
display(df_paysim_res.sort_values(by="F1-Score", ascending=False))

xgb_latency = df_paysim_res.loc[df_paysim_res['Model'] == 'XGBoost (Tuned)', 'Latency (s)'].values[0]
student_latency = df_paysim_res.loc[df_paysim_res['Model'] == 'Student (Distilled)', 'Latency (s)'].values[0]

print("-" * 50)
print(f"Student model is {xgb_latency / student_latency:.2f}x faster than xgboost")
print(f"Student model: {student_latency:.6f}s XGBoost model: {xgb_latency:.6f}s")


 Final comparison: PaySim Dataset (10 Features)


Unnamed: 0,Model,ROC AUC,F1-Score,Recall,Precision,Latency (s)
2,XGBoost (Tuned),0.999745,0.908788,0.885552,0.933276,0.38475
3,Random Forest,0.993781,0.859232,0.762987,0.983264,0.559111
1,Student (Distilled),0.997468,0.794805,0.74513,0.851577,0.075828
0,Teacher (Master),0.99663,0.786403,0.694805,0.90582,0.8436


--------------------------------------------------
Student model is 5.07x faster than xgboost
Student model: 0.075828s XGBoost model: 0.384750s
