In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import random

In [29]:
df = pd.read_csv("ieee-fraud-detection/train_transaction.csv")

# derive “day” from TransactionDT, then drop the raw column
df["day"] = (df["TransactionDT"] // (3600 * 24)).astype(int)
df.drop("TransactionDT", axis=1, inplace=True)

# drop TransactionID, as it is not useful for modeling
df.drop("TransactionID", axis=1, inplace=True)

In [30]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [31]:
SEED = 42
seed_everything(SEED)

In [32]:
# compute missing % for all columns
nulls = df.isna().mean() * 100

# find columns with more than 80% missing values
cols_80 = nulls[nulls >= 80].index.tolist()

# and drop them!
df.drop(columns=cols_80, inplace=True)

In [33]:
from sklearn.impute import SimpleImputer

# numeric imputation (median) – exclude the target “isFraud”
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("isFraud")

# among num_cols, find columns with nans that need to be imputed
nan_cols = [c for c in num_cols if df[c].isna().any()]

# exclude the categorical columns card2, card3, card5, addr1, addr2
cat_cols = ["card2", "card3", "card5", "addr1", "addr2"]
nan_cols = [c for c in nan_cols if c not in cat_cols]

imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])

In [34]:
# for remaining categoricals, one‐hot encode small‐cardinaliy ones else drop them
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()

# include the cat_cols that were excluded earlier
cat_cols_rem.extend(cat_cols)

# e.g. “ProductCD”, “MISSING” placeholders, etc.
for c in cat_cols_rem:
    n_uniq = df[c].nunique()
    if n_uniq <= 10:
        # ------------------ yy -----------------------
        if c[0] == 'M':
            if c[1] != 4:
                df[c] = df[c].map({'T': 1, 'F': 0})
            else:
                df[c] = df[c].map({'M0': 0, 'M1': 1, 'M2':2})
        else:
        # ---------------------------------------------
            dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
            df = pd.concat([df.drop(c, axis=1), dummies], axis=1)
    else:
        df.drop(columns=c, inplace=True)

In [37]:
# find all bool columns in df
bool_cols = df.select_dtypes(include="bool").columns

# cast them to int (True→1, False→0)
df[bool_cols] = df[bool_cols].astype(int)

In [38]:
df.head()

Unnamed: 0,isFraud,TransactionAmt,card1,dist1,C1,C2,C3,C4,C5,C6,...,ProductCD_H,ProductCD_R,ProductCD_S,ProductCD_W,card4_discover,card4_mastercard,card4_visa,card6_credit,card6_debit,card6_debit or credit
0,0,68.5,13926,19.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,1,1,0,0,1,0,0
1,0,29.0,2755,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,1,0,1,0,0
2,0,59.0,4663,287.0,1.0,1.0,0.0,0.0,0.0,1.0,...,0,0,0,1,0,0,1,0,1,0
3,0,50.0,18132,8.0,2.0,5.0,0.0,0.0,0.0,4.0,...,0,0,0,1,0,1,0,0,1,0
4,0,50.0,4497,8.0,1.0,1.0,0.0,0.0,0.0,1.0,...,1,0,0,0,0,1,0,1,0,0


In [39]:
# Create training and CV sets
# all non‐fraud examples
df_norm = df[df.isFraud == 0].copy()
# all fraud examples
df_fraud = df[df.isFraud == 1].copy()

# --------------------- yy -----------------------
# hold out 20% of normals for test
# hold out 20% of training for cross validation, leaving test for assessing model performance only, not model selection or tuning.
norm_train, norm_test = train_test_split(df_norm, test_size=0.2, random_state=42)
norm_train, norm_cv = train_test_split(norm_train, test_size=0.2, random_state=42)

# test set = held‐out test + all frauds
df_test = pd.concat([norm_test, df_fraud], axis=0)
y_test = df_test["isFraud"].values
# SHOULD THE LABEL IN df_test BE DROPPED AS WELL?

# drop labels for modeling
X_train = norm_train.drop("isFraud", axis=1)
X_cv    = norm_cv.drop("isFraud", axis=1)

print(f'train set(64% normal data): {X_train.shape}')
print(f'cv set(16% normal data): {X_cv.shape}')
print(f'test set(20% normal data + fraud): {df_test.shape}')
# ------------------------------------------------

→ Training on normals only: (364720, 337)
→ CV set (normals+fraud): (91181, 337)


train set(64% normal data): (364720, 337)
cv set(16% normal data): (91181, 337)
test set(20% normal data + fraud): (134639, 338)


In [13]:
import numpy as np
from sklearn.preprocessing import StandardScaler

# Identify one-hot columns (all values are 0 or 1)
one_hot_cols = [col for col in X_train.columns if set(X_train[col].unique()) <= {0, 1}]
non_one_hot_cols = [col for col in X_train.columns if col not in one_hot_cols]


In [14]:
scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_cv_scaled    = X_cv.copy()

X_train_scaled[non_one_hot_cols] = scaler.fit_transform(X_train[non_one_hot_cols])
X_cv_scaled[non_one_hot_cols]    = scaler.transform(X_cv[non_one_hot_cols])


In [15]:
X_train_final = X_train_scaled.values
X_cv_final    = X_cv_scaled.values

In [16]:
# Convert to PyTorch tensors
x_train = torch.FloatTensor(X_train_final)
x_valid = torch.FloatTensor(X_cv_final)
y_valid = torch.FloatTensor(y_cv)

In [17]:
# The output must match the input for autoencoders

class FraudDatasetUnsupervised(Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        item = self.x[index]
        if self.output:
            return item, item
        else:
            return item

In [18]:
training_set = FraudDatasetUnsupervised(x_train)
valid_set = FraudDatasetUnsupervised(x_valid)

In [19]:
# Build Pytorch loaders
BATCH_SIZE = 64

train_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(valid_set,   batch_size=BATCH_SIZE, shuffle=False)

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class DropAutoencoder(nn.Module):
    def __init__(self, input_size, intermediate_size_1, intermediate_size_2, code_size, dropout_rate=0.2):
        super(DropAutoencoder, self).__init__()

        # Encoder
        self.fc1 = nn.Linear(input_size, intermediate_size_1)
        self.fc2 = nn.Linear(intermediate_size_1, intermediate_size_2)
        self.fc3 = nn.Linear(intermediate_size_2, code_size)
        
        # Decoder
        self.fc4 = nn.Linear(code_size, intermediate_size_2)
        self.fc5 = nn.Linear(intermediate_size_2, intermediate_size_1)
        self.fc6 = nn.Linear(intermediate_size_1, input_size)
        
        self.dropout = nn.Dropout(dropout_rate)
        
    def forward(self, x):
        # Encoder with dropout noise
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        code = F.relu(self.fc3(x))
        
        # Decoder
        x = F.relu(self.fc4(code))
        x = self.dropout(x)
        
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        
        output = self.fc6(x)  # Linear activation
        return output


In [21]:
criterion = torch.nn.MSELoss()

In [22]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [24]:
seed_everything(SEED)
model = DropAutoencoder(x_train.shape[1], 128, 64, 16, dropout_rate=0.2)
losses = per_sample_mse(model, val_loader)

In [26]:
#yy
print(model)

DropAutoencoder(
  (fc1): Linear(in_features=338, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=16, bias=True)
  (fc4): Linear(in_features=16, out_features=64, bias=True)
  (fc5): Linear(in_features=64, out_features=128, bias=True)
  (fc6): Linear(in_features=128, out_features=338, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)


In [27]:
print(losses[0:5])
print(np.mean(losses))

[0.104663715, 3.11748, 0.12473496, 0.24563716, 0.9639551]
2.0011084


In [28]:
def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [29]:
class EarlyStopping:
    
    def __init__(self, patience=3, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.inf
    
    def continue_training(self,current_score):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
                
        return self.counter <= self.patience 

In [30]:
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,apply_early_stopping=True,patience=3,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        if apply_early_stopping:
            if not early_stopping.continue_training(valid_loss):
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

In [31]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [32]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,train_loader,val_loader,optimizer,criterion,verbose=True)


Epoch 0: train loss: 0.6260206680169673
valid loss: 1.2257428836119946
New best score: 1.2257428836119946

Epoch 1: train loss: 0.4889146287071611
valid loss: 1.1323503856671118
New best score: 1.1323503856671118

Epoch 2: train loss: 0.443328338787629
valid loss: 1.041066593511788
New best score: 1.041066593511788

Epoch 3: train loss: 0.41811798419646506
valid loss: 0.9525380145383029
New best score: 0.9525380145383029

Epoch 4: train loss: 0.3982787346001976
valid loss: 0.872528903087169
New best score: 0.872528903087169

Epoch 5: train loss: 0.38633111697516537
valid loss: 0.8427106081773686
New best score: 0.8427106081773686

Epoch 6: train loss: 0.3715525363550502
valid loss: 0.8132988200540665
New best score: 0.8132988200540665

Epoch 7: train loss: 0.3626147477298239
valid loss: 0.7803101856863487
New best score: 0.7803101856863487

Epoch 8: train loss: 0.3554087988436138
valid loss: 0.7745204644150211
New best score: 0.7745204644150211

Epoch 9: train loss: 0.3439126586879341

In [33]:
losses = per_sample_mse(model, val_loader)
print(losses[0:5])
print(np.mean(losses))

[0.005608443, 0.79956955, 0.012906705, 0.033568468, 0.34498852]
0.58798647


In [31]:
genuine_losses = np.array(losses)[y_valid.numpy() == 0]
fraud_losses = np.array(losses)[y_valid.numpy() == 1]
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 2.977164
Average genuine reconstruction error: 0.22459523


In [32]:
# evaluation
from sklearn.metrics import (average_precision_score, roc_auc_score)

# compute AUC-ROC and Average Precision on the validation set by considering the reconstruction errors as predicted fraud scores

AUC_ROC = roc_auc_score(y_cv, losses)
AP = average_precision_score(y_cv, losses)
    
performances = pd.DataFrame([[AUC_ROC, AP]], columns=['AUC ROC','Average precision'])

In [33]:
performances

Unnamed: 0,AUC ROC,Average precision
0,0.743221,0.477684


In [34]:
from sklearn.metrics import recall_score

thr      = np.percentile(losses, 70)      # e.g. top 30% as “fraud”
y_pred   = (losses >= thr).astype(int)
recall   = recall_score(y_cv, y_pred)    # binary‐class recall
print(f"Recall at threshold {thr:.4f} = {recall:.4f}")

Recall at threshold 0.1756 = 0.6274
