### Based on Jude's July 29v2, edited by Yang Aug 5:
- removed one-hot encoding of M columns, used 0/1 or numerical encoding, replaced nan in these columns with -1
- splitted training data into train and test and then further split training into training and cross-validation set. The cross-validation set is used to choose the best epoch/best weight and the test set is used for evaluation the final model only, never used in training
- replaced Autoencoder class becasue I wasn't sure why the forward function was different from encoder+decoder
- added reLU and dorpout after each linear layer in the encoder as well as decoder
- saved best weights during training to be loaded later for quick model evaluation
- minor rearrangement and varaible renaming for readability

In [2]:
import os
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import time
import random
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader


In [3]:
df = pd.read_csv("train_transaction.csv")#, nrows=10000)

# derive “day” from TransactionDT, then drop the raw column
df["day"] = (df["TransactionDT"] // (3600 * 24)).astype(int)
df.drop("TransactionDT", axis=1, inplace=True)

# drop TransactionID, as it is not useful for modeling
df.drop("TransactionID", axis=1, inplace=True)

In [4]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

In [5]:
SEED = 42
seed_everything(SEED)

In [6]:
# compute missing % for all columns
nulls = df.isna().mean() * 100

# find columns with more than 80% missing values
cols_80 = nulls[nulls >= 80].index.tolist()

# and drop them!
df.drop(columns=cols_80, inplace=True)

In [7]:
# numeric imputation (median) – exclude the target “isFraud”
num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
num_cols.remove("isFraud")

# among num_cols, find columns with nans that need to be imputed
nan_cols = [c for c in num_cols if df[c].isna().any()]

# exclude the categorical columns card2, card3, card5, addr1, addr2
cat_cols = ["card2", "card3", "card5", "addr1", "addr2"]
nan_cols = [c for c in nan_cols if c not in cat_cols]

imputer = SimpleImputer(strategy="median")
df[nan_cols] = imputer.fit_transform(df[nan_cols])


In [8]:
# for remaining categoricals, one‐hot encode small‐cardinaliy ones else drop them
cat_cols_rem = df.select_dtypes(include=["object"]).columns.tolist()

# include the cat_cols that were excluded earlier
cat_cols_rem.extend(cat_cols)

# e.g. “ProductCD”, “MISSING” placeholders, etc.
for c in cat_cols_rem:
    n_uniq = df[c].nunique()
    if n_uniq <= 10:
        # ------------------ yy -----------------------
        if c[0] == 'M':
            if c[1] != 4:
                df[c] = df[c].map({'T': 1, 'F': 0})
            else:
                df[c] = df[c].map({'M0': 0, 'M1': 1, 'M2':2})
            df[c]=df[c].fillna(-1)
        else:
        # ---------------------------------------------
            dummies = pd.get_dummies(df[c], prefix=c, drop_first=True)
            df = pd.concat([df.drop(c, axis=1), dummies], axis=1)
    else:
        df.drop(columns=c, inplace=True)

In [9]:
# find all bool columns in df
bool_cols = df.select_dtypes(include="bool").columns

# cast them to int (True→1, False→0)
df[bool_cols] = df[bool_cols].astype(int)

#df.head()

In [10]:
# Create training and CV sets
# all non‐fraud examples
df_norm = df[df.isFraud == 0].copy()
# all fraud examples
df_fraud = df[df.isFraud == 1].copy()

# --------------------- yy -----------------------
# hold out 20% of normals for test
# hold out 20% of training for cross validation, leaving test for assessing model performance only, not model selection or tuning.
norm_train, norm_test = train_test_split(df_norm, test_size=0.2, random_state=42)
norm_train, norm_cv = train_test_split(norm_train, test_size=0.2, random_state=42)

# test set = held‐out test + all frauds
df_test = pd.concat([norm_test, df_fraud], axis=0)
X_test = df_test.drop("isFraud", axis=1)
y_test = df_test["isFraud"].values
# SHOULD THE LABEL IN df_test BE DROPPED AS WELL?

# drop labels for modeling
X_train = norm_train.drop("isFraud", axis=1)
X_cv    = norm_cv.drop("isFraud", axis=1)

print(f'train set(64% normal data): {X_train.shape}')
print(f'cv set(16% normal data): {X_cv.shape}')
print(f'test set(20% normal data + fraud): {X_test.shape}')
# ------------------------------------------------

train set(64% normal data): (364720, 337)
cv set(16% normal data): (91181, 337)
test set(20% normal data + fraud): (134639, 337)


In [11]:
one_hot_cols = [col for col in X_train.columns if set(X_train[col].unique()) <= {0, 1}]
non_one_hot_cols = [col for col in X_train.columns if col not in one_hot_cols]

scaler = StandardScaler()

X_train_scaled = X_train.copy()
X_cv_scaled    = X_cv.copy()
X_test_scaled = X_test.copy()

X_train_scaled[non_one_hot_cols] = scaler.fit_transform(X_train[non_one_hot_cols])
X_cv_scaled[non_one_hot_cols]    = scaler.transform(X_cv[non_one_hot_cols])
X_test_scaled[non_one_hot_cols]    = scaler.transform(X_test[non_one_hot_cols])

X_train_final = X_train_scaled.values
X_cv_final    = X_cv_scaled.values
X_test_final = X_test_scaled.values

# Convert to PyTorch tensors
# --------------------------- yy ---------------------------
x_train = torch.tensor(X_train_final, dtype=torch.float32)
x_valid = torch.tensor(X_cv_final, dtype=torch.float32)
x_test = torch.tensor(X_test_final, dtype=torch.float32)

In [12]:
# The output must match the input for autoencoders

class FraudDatasetUnsupervised(Dataset):
    
    def __init__(self, x,output=True):
        'Initialization'
        self.x = x
        self.output = output

    def __len__(self):
        'Returns the total number of samples'
        return len(self.x)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample index
        item = self.x[index]
        if self.output:
            return item, item
        else:
            return item

In [13]:
training_set = FraudDatasetUnsupervised(x_train)
valid_set = FraudDatasetUnsupervised(x_valid)

In [14]:
# Build Pytorch loaders
BATCH_SIZE = 64

train_loader = DataLoader(training_set, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(valid_set,   batch_size=BATCH_SIZE, shuffle=False)

## Autoencoder

In [16]:
''' # Jude's autoencoder
import torch
import torch.nn as nn
import torch.nn.functional as F

class DropAutoencoder(nn.Module):
    def __init__(self, input_size, intermediate_size_1, intermediate_size_2, code_size, dropout_rate=0.2):
        super(DropAutoencoder, self).__init__()

        # Encoder
        self.fc1 = nn.Linear(input_size, intermediate_size_1)
        self.fc2 = nn.Linear(intermediate_size_1, intermediate_size_2)
        self.fc3 = nn.Linear(intermediate_size_2, code_size)
        
        # Decoder
        self.fc4 = nn.Linear(code_size, intermediate_size_2)
        self.fc5 = nn.Linear(intermediate_size_2, intermediate_size_1)
        self.fc6 = nn.Linear(intermediate_size_1, input_size)
        
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        # Encoder with dropout noise
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        
        code = F.relu(self.fc3(x))
        
        # Decoder
        x = F.relu(self.fc4(code))
        x = self.dropout(x)
        
        x = F.relu(self.fc5(x))
        x = self.dropout(x)
        
        output = self.fc6(x)  # Linear activation
        return output
        '''


" # Jude's autoencoder\nimport torch\nimport torch.nn as nn\nimport torch.nn.functional as F\n\nclass DropAutoencoder(nn.Module):\n    def __init__(self, input_size, intermediate_size_1, intermediate_size_2, code_size, dropout_rate=0.2):\n        super(DropAutoencoder, self).__init__()\n\n        # Encoder\n        self.fc1 = nn.Linear(input_size, intermediate_size_1)\n        self.fc2 = nn.Linear(intermediate_size_1, intermediate_size_2)\n        self.fc3 = nn.Linear(intermediate_size_2, code_size)\n        \n        # Decoder\n        self.fc4 = nn.Linear(code_size, intermediate_size_2)\n        self.fc5 = nn.Linear(intermediate_size_2, intermediate_size_1)\n        self.fc6 = nn.Linear(intermediate_size_1, input_size)\n        \n        self.dropout = nn.Dropout(dropout_rate)\n\n    def forward(self, x):\n        # Encoder with dropout noise\n        x = F.relu(self.fc1(x))\n        x = self.dropout(x)\n        \n        x = F.relu(self.fc2(x))\n        x = self.dropout(x)\n        

In [17]:
# ---------------------- YY ------------------------
import torch
import torch.nn as nn
#import torch.nn.functional as F

class Autoencoder(nn.Module):
    def __init__(self, input_size, intermediate_size_1, intermediate_size_2, latent_size, dropout_rate=0.5):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_size, intermediate_size_1),
            nn.ReLU(),
            nn.Dropout(p = dropout_rate),
            nn.Linear(intermediate_size_1, intermediate_size_2),
            nn.ReLU(),
            nn.Dropout(p = dropout_rate),
            nn.Linear(intermediate_size_2, latent_size),
        )        
        self.decoder = nn.Sequential(
            nn.Linear(latent_size, intermediate_size_2),            
            nn.ReLU(),
            nn.Dropout(p = dropout_rate),
            nn.Linear(intermediate_size_2, intermediate_size_1),       
            nn.ReLU(),
            nn.Dropout(p = dropout_rate),
            nn.Linear(intermediate_size_1, input_size),
        )
        #self.dropout = nn.Dropout(dropout_rate)

    def forward(self, x):
        latent = self.encoder(x)
        return self.decoder(latent)
# ---------------------------------------------

In [18]:
def per_sample_mse(model, generator):
    
    model.eval()
    criterion = torch.nn.MSELoss(reduction="none")
    batch_losses = []
    
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        loss_app = list(torch.mean(loss,axis=1).detach().cpu().numpy())
        batch_losses.extend(loss_app)
    
    return batch_losses

In [19]:
def evaluate_model(model,generator,criterion):
    model.eval()
    batch_losses = []
    for x_batch, y_batch in generator:
        # Forward pass
        y_pred = model(x_batch)
        # Compute Loss
        loss = criterion(y_pred.squeeze(), y_batch)
        batch_losses.append(loss.item())
    mean_loss = np.mean(batch_losses)    
    return mean_loss

In [20]:
class EarlyStopping:
    
    def __init__(self, patience=3, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = np.inf
    # ------------------------- YY ---------------------
    def continue_training(self,current_score, save=False):
        if self.best_score > current_score:
            self.best_score = current_score
            self.counter = 0
            if self.verbose:
                print("New best score:", current_score)
            save_flag = True # signal outside loop to save training weight
        else:
            self.counter+=1
            if self.verbose:
                print(self.counter, " iterations since best score.")
            save_flag = False
        #-----------------------------------------------
        return self.counter <= self.patience, save_flag

In [21]:
def training_loop(model,training_generator,valid_generator,optimizer,criterion,max_epochs=100,
                  apply_early_stopping=True,patience=3,verbose=False):
    #Setting the model in training mode
    model.train()

    if apply_early_stopping:
        early_stopping = EarlyStopping(verbose=verbose,patience=patience)
    
    all_train_losses = []
    all_valid_losses = []
    
    #Training loop
    start_time=time.time()
    for epoch in range(max_epochs):
        model.train()
        train_loss=[]
        for x_batch, y_batch in training_generator:
            optimizer.zero_grad()
            # Forward pass
            y_pred = model(x_batch)
            # Compute Loss
            loss = criterion(y_pred.squeeze(), y_batch)
            # Backward pass
            loss.backward()
            optimizer.step()   
            train_loss.append(loss.item())
        
        #showing last training loss after each epoch
        all_train_losses.append(np.mean(train_loss))
        if verbose:
            print('')
            print('Epoch {}: train loss: {}'.format(epoch, np.mean(train_loss)))
        #evaluating the model on the test set after each epoch    
        valid_loss = evaluate_model(model,valid_generator,criterion)
        all_valid_losses.append(valid_loss)
        if verbose:
            print('valid loss: {}'.format(valid_loss))
        # --------------yy ----------------------
        if apply_early_stopping:
            early_stop, save_weights = early_stopping.continue_training(valid_loss)
            if save_weights:
                torch.save(model.state_dict(), f'autoencoder_best_weights.pth')
            if not early_stop:
                if verbose:
                    print("Early stopping")
                break
        
    training_execution_time=time.time()-start_time
    return model,training_execution_time,all_train_losses,all_valid_losses

In [22]:
criterion = torch.nn.MSELoss() # this is aggregated loss

In [23]:
'''seed_everything(SEED)'''
model = Autoencoder(x_train.shape[1], 128, 64, 16, dropout_rate=0.2)
losses = per_sample_mse(model, val_loader)

In [24]:
print(losses[0:5])
print(np.mean(losses))

[0.49203458, 0.594671, 0.13820206, 0.13903299, 1.0079634]
1.1732432


In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)

In [26]:
model,training_execution_time,train_losses,valid_losses = training_loop(model,train_loader,val_loader,optimizer,criterion,verbose=True)


Epoch 0: train loss: 0.6384514754180177
valid loss: 0.6636157513971914
New best score: 0.6636157513971914

Epoch 1: train loss: 0.4970251948064954
valid loss: 0.58618536463955
New best score: 0.58618536463955

Epoch 2: train loss: 0.44899287717889164
valid loss: 0.5269513091967817
New best score: 0.5269513091967817

Epoch 3: train loss: 0.4251902113412217
valid loss: 0.4866589359180969
New best score: 0.4866589359180969

Epoch 4: train loss: 0.4096893847621435
valid loss: 0.46571934109717084
New best score: 0.46571934109717084

Epoch 5: train loss: 0.3991813416213482
valid loss: 0.444648709919369
New best score: 0.444648709919369

Epoch 6: train loss: 0.38653401792922715
valid loss: 0.4182989833177182
New best score: 0.4182989833177182

Epoch 7: train loss: 0.37108506382839956
valid loss: 0.3931804719985577
New best score: 0.3931804719985577

Epoch 8: train loss: 0.3603577736512174
valid loss: 0.3762360926573737
New best score: 0.3762360926573737

Epoch 9: train loss: 0.35394149488692

In [27]:
losses = per_sample_mse(model, val_loader)
print(losses[0:5])
print(np.mean(losses))
losses_sorted = sorted(losses, reverse=True)
print(losses_sorted[:10])


[0.035916753, 0.1652522, 0.018691897, 0.018018825, 0.019540202]
0.26912415
[2612.7449, 1082.4082, 1082.3351, 507.0893, 324.06323, 161.85315, 111.87723, 96.305084, 82.36629, 62.981995]


In [28]:
# ------------------------- YY --------------------------
loaded_model = Autoencoder(x_train.shape[1], 128, 64, 16, dropout_rate=0.2)
loaded_model.load_state_dict(torch.load(f'autoencoder_best_weights.pth'))
loaded_model.eval()

output_test = loaded_model(x_test)
crit = torch.nn.MSELoss(reduction="none")
loss = crit(output_test, x_test)
sample_loss = loss.mean(axis=1).detach().numpy()
# -------------------------------------------------------

In [29]:
genuine_losses = np.array(sample_loss[y_test == 0])
fraud_losses = np.array(sample_loss[y_test == 1])
print("Average fraud reconstruction error:", np.mean(fraud_losses))
print("Average genuine reconstruction error:", np.mean(genuine_losses))

Average fraud reconstruction error: 3.0505252
Average genuine reconstruction error: 0.23194109


In [30]:
# evaluation
from sklearn.metrics import (average_precision_score, roc_auc_score)

# compute AUC-ROC and Average Precision on the validation set by considering the reconstruction errors as predicted fraud scores

AUC_ROC = roc_auc_score(y_test, sample_loss)
AP = average_precision_score(y_test, sample_loss)
    
performances = pd.DataFrame([[AUC_ROC, AP]], columns=['AUC ROC','Average precision'])

In [31]:
performances

Unnamed: 0,AUC ROC,Average precision
0,0.741293,0.475397


In [32]:
from sklearn.metrics import recall_score

thr      = np.percentile(sample_loss, 70)      # e.g. top 30% as “fraud”
y_pred   = (sample_loss >= thr).astype(int)
recall   = recall_score(y_test, y_pred)    # binary‐class recall
print(f"Recall at threshold {thr:.4f} = {recall:.4f}")

Recall at threshold 0.1807 = 0.6264
