In [None]:
### MS Thesis
# Environment reset: remove any installed scikit-learn / imbalanced-learn and
# reinstall them before the rest of the notebook imports either package.
# NOTE(review): scikit-learn is reinstalled unpinned here and imbalanced-learn is
# installed as 0.11.0 with --user, while the following cell installs
# scikit-learn==1.4.2 and imbalanced-learn==0.12.0 -- confirm which set is intended.

!pip uninstall scikit-learn -y
!pip uninstall imbalanced-learn -y

!pip install scikit-learn
# Earlier pin that is no longer applied:
#imbalanced-learn==0.10.1 --quiet

!pip install --user imbalanced-learn==0.11.0

In [None]:
# Install (or upgrade to) the exact scikit-learn / imbalanced-learn versions pinned
# below; --quiet suppresses pip's progress output.
!pip install -U scikit-learn==1.4.2 imbalanced-learn==0.12.0 --quiet


In [1]:
import warnings
warnings.filterwarnings('ignore')
 
import os
import time
import gc

import pandas as pd
import numpy as np
from pandas.api.types import is_numeric_dtype, is_object_dtype, is_integer_dtype, is_float_dtype

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from imblearn.over_sampling import SMOTE

import lightgbm as lgb
from lightgbm import LGBMClassifier

import xgboost as xgb

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from torch.cuda.amp import autocast, GradScaler

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import shap

In [2]:
def readfile(path, file):
    """
    Load a CSV file from a directory into a DataFrame.

    Parameters:
    - path: directory that contains the file
    - file: name of the CSV file inside `path`

    Returns the parsed pandas DataFrame.
    """
    return pd.read_csv(os.path.join(path, file))
    
    

def reduce_mem_usage(df, verbose=True, convert_obj_to_category=True, skip_cols=None,
                     allow_float16=True):
    """
    Reduces memory usage of a DataFrame by downcasting numeric types
    and optionally converting object types to categorical.

    The frame is modified IN PLACE (converted columns are assigned back onto `df`)
    and the same object is returned.

    Parameters:
    - df: pandas DataFrame
    - verbose: print memory usage stats (and any per-column conversion failure)
    - convert_obj_to_category: convert object columns to category when fewer than
      half of their values are unique
    - skip_cols: container of column names to skip during optimization
      (None means "skip nothing")
    - allow_float16: when True (default, the historical behaviour) float columns
      whose range fits are stored as float16. float16 keeps only about three
      significant decimal digits, so pass False to use float32 as the smallest
      float type when values must not be rounded that coarsely.

    Returns:
    - the same DataFrame object that was passed in
    """
    # A literal [] default would be one list shared by every call.
    if skip_cols is None:
        skip_cols = ()

    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)
    int_candidates = (np.int8, np.int16, np.int32)

    start_mem = df.memory_usage(deep=True).sum() / 1024**2

    for col in df.columns:
        if col in skip_cols:
            continue

        col_data = df[col]

        # Best effort: a column that cannot be converted is reported and left as is.
        try:
            if is_numeric_dtype(col_data):
                c_min = col_data.min()
                c_max = col_data.max()

                if is_integer_dtype(col_data):
                    # Pick the narrowest signed integer type that holds [c_min, c_max].
                    for int_type in int_candidates:
                        limits = np.iinfo(int_type)
                        if c_min >= limits.min and c_max <= limits.max:
                            df[col] = col_data.astype(int_type)
                            break
                    else:
                        df[col] = col_data.astype(np.int64)
                elif is_float_dtype(col_data):
                    # Range check only; it does not guarantee that precision is kept.
                    for float_type in float_candidates:
                        limits = np.finfo(float_type)
                        if c_min >= limits.min and c_max <= limits.max:
                            df[col] = col_data.astype(float_type)
                            break
                    else:
                        df[col] = col_data.astype(np.float64)

            elif convert_obj_to_category and is_object_dtype(col_data):
                num_total_values = len(col_data)
                # Guard the ratio so a zero-row frame does not divide by zero.
                if num_total_values and col_data.nunique() / num_total_values < 0.5:
                    df[col] = col_data.astype('category')

        except Exception as e:
            if verbose:
                print(f"Could not process column {col}: {e}")

    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    if verbose:
        # start_mem can be 0 for an empty frame; report 0% instead of raising.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print(f"Memory usage decreased to {end_mem:5.2f} MB "
              f"({reduction:.1f}% reduction)")

    return df

In [3]:
# Directory holding the input CSVs and the two files that make up the training set.
path = r'/kaggle/input/thesis-dataset'
identity_file_name = 'train_identity.csv'
transaction_file_name = 'train_transaction.csv'

# Load each table and report its shape as it arrives.
df_identity = readfile(path, identity_file_name)
print(df_identity.shape)

df_trans = readfile(path, transaction_file_name)
print(df_trans.shape)

# Left join: every transaction row is kept, with identity columns attached
# by TransactionID where a match exists.
df = df_trans.merge(df_identity, how="left", on=['TransactionID'])

print(df.shape)

print(df.columns)

# The source tables are no longer needed once merged; release their memory.
del df_trans, df_identity
gc.collect()


# The classes are heavily skewed; this imbalance has to be handled later.
class_counts = df['isFraud'].value_counts()
n_rows = len(df)
for class_label, class_value in (('No Frauds', 0), ('Frauds', 1)):
    print(class_label, round(class_counts[class_value]/n_rows * 100,2), '% of the dataset')

(144233, 41)
(590540, 394)
(590540, 434)
Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'id_31', 'id_32', 'id_33', 'id_34', 'id_35', 'id_36', 'id_37', 'id_38',
       'DeviceType', 'DeviceInfo'],
      dtype='object', length=434)
No Frauds 96.5 % of the dataset
Frauds 3.5 % of the dataset


In [4]:
# Downcast dtypes to shrink the merged frame. reduce_mem_usage assigns the converted
# columns back onto the frame it receives and returns that same object, so df_r and
# df refer to the same (now modified) DataFrame.
df_r = reduce_mem_usage(df, convert_obj_to_category=True)

Memory usage decreased to 525.70 MB (79.5% reduction)


In [8]:
# Calculate missing ratio (fraction of null values per column)
missing_ratio = df_r.isnull().mean()

# Filter columns with more than 90% missing, most sparse first
high_missing_cols = missing_ratio[missing_ratio > 0.90].sort_values(ascending=False)

# Show them
print(high_missing_cols)

id_24    0.991962
id_25    0.991310
id_07    0.991271
id_08    0.991271
id_21    0.991264
id_26    0.991257
id_22    0.991247
id_23    0.991247
id_27    0.991247
dist2    0.936284
D7       0.934099
id_18    0.923607
dtype: float64


In [5]:
# Columns treated as categorical, listed family by family.
categoric_columns = (
    ['ProductCD']
    + [f'card{n}' for n in range(1, 7)]
    + ['addr1', 'addr2']
    + ['P_emaildomain', 'R_emaildomain']
    + [f'M{n}' for n in range(1, 10)]
    + ['DeviceType', 'DeviceInfo']
    + [f'id_{n}' for n in range(12, 39)]
)

# Every remaining column of df_r is treated as numeric.
numeric_columns = [col for col in df_r.columns if col not in categoric_columns]

#print('\n Numerical columns: ', numeric_columns)
#print('\n Categoric columns: ', categoric_columns)

In [6]:
# Percentage of null values in each column.
null_percentage = df_r.isnull().mean() * 100

# Columns with more than this percentage of nulls are removed from the name lists.
threshold = 10

filtered_columns = null_percentage[null_percentage > threshold].index
#print(filtered_columns)

# Frame restricted to the columns that exceed the threshold.
df_v1 = df_r[filtered_columns]

# Drop the filtered names from whichever of the two lists holds them.
categoric_columns = [name for name in categoric_columns if name not in filtered_columns]
numeric_columns = [name for name in numeric_columns if name not in filtered_columns]

#print('Numerical columns: ', numeric_columns)
#print('Categoric columns: ', categoric_columns)

# Per-column null counts and percentages as a tidy table.
df_null = df_r.isnull().sum().reset_index()
df_null.columns = ['Column','Nulls']
df_null['percent']=df_null['Nulls'].apply(lambda x: (x/len(df_r)) * 100)

print(f'There are {df_r.isnull().any().sum()} columns in train dataset with missing values.')

# Columns with more than 30% nulls are dropped from the data itself.
null_30 = df_null.loc[df_null['percent'] > 30, 'Column'].tolist()
count_null = len(null_30)

#print(null_30)

df_1 = df_r.drop(columns = null_30)

print(df_1.shape)

print(len(null_30))


There are 414 columns in train dataset with missing values.
(590540, 202)
232


# Not the last expression of the cell, so the notebook does not render this preview.
df_1.head()

# Lift pandas' display truncation for both columns and rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)
gc.collect()

In [None]:
# Without `unit`, pd.to_datetime interprets integers as nanoseconds since the epoch,
# which squeezes all values into a tiny interval right after 1970-01-01 00:00:00.
# NOTE(review): TransactionDT is assumed to be an elapsed-time offset in seconds --
# confirm against the dataset description. The resulting timestamps are relative to
# 1970-01-01, not to the real calendar date of the transactions.
df_1['TransactionDT'] = pd.to_datetime(df_1['TransactionDT'], unit='s')

df_1['TransactionDT'].describe()

print('Numerical columns: ', numeric_columns)
print('Categoric columns: ', categoric_columns)

In [None]:
# Step 1: Identify categorical and numerical columns.
# df_1 derives from the frame processed by reduce_mem_usage, which downcasts numerics
# (int8/int16/int32, float16/float32) and turns low-cardinality object columns into
# 'category'. Selecting only object / float64 / int64 therefore skipped most columns
# and left their NaNs in place, so select by kind instead of by exact width.
categorical_cols = df_1.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df_1.select_dtypes(include='number').columns.tolist()

# Step 2: Impute missing values
# Numeric: Replace NaN with 0
df_1[numerical_cols] = df_1[numerical_cols].fillna(0)

# Categorical: Replace NaN with 'NA'.
# A category column only accepts values among its categories, so register 'NA' first.
for col in categorical_cols:
    if isinstance(df_1[col].dtype, pd.CategoricalDtype) and 'NA' not in df_1[col].cat.categories:
        df_1[col] = df_1[col].cat.add_categories('NA')
    df_1[col] = df_1[col].fillna('NA')

# Remaining number of nulls across the whole frame.
df_1.isnull().sum().sum()

In [None]:
# df_2_v1 was only ever assigned on a commented-out line, so the copy below raised
# NameError on a fresh kernel. Re-create it by running the memory reduction on df_1
# (this modifies df_1 in place and returns the same object).
df_2_v1 = reduce_mem_usage(df_1, convert_obj_to_category=True)

# Work on an independent copy so the steps below do not alter df_1 / df_2_v1.
df_2 = df_2_v1.copy()

# Identifier and time columns are not used as model features.
df_2 = df_2.drop(columns=['TransactionID','TransactionDT'])

# Make the target categorical so it is excluded from the numeric column list below.
df_2['isFraud'] = df_2['isFraud'].astype('category')

categorical_cols = df_2.select_dtypes(include=['object', 'category']).columns.tolist()
numerical_cols = df_2.select_dtypes(include=['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']).columns.tolist()
#numerical_cols

In [None]:
#df_2.columns

In [None]:
#df_2.dtypes

In [None]:
#categorical_cols

In [None]:
# Standardise every numeric feature.
# NOTE(review): the scaler is fitted on the full dataset, before the train/test split
# made in a later cell, so test-set statistics influence the training features.
# NOTE(review): the name `scaler` is rebound to a GradScaler in the training cell.
scaler = StandardScaler()

df_2[numerical_cols] = scaler.fit_transform(df_2[numerical_cols])

#df_2.head()

# Split the categorical features by cardinality: up to 10 distinct values are
# one-hot encoded, anything larger is label encoded. The target is excluded.
cat_onehot = []
cat_label = []

for col in categorical_cols:
    if col == 'isFraud':
        continue
    if df_2[col].nunique() <= 10:
        cat_onehot.append(col)
    else:
        cat_label.append(col)

#print(cat_onehot)
#print(cat_label)


### Label Encoding

label_encoders = {}
for col in cat_label:
    le = LabelEncoder()
    df_2[col + '_LE'] = le.fit_transform(df_2[col].astype(str))
    label_encoders[col] = le  # Store encoder if you need to inverse transform later

#print(df_2.head())

# One-Hot Encoding using scikit-learn.
# `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in 1.4,
# so the old spelling fails with the scikit-learn version pinned by this notebook.
onehot_encoder = OneHotEncoder(sparse_output=False, drop=None)

encoded_array = onehot_encoder.fit_transform(df_2[cat_onehot])

# Convert to DataFrame; reuse df_2's index so the concat below aligns row by row
# even if df_2 does not carry a default 0..n-1 index.
encoded_df = pd.DataFrame(
    encoded_array,
    columns=onehot_encoder.get_feature_names_out(cat_onehot),
    index=df_2.index,
)

# Combine with original DataFrame
df_encoded = pd.concat([df_2, encoded_df], axis=1)

# Drop the raw categorical columns now that encoded versions exist. The list is
# derived from the data instead of being hard-coded, so it stays correct if the set
# of surviving categorical columns changes.
df_encoded = df_encoded.drop(columns=cat_onehot + cat_label)



#print(df_encoded.head())


df_encoded.columns

In [None]:

# Separate features and target.
# The target was stored as a category column; cast it to plain integers so every
# downstream consumer (SMOTE, LightGBM, XGBoost, torch) receives numeric 0/1 labels.
X = df_encoded.drop('isFraud', axis=1)
y = df_encoded['isFraud'].astype(int)

# Split before SMOTE so synthetic samples are generated from training rows only.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Convert back to DataFrame if needed
X_res_df = pd.DataFrame(X_resampled, columns=X.columns)
y_res_df = pd.Series(y_resampled, name='isFraud')

# Check the class distribution after SMOTE
print(y_res_df.value_counts())


# NOTE(review): both models below use (X_test, y_test) for early stopping and for
# the reported AUC, so the reported score is optimistic; a separate validation split
# would remove that bias.
# NOTE(review): LightGBM is trained on the SMOTE-resampled data while XGBoost is
# trained on the original X_train -- confirm this asymmetry is intended.
lgbm = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.05,
    num_leaves=64,
    feature_fraction=0.8,
    bagging_fraction=0.8,
    bagging_freq=5,
    random_state=42,
    n_jobs=-1
)


lgbm.fit(
    X_res_df, y_res_df,
    eval_set=[(X_test, y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(100)]
)

lgb_pred = lgbm.predict_proba(X_test)[:, 1]
print("LightGBM AUC:", roc_auc_score(y_test, lgb_pred))

# `early_stopping_rounds` is an estimator parameter: passing it to fit() was
# deprecated in XGBoost 1.6 and removed in 2.0. `use_label_encoder` is likewise no
# longer used and has been dropped. random_state matches the other seeded steps.
xgb_model = xgb.XGBClassifier(
    n_estimators=500,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='auc',
    tree_method='hist',
    early_stopping_rounds=50,
    random_state=42
)

xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=100
)

xgb_pred = xgb_model.predict_proba(X_test)[:, 1]
print("XGBoost AUC:", roc_auc_score(y_test, xgb_pred))


In [None]:


class FraudDataset(torch.utils.data.Dataset):
    """
    In-memory dataset pairing a feature matrix with its targets as float32 tensors.

    Parameters:
    - X: feature matrix (pandas DataFrame or any array-like convertible to a
      numeric 2-D array)
    - y: targets (pandas Series, including category dtype with numeric categories,
      or any numeric array-like)
    """

    def __init__(self, X, y):
        # Going through np.asarray accepts DataFrames, Series (also category dtype)
        # and plain arrays alike, and converts in bulk rather than element by element.
        # torch.tensor copies, so the dataset never aliases the caller's data.
        self.X = torch.tensor(np.asarray(X, dtype=np.float32))
        self.y = torch.tensor(np.asarray(y, dtype=np.float32))

    def __len__(self):
        """Number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the (features, target) pair at position `idx`."""
        return self.X[idx], self.y[idx]

# NOTE(review): this wraps the full X / y and splits them independently of the
# train_test_split used for the tree models, so the two AUCs are measured on
# different hold-out rows.
dataset = FraudDataset(X, y)

# 80/20 train/validation split.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so the partition (and therefore the reported validation AUC) is
# reproducible across kernel restarts; 42 matches random_state used elsewhere.
train_ds, val_ds = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_ds, batch_size=512, shuffle=True, num_workers=4, pin_memory=True)
val_loader = DataLoader(val_ds, batch_size=512, shuffle=False, num_workers=4, pin_memory=True)


In [None]:
class TransformerBlock(nn.Module):
    """
    Self-attention followed by a two-layer feed-forward network.

    Each sub-layer's output is added to its input and the sum is layer-normalised.

    Parameters:
    - embed_dim: size of the last dimension of the input and output
    - heads: number of attention heads
    """

    def __init__(self, embed_dim, heads):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, heads, batch_first=True)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.norm2 = nn.LayerNorm(embed_dim)
        first_linear = nn.Linear(embed_dim, embed_dim)
        second_linear = nn.Linear(embed_dim, embed_dim)
        self.ff = nn.Sequential(first_linear, nn.ReLU(), second_linear)

    def forward(self, x):
        """Apply the block to `x`; the output has the same shape as the input."""
        # Only the attention output is needed; the attention weights are discarded.
        attended = self.attention(x, x, x)[0]
        hidden = self.norm1(x + attended)
        return self.norm2(hidden + self.ff(hidden))

class FraudTransformer(nn.Module):
    """
    Binary classifier that embeds a feature vector, passes it through a stack of
    TransformerBlock layers and maps the result to a single logit per sample.

    Parameters:
    - input_dim: number of input features per sample
    - embed_dim: width of the embedding fed to the blocks
    - heads: number of attention heads in each block
    - num_blocks: how many TransformerBlock layers to stack

    forward() returns raw logits of shape [B] (no sigmoid is applied).
    """

    def __init__(self, input_dim, embed_dim=64, heads=4, num_blocks=2):
        super().__init__()
        self.embedding = nn.Linear(input_dim, embed_dim)
        self.blocks = nn.Sequential(*[TransformerBlock(embed_dim, heads) for _ in range(num_blocks)])
        self.output_layer = nn.Sequential(
                            nn.Flatten(),
                            nn.Linear(embed_dim * 1, 64),
                            nn.ReLU(),
                            nn.Linear(64, 1)  # No sigmoid
                            )

    def forward(self, x):
        # Each sample becomes a sequence of length 1: [B, D] -> [B, 1, D].
        x = self.embedding(x).unsqueeze(1)
        x = self.blocks(x)
        out = self.output_layer(x)  # [B, 1]
        # Squeeze only the trailing logit dimension. A bare squeeze() would also drop
        # the batch dimension when B == 1, yielding a 0-d tensor that no longer
        # matches the [1]-shaped target expected by the loss.
        return out.squeeze(-1)


In [None]:


# Seed torch so weight initialisation is repeatable.
torch.manual_seed(42)

model = FraudTransformer(input_dim=X.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

# Mixed precision is only meaningful on CUDA; disabling it explicitly on CPU avoids
# the warnings emitted when the CUDA autocast / GradScaler are used without a GPU.
use_amp = device.type == "cuda"
# NOTE(review): this rebinds `scaler`, which earlier held the fitted StandardScaler.
scaler = GradScaler(enabled=use_amp)

for epoch in range(10):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()

        with autocast(enabled=use_amp):
            # reshape(-1) keeps the logits 1-D even for a batch holding one sample.
            preds = model(xb).reshape(-1)
            loss = loss_fn(preds, yb)

        # Scale the loss to avoid fp16 gradient underflow; step() unscales first.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

    # Evaluation
    model.eval()
    val_preds = []
    val_true = []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            with autocast(enabled=use_amp):
                logits = model(xb)
            # Apply sigmoid here (the model returns raw logits). Compute it in
            # float32: under autocast the logits are half precision, which would
            # quantise the probabilities and create artificial ties in the AUC.
            probs = torch.sigmoid(logits.float())
            val_preds.extend(probs.cpu().numpy().reshape(-1))
            val_true.extend(yb.numpy())

    auc = roc_auc_score(val_true, val_preds)
    print(f"Epoch {epoch+1}: AUC = {auc:.4f}")
