In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from pytorch_tabnet.tab_model import TabNetClassifier
import torch
import time

In [2]:
with open("train_val_test_1.pkl", "rb") as f:
    X_train, X_val, X_test, y_train, y_val, y_test = pickle.load(f)

print("Data successfully loaded!")

Data successfully loaded!


In [3]:
def check_missing_values(X_train, X_val, X_test):
    train_elements = X_train.size if isinstance(X_train, np.ndarray) else X_train.size
    val_elements = X_val.size if isinstance(X_val, np.ndarray) else X_val.size
    test_elements = X_test.size if isinstance(X_test, np.ndarray) else X_test.size
    
    train_missing = np.isnan(X_train).sum() if isinstance(X_train, np.ndarray) else X_train.isna().sum().sum()
    val_missing = np.isnan(X_val).sum() if isinstance(X_val, np.ndarray) else X_val.isna().sum().sum()
    test_missing = np.isnan(X_test).sum() if isinstance(X_test, np.ndarray) else X_test.isna().sum().sum()
    
    train_pct = 100 * train_missing / train_elements
    val_pct = 100 * val_missing / val_elements
    test_pct = 100 * test_missing / test_elements
    
    print(f"Training set: {train_missing} missing values out of {train_elements} elements ({train_pct:.2f}%)")
    print(f"Validation set: {val_missing} missing values out of {val_elements} elements ({val_pct:.2f}%)")
    print(f"Test set: {test_missing} missing values out of {test_elements} elements ({test_pct:.2f}%)")
    
    if isinstance(X_train, pd.DataFrame):
        missing_cols = X_train.isna().sum().sort_values(ascending=False)
        missing_cols = missing_cols[missing_cols > 0]
        
        print("\nTop columns with missing values:")
        if len(missing_cols) > 0:
            for col, count in missing_cols[:10].items():
                pct = 100 * count / len(X_train)
                print(f"- {col}: {count} missing values ({pct:.2f}%)")
        else:
            print("No columns with missing values!")
    
    return train_missing, val_missing, test_missing

train_missing, val_missing, test_missing = check_missing_values(X_train, X_val, X_test)

if isinstance(X_train, pd.DataFrame):
    print("\nFirst 5 rows of data:")
    print(X_train.head())
else:
    print("\nShape of training data:", X_train.shape)

Training set: 12190772 missing values out of 54892538 elements (22.21%)
Validation set: 150872 missing values out of 21956968 elements (0.69%)
Test set: 163676 missing values out of 21956968 elements (0.75%)

Top columns with missing values:
- il_util: 857435 missing values (92.16%)
- mths_since_rcnt_il: 848672 missing values (91.22%)
- all_util: 846446 missing values (90.98%)
- inq_last_12m: 846444 missing values (90.98%)
- total_cu_tl: 846444 missing values (90.98%)
- open_acc_6m: 846444 missing values (90.98%)
- open_act_il: 846443 missing values (90.98%)
- max_bal_bc: 846443 missing values (90.98%)
- inq_fi: 846443 missing values (90.98%)
- open_il_12m: 846443 missing values (90.98%)

First 5 rows of data:
       acc_open_past_24mths  all_util  annual_inc  application_type  \
39785                   NaN       NaN     22000.0                 0   
39763                   NaN       NaN    150000.0                 0   
39751                   NaN       NaN    125000.0                 0

In [4]:
# After splitting but before modeling:

# 1. Check missing value percentage in each split
train_missing = (X_train.isna().sum() / len(X_train)) * 100
val_missing = (X_val.isna().sum() / len(X_val)) * 100
test_missing = (X_test.isna().sum() / len(X_test)) * 100

# 2. For columns with high missing rates, create indicator features
high_missing_cols = train_missing[train_missing > 30].index
for col in high_missing_cols:
    X_train[f'{col}_missing'] = X_train[col].isna().astype(int)
    X_val[f'{col}_missing'] = X_val[col].isna().astype(int)
    X_test[f'{col}_missing'] = X_test[col].isna().astype(int)

# 3. Apply imputation consistently
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='median')  # or mean, mode, etc.
imputer.fit(X_train)  # Fit only on training data

X_train = imputer.transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

In [5]:
torch.manual_seed(1)
np.random.seed(1)

# Calculate class imbalance 
class_counts = np.bincount(y_train)
print(f"Class distribution: {class_counts}")
class_weight = class_counts[0] / class_counts[1]
print(f"Class weight ratio: {class_weight:.2f}")

# If pd df convert to np arr
if isinstance(X_train, pd.DataFrame):
    X_train_values = X_train.values
    X_val_values = X_val.values
    X_test_values = X_test.values
else:
    X_train_values = X_train
    X_val_values = X_val
    X_test_values = X_test

if isinstance(y_train, pd.Series):
    y_train_values = y_train.values
    y_val_values = y_val.values
    y_test_values = y_test.values
else:
    y_train_values = y_train
    y_val_values = y_val
    y_test_values = y_test

Class distribution: [169418 760964]
Class weight ratio: 0.22


In [6]:
"""tabnet_model = TabNetClassifier(
    n_d=32,  # Reduced from 64
    n_a=32,  # Reduced from 64
    n_steps=3,  # Reduced from 5
    gamma=1.5,  # Scaling factor for attention
    n_independent=1,  # Reduced from 2
    n_shared=1,  # Reduced from 2
    momentum=0.3,
    mask_type='entmax',
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params=dict(
        mode="min",
        patience=10,
        min_lr=1e-5,
        factor=0.5
    ),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
    verbose=1,
    device_name='mps'  
)

# Weights to handle class imbalance
weights = np.ones(y_train_values.shape[0])
weights[y_train_values == 1] = class_weight

start_time = time.time()

tabnet_model.fit(
    X_train_values, y_train_values,
    eval_set=[(X_val_values, y_val_values)],
    max_epochs=50,  # Reduced from 200
    patience=10,  # Reduced from 30
    batch_size=2048,  # Increased from 1024
    weights=weights,  # Apply class weights
    eval_metric=["auc", "accuracy"]
)

training_time = time.time() - start_time
print(f"TabNet training completed in {training_time:.2f} seconds.")"""

'tabnet_model = TabNetClassifier(\n    n_d=32,  # Reduced from 64\n    n_a=32,  # Reduced from 64\n    n_steps=3,  # Reduced from 5\n    gamma=1.5,  # Scaling factor for attention\n    n_independent=1,  # Reduced from 2\n    n_shared=1,  # Reduced from 2\n    momentum=0.3,\n    mask_type=\'entmax\',\n    lambda_sparse=1e-3,\n    optimizer_fn=torch.optim.Adam,\n    optimizer_params=dict(lr=2e-2),\n    scheduler_params=dict(\n        mode="min",\n        patience=10,\n        min_lr=1e-5,\n        factor=0.5\n    ),\n    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,\n    verbose=1,\n    device_name=\'mps\'  \n)\n\n# Weights to handle class imbalance\nweights = np.ones(y_train_values.shape[0])\nweights[y_train_values == 1] = class_weight\n\nstart_time = time.time()\n\ntabnet_model.fit(\n    X_train_values, y_train_values,\n    eval_set=[(X_val_values, y_val_values)],\n    max_epochs=50,  # Reduced from 200\n    patience=10,  # Reduced from 30\n    batch_size=2048,  # Increased 

In [7]:
# Calculate class weights properly
n_samples = len(y_train_values)
n_class_1 = sum(y_train_values == 1)
n_class_0 = n_samples - n_class_1

# If class 0 is the minority class
if n_class_0 < n_class_1:
    # Weight class 0 (minority) higher - a more moderate approach
    class_0_weight = n_class_1 / n_class_0 * 2  # More moderate multiplier
    class_1_weight = 1.0
else:
    # If class 1 is the minority class
    class_0_weight = 1.0
    class_1_weight = n_class_0 / n_class_1 * 2

# Apply weights
weights = np.ones(y_train_values.shape[0])
weights[y_train_values == 0] = class_0_weight
weights[y_train_values == 1] = class_1_weight

print(f"Class 0 weight: {class_0_weight}, Class 1 weight: {class_1_weight}")

Class 0 weight: 8.983272143455832, Class 1 weight: 1.0


In [8]:
tabnet_model = TabNetClassifier(
    n_d=24,  # Slightly increased from 8
    n_a=24,  # Slightly increased from 8
    n_steps=3,  # Increased from 1
    gamma=1.5,
    n_independent=1,
    n_shared=1,
    momentum=0.3,
    mask_type='entmax',
    lambda_sparse=1e-3,
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,  # Added back
    scheduler_params=dict(
        mode="min",
        patience=3,
        min_lr=1e-5,
        factor=0.5
    ),
    verbose=1,  # Set to 1 to see progress
    device_name='cpu'
)

# Use class weighting again
weights = np.ones(y_train_values.shape[0])
weights[y_train_values == 1] = class_weight

# Use more data - 50% instead of 20%
sample_size = int(X_train_values.shape[0] * 1)
indices = np.random.choice(X_train_values.shape[0], sample_size, replace=False)
X_sample = X_train_values[indices]
y_sample = y_train_values[indices]

start_time = time.time()

tabnet_model.fit(
    X_sample, y_sample,
    eval_set=[(X_val_values, y_val_values)],  # Use full validation set
    max_epochs=20,  # More epochs
    patience=5,  # More patience
    batch_size=2048,  # Slightly smaller batch
    weights=weights[indices],  # Use weights
    eval_metric=["auc", "accuracy"]  # Track both metrics
)

training_time = time.time() - start_time
print(f"TabNet training completed in {training_time:.2f} seconds.")



epoch 0  | loss: 0.65411 | val_0_auc: 0.63964 | val_0_accuracy: 0.53584 |  0:00:43s
epoch 1  | loss: 0.63432 | val_0_auc: 0.67464 | val_0_accuracy: 0.56617 |  0:01:26s
epoch 2  | loss: 0.62738 | val_0_auc: 0.68965 | val_0_accuracy: 0.59908 |  0:02:09s
epoch 3  | loss: 0.62302 | val_0_auc: 0.69849 | val_0_accuracy: 0.59104 |  0:02:52s
epoch 4  | loss: 0.62117 | val_0_auc: 0.70745 | val_0_accuracy: 0.63459 |  0:03:36s
epoch 5  | loss: 0.61857 | val_0_auc: 0.71139 | val_0_accuracy: 0.60272 |  0:04:19s
epoch 6  | loss: 0.61734 | val_0_auc: 0.71299 | val_0_accuracy: 0.64882 |  0:05:01s
epoch 7  | loss: 0.6171  | val_0_auc: 0.71419 | val_0_accuracy: 0.66815 |  0:05:44s
epoch 8  | loss: 0.61689 | val_0_auc: 0.71259 | val_0_accuracy: 0.66797 |  0:06:27s
epoch 9  | loss: 0.61508 | val_0_auc: 0.71477 | val_0_accuracy: 0.63915 |  0:07:10s
epoch 10 | loss: 0.61471 | val_0_auc: 0.71384 | val_0_accuracy: 0.62091 |  0:07:54s
epoch 11 | loss: 0.61351 | val_0_auc: 0.71472 | val_0_accuracy: 0.62327 |  0



TabNet training completed in 626.74 seconds.


In [9]:
feature_importances = tabnet_model.feature_importances_
if isinstance(X_train, pd.DataFrame):
    feature_names = X_train.columns.tolist()
    importance_df = pd.DataFrame({
        'Feature': feature_names,
        'Importance': feature_importances
    }).sort_values('Importance', ascending=False)
    print("\nTop 10 most important features (TabNet):")
    print(importance_df.head(10))
else:
    print("\nTabNet feature importances:")
    top_indices = np.argsort(feature_importances)[-10:][::-1]
    for idx in top_indices:
        print(f"Feature {idx}: {feature_importances[idx]:.4f}")


TabNet feature importances:
Feature 70: 0.2332
Feature 62: 0.1833
Feature 0: 0.1493
Feature 50: 0.1213
Feature 31: 0.0595
Feature 58: 0.0419
Feature 3: 0.0414
Feature 60: 0.0281
Feature 6: 0.0266
Feature 21: 0.0158


In [10]:
y_pred_proba = tabnet_model.predict_proba(X_test_values)
y_pred = np.argmax(y_pred_proba, axis=1)

accuracy = accuracy_score(y_test_values, y_pred)
print(f"\nTest Accuracy: {accuracy:.4f}")

cm = confusion_matrix(y_test_values, y_pred)
print("\nConfusion Matrix:")
print(cm)

class_0_acc = cm[0,0] / (cm[0,0] + cm[0,1]) if (cm[0,0] + cm[0,1]) > 0 else 0
class_1_acc = cm[1,1] / (cm[1,0] + cm[1,1]) if (cm[1,0] + cm[1,1]) > 0 else 0
print(f"Class 0 accuracy: {class_0_acc:.4f}")
print(f"Class 1 accuracy: {class_1_acc:.4f}")

model_path = "tabnet_model.zip"
tabnet_model.save_model(model_path)
print(f"\nModel saved to {model_path}")


Test Accuracy: 0.6599

Confusion Matrix:
[[ 51009  31394]
 [ 95169 194580]]
Class 0 accuracy: 0.6190
Class 1 accuracy: 0.6715
Successfully saved model at tabnet_model.zip.zip

Model saved to tabnet_model.zip


In [11]:
# loaded_model = TabNetClassifier()
# loaded_model.load_model("tabnet_model.zip")