In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
import optuna

class LoanDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.tensor(features, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.float32)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

class LoanPreprocessor:
    def __init__(self):
        self.categorical_features = [
            'person_home_ownership', 
            'loan_intent', 
            'loan_grade', 
            'cb_person_default_on_file'
        ]
        
        self.numerical_features = [
            'person_age', 
            'person_income', 
            'person_emp_length', 
            'loan_amnt', 
            'loan_int_rate', 
            'loan_percent_income', 
            'cb_person_cred_hist_length'
        ]
        
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), self.numerical_features),
                ('cat', OneHotEncoder(handle_unknown='ignore'), self.categorical_features)
            ])
    
    def fit_transform(self, X, y=None):
        return self.preprocessor.fit_transform(X)
    
    def transform(self, X):
        return self.preprocessor.transform(X)

class LoanNeuralNetwork(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, dropout_rate):
        super(LoanNeuralNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(input_size, hidden_size1),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size1, hidden_size2),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size2, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.network(x)

class EarlyStopping:
    def __init__(self, patience=5, verbose=False):
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False

    def __call__(self, score):
        if self.best_score is None:
            self.best_score = score
        elif score < self.best_score:  # Assuming lower score is better (e.g., validation loss)
            self.best_score = score
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
                if self.verbose:
                    print("Early stopping triggered.")

def validate_model(model, val_loader, criterion, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            loss = criterion(outputs.view(-1), labels)
            total_loss += loss.item()
    return total_loss / len(val_loader)

def train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=50, patience=5):
    model.train()
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    best_val_loss = float('inf')  # Initialize best validation loss
    best_model_weights = None  # To store the best model weights

    for epoch in range(epochs):
        total_loss = 0
        for features, labels in train_loader:
            features, labels = features.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs.view(-1), labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        avg_loss = total_loss / len(train_loader)
        val_loss = validate_model(model, val_loader, criterion, device)
        print(f'Epoch [{epoch+1}/{epochs}], Training Loss: {avg_loss:.4f}, Validation Loss: {val_loss:.4f}')

        # Check early stopping
        early_stopping(val_loss)
        if early_stopping.early_stop:
            print("Training stopped early.")
            break

        # Save the best model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_weights = model.state_dict()  # Save the model weights
            print("Best model weights saved.")

    return best_model_weights  # Return the best model weights

def objective(trial):
    hidden_size1 = trial.suggest_int('hidden_size1', 32, 128)
    hidden_size2 = trial.suggest_int('hidden_size2', 16, 64)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_int('batch_size', 16, 64)

    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    model = LoanNeuralNetwork(input_size=X_train.shape[1], 
                               hidden_size1=hidden_size1, 
                               hidden_size2=hidden_size2, 
                               dropout_rate=dropout_rate).to(device)

    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train the model and get the best model weights
    best_model_weights = train_model(model, train_loader, val_loader, criterion, optimizer, device, epochs=50)

    # Evaluate the model
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = model(features)
            predicted = (outputs.view(-1) > 0.5).float()  # Ensure outputs are flattened
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total

    # Save the best model weights to a file
    model_save_path = f"best_model_trial_{trial.number}.pth"
    torch.save(best_model_weights, model_save_path)
    print(f"Best model weights saved to {model_save_path}")

    return accuracy

data = pd.read_csv('./data/train.csv')
X = data.drop('loan_status', axis=1)
y = data['loan_status']

preprocessor = LoanPreprocessor()
X_processed = preprocessor.fit_transform(X)

X_train, X_temp, y_train, y_temp = train_test_split(
    X_processed, y, test_size=0.3, random_state=420
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=69
)

train_dataset = LoanDataset(X_train, y_train.values)
val_dataset = LoanDataset(X_val, y_val.values)
test_dataset = LoanDataset(X_test, y_test.values)

# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Create an Optuna study
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=6)

# Print the best hyperparameters
print("Best hyperparameters: ", study.best_params)
print("Best accuracy: ", study.best_value)

# Create a DataFrame to store the results
results = []
for trial in study.trials:
    results.append({
        'Trial': trial.number,
        'Hidden Size 1': trial.params['hidden_size1'],
        'Hidden Size 2': trial.params['hidden_size2'],
        'Dropout Rate': trial.params['dropout_rate'],
        'Learning Rate': trial.params['learning_rate'],
        'Batch Size': trial.params['batch_size'],
        'Accuracy': trial.value
    })

results_df = pd.DataFrame(results)

# Display the results
print("\nHyperparameter Tuning Results:")
print(results_df)

# Optionally, save the results to a CSV file
results_df.to_csv('hyperparameter_tuning_results.csv', index=False)

[I 2024-12-20 04:11:18,591] A new study created in memory with name: no-name-80b3b182-7aad-42bb-8910-96b6ed2ad136


Epoch [1/50], Training Loss: 0.2299, Validation Loss: 0.2092
Best model weights saved.
Epoch [2/50], Training Loss: 0.1994, Validation Loss: 0.1951
Best model weights saved.
Epoch [3/50], Training Loss: 0.1969, Validation Loss: 0.1995
Epoch [4/50], Training Loss: 0.1911, Validation Loss: 0.1987
Epoch [5/50], Training Loss: 0.1881, Validation Loss: 0.1924
Best model weights saved.
Epoch [6/50], Training Loss: 0.1898, Validation Loss: 0.2002
Epoch [7/50], Training Loss: 0.1922, Validation Loss: 0.1986
Epoch [8/50], Training Loss: 0.1871, Validation Loss: 0.1979
Epoch [9/50], Training Loss: 0.1848, Validation Loss: 0.1919
Best model weights saved.
Epoch [10/50], Training Loss: 0.1842, Validation Loss: 0.1967
Epoch [11/50], Training Loss: 0.1859, Validation Loss: 0.1934
Epoch [12/50], Training Loss: 0.1839, Validation Loss: 0.1934
Epoch [13/50], Training Loss: 0.1840, Validation Loss: 0.1910
Best model weights saved.
Epoch [14/50], Training Loss: 0.1848, Validation Loss: 0.1964
Epoch [15/5

[I 2024-12-20 04:11:44,882] Trial 0 finished with value: 0.9410026145276799 and parameters: {'hidden_size1': 85, 'hidden_size2': 53, 'dropout_rate': 0.11575027750757352, 'learning_rate': 0.03245367661353664, 'batch_size': 51}. Best is trial 0 with value: 0.9410026145276799.


Epoch [18/50], Training Loss: 0.1815, Validation Loss: 0.1930
Early stopping triggered.
Training stopped early.
Best model weights saved to best_model_trial_0.pth
Epoch [1/50], Training Loss: 0.3072, Validation Loss: 0.2727
Best model weights saved.
Epoch [2/50], Training Loss: 0.2146, Validation Loss: 0.2073
Best model weights saved.
Epoch [3/50], Training Loss: 0.1975, Validation Loss: 0.1920
Best model weights saved.
Epoch [4/50], Training Loss: 0.1951, Validation Loss: 0.1957
Epoch [5/50], Training Loss: 0.1926, Validation Loss: 0.1971
Epoch [6/50], Training Loss: 0.1898, Validation Loss: 0.1894
Best model weights saved.
Epoch [7/50], Training Loss: 0.1924, Validation Loss: 0.1948
Epoch [8/50], Training Loss: 0.1895, Validation Loss: 0.1923
Epoch [9/50], Training Loss: 0.1896, Validation Loss: 0.1962
Epoch [10/50], Training Loss: 0.1898, Validation Loss: 0.1917


[I 2024-12-20 04:12:00,054] Trial 1 finished with value: 0.9438444924406048 and parameters: {'hidden_size1': 68, 'hidden_size2': 26, 'dropout_rate': 0.3167053193207532, 'learning_rate': 0.058590104996889894, 'batch_size': 51}. Best is trial 1 with value: 0.9438444924406048.


Epoch [11/50], Training Loss: 0.1881, Validation Loss: 0.1958
Early stopping triggered.
Training stopped early.
Best model weights saved to best_model_trial_1.pth
Epoch [1/50], Training Loss: 0.2357, Validation Loss: 0.1966
Best model weights saved.
Epoch [2/50], Training Loss: 0.1872, Validation Loss: 0.1936
Best model weights saved.
Epoch [3/50], Training Loss: 0.1827, Validation Loss: 0.1922
Best model weights saved.
Epoch [4/50], Training Loss: 0.1799, Validation Loss: 0.1939
Epoch [5/50], Training Loss: 0.1780, Validation Loss: 0.1874
Best model weights saved.
Epoch [6/50], Training Loss: 0.1761, Validation Loss: 0.1879
Epoch [7/50], Training Loss: 0.1754, Validation Loss: 0.1868
Best model weights saved.
Epoch [8/50], Training Loss: 0.1734, Validation Loss: 0.1864
Best model weights saved.
Epoch [9/50], Training Loss: 0.1724, Validation Loss: 0.1852
Best model weights saved.
Epoch [10/50], Training Loss: 0.1710, Validation Loss: 0.1873
Epoch [11/50], Training Loss: 0.1697, Valida

[I 2024-12-20 04:12:55,129] Trial 2 finished with value: 0.9460043196544277 and parameters: {'hidden_size1': 78, 'hidden_size2': 45, 'dropout_rate': 0.367835810890057, 'learning_rate': 0.0018573713305110227, 'batch_size': 23}. Best is trial 2 with value: 0.9460043196544277.


Epoch [20/50], Training Loss: 0.1610, Validation Loss: 0.1866
Early stopping triggered.
Training stopped early.
Best model weights saved to best_model_trial_2.pth
Epoch [1/50], Training Loss: 0.3112, Validation Loss: 0.2353
Best model weights saved.
Epoch [2/50], Training Loss: 0.2213, Validation Loss: 0.2123
Best model weights saved.
Epoch [3/50], Training Loss: 0.2026, Validation Loss: 0.2025
Best model weights saved.
Epoch [4/50], Training Loss: 0.1934, Validation Loss: 0.1983
Best model weights saved.
Epoch [5/50], Training Loss: 0.1887, Validation Loss: 0.1947
Best model weights saved.
Epoch [6/50], Training Loss: 0.1856, Validation Loss: 0.1924
Best model weights saved.
Epoch [7/50], Training Loss: 0.1839, Validation Loss: 0.1914
Best model weights saved.
Epoch [8/50], Training Loss: 0.1822, Validation Loss: 0.1913
Best model weights saved.
Epoch [9/50], Training Loss: 0.1812, Validation Loss: 0.1913
Best model weights saved.
Epoch [10/50], Training Loss: 0.1801, Validation Loss:

[I 2024-12-20 04:14:26,584] Trial 3 finished with value: 0.9425940661589178 and parameters: {'hidden_size1': 72, 'hidden_size2': 22, 'dropout_rate': 0.3743961454979392, 'learning_rate': 0.00023916543792409985, 'batch_size': 19}. Best is trial 2 with value: 0.9460043196544277.


Best model weights saved to best_model_trial_3.pth
Epoch [1/50], Training Loss: 0.2339, Validation Loss: 0.2144
Best model weights saved.
Epoch [2/50], Training Loss: 0.1914, Validation Loss: 0.2109
Best model weights saved.
Epoch [3/50], Training Loss: 0.1884, Validation Loss: 0.1895
Best model weights saved.
Epoch [4/50], Training Loss: 0.1845, Validation Loss: 0.1913
Epoch [5/50], Training Loss: 0.1833, Validation Loss: 0.1881
Best model weights saved.
Epoch [6/50], Training Loss: 0.1816, Validation Loss: 0.1957
Epoch [7/50], Training Loss: 0.1809, Validation Loss: 0.1924
Epoch [8/50], Training Loss: 0.1790, Validation Loss: 0.1950
Epoch [9/50], Training Loss: 0.1779, Validation Loss: 0.1907


[I 2024-12-20 04:14:38,809] Trial 4 finished with value: 0.9417983403432989 and parameters: {'hidden_size1': 49, 'hidden_size2': 39, 'dropout_rate': 0.2536293032028729, 'learning_rate': 0.017264053227669874, 'batch_size': 64}. Best is trial 2 with value: 0.9460043196544277.


Epoch [10/50], Training Loss: 0.1771, Validation Loss: 0.1898
Early stopping triggered.
Training stopped early.
Best model weights saved to best_model_trial_4.pth
Epoch [1/50], Training Loss: 0.3269, Validation Loss: 0.2389
Best model weights saved.
Epoch [2/50], Training Loss: 0.2211, Validation Loss: 0.2107
Best model weights saved.
Epoch [3/50], Training Loss: 0.1986, Validation Loss: 0.2002
Best model weights saved.
Epoch [4/50], Training Loss: 0.1895, Validation Loss: 0.1949
Best model weights saved.
Epoch [5/50], Training Loss: 0.1852, Validation Loss: 0.1926
Best model weights saved.
Epoch [6/50], Training Loss: 0.1830, Validation Loss: 0.1913
Best model weights saved.
Epoch [7/50], Training Loss: 0.1818, Validation Loss: 0.1909
Best model weights saved.
Epoch [8/50], Training Loss: 0.1808, Validation Loss: 0.1901
Best model weights saved.
Epoch [9/50], Training Loss: 0.1790, Validation Loss: 0.1911
Epoch [10/50], Training Loss: 0.1782, Validation Loss: 0.1899
Best model weights

[I 2024-12-20 04:15:00,432] Trial 5 finished with value: 0.9421393656928498 and parameters: {'hidden_size1': 68, 'hidden_size2': 30, 'dropout_rate': 0.48338386851063697, 'learning_rate': 0.0004319376049765508, 'batch_size': 50}. Best is trial 2 with value: 0.9460043196544277.


Epoch [15/50], Training Loss: 0.1755, Validation Loss: 0.1912
Early stopping triggered.
Training stopped early.
Best model weights saved to best_model_trial_5.pth
Best hyperparameters:  {'hidden_size1': 78, 'hidden_size2': 45, 'dropout_rate': 0.367835810890057, 'learning_rate': 0.0018573713305110227, 'batch_size': 23}
Best accuracy:  0.9460043196544277

Hyperparameter Tuning Results:
   Trial  Hidden Size 1  Hidden Size 2  Dropout Rate  Learning Rate  \
0      0             85             53      0.115750       0.032454   
1      1             68             26      0.316705       0.058590   
2      2             78             45      0.367836       0.001857   
3      3             72             22      0.374396       0.000239   
4      4             49             39      0.253629       0.017264   
5      5             68             30      0.483384       0.000432   

   Batch Size  Accuracy  
0          51  0.941003  
1          51  0.943844  
2          23  0.946004  
3          

In [16]:
# Load the best model weights
best_model_weights = torch.load(f"best_model_trial_{study.best_trial.number}.pth")

preprocessor = LoanPreprocessor()

# Load the best model
best_model = LoanNeuralNetwork(
    input_size=submission_data.shape[1], 
    hidden_size1=study.best_params['hidden_size1'], 
    hidden_size2=study.best_params['hidden_size2'], 
    dropout_rate=study.best_params['dropout_rate']
).to(device)

# Load the best model weights
best_model.load_state_dict(best_model_weights)

criterion = nn.BCELoss()

In [17]:
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)
test_loss = validate_model(best_model, test_loader, criterion, device)
print(f'Test Loss: {test_loss:.4f}')

Test Loss: 0.1822


In [20]:
best_model.eval()
correct = 0
total = 0
with torch.no_grad():
    for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = best_model(features)
            predicted = (outputs.view(-1) > 0.5).float()  # Ensure outputs are flattened
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    test_accuracy = correct / total
print(f'Test accuracy: {test_accuracy:.4f}')

Test accuracy: 0.9475


In [21]:
# Load the test data
raw_submission_data = pd.read_csv('./data/test.csv')
submission_data = preprocessor.fit_transform(raw_submission_data)

# Create the test dataset
submission_dataset = LoanDataset(submission_data, np.zeros(len(submission_data)))  # Placeholder for labels

# Create the test data loader
submission_loader = DataLoader(submission_dataset, batch_size=study.best_params['batch_size'], shuffle=False)


In [25]:
# Make predictions on the test data
predictions = []
best_model.eval()
with torch.no_grad():
    for features, _ in submission_loader:  # Ignore labels in test data
        features = features.to(device)
        outputs = best_model(features)
        predicted = (outputs.view(-1) > 0.5).float()  # Ensure outputs are flattened
        predictions.extend(predicted.cpu().numpy())

# Create a DataFrame to store the predictions
predictions_df = pd.DataFrame({
    'id': raw_submission_data['id'],  # Include the original 'id' column
    'loan_status': predictions
})

# Save the predictions to a CSV file
predictions_df.to_csv('test_predictions.csv', index=False)

print("Test predictions saved to test_predictions.csv")

Test predictions saved to test_predictions.csv
