# Neural Network 

Fingerprint: Coulomb

In [1]:
from soap import *
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import KFold  
import joblib  # For saving and loading scaler


Train data shape: (8000, 4)
Test data shape: (4000, 3)
1
1001
2001
3001
4001
5001
6001
7001
With 200 PCA components 93.8845% of the variance is explained


In [2]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=251)


In [3]:
# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Normalize the target (hform)
#target_scaler = MinMaxScaler()  # You can use StandardScaler if needed
#y_train = target_scaler.fit_transform(y_train.reshape(-1, 1) if isinstance(y_train, np.ndarray) else y_train.to_numpy().reshape(-1, 1))
#y_test = target_scaler.transform(y_test.reshape(-1, 1) if isinstance(y_test, np.ndarray) else y_test.to_numpy().reshape(-1, 1))

# Ensure y_train and y_test are properly converted to NumPy arrays
X_train = torch.tensor(X_train, dtype=torch.float32)

# Convert y_train and y_test to NumPy arrays if they are Series or other objects
if isinstance(y_train, pd.Series):
    y_train = y_train.to_numpy()

if isinstance(y_test, pd.Series):
    y_test = y_test.to_numpy()

# Convert to PyTorch tensors
y_train = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)  # Add dimension
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)  # Add dimension



In [4]:
# Define the neural network
class RegressionNN(nn.Module):
    def __init__(self, input_dim):
        super(RegressionNN, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.2),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.2),
            nn.Linear(128, 1)  # Single output for regression
        )
    
    def forward(self, x):
        return self.model(x)



In [7]:

# Define cross-validation training loop with train_test_split
def cross_val_train(model_class, X_train, y_train, epochs, k_folds):
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    fold_results = []
    best_val_loss = float('inf')
    best_model_state = None

    for fold, (train_idx, val_idx) in enumerate(kfold.split(X_train)):
        print(f"\nFold {fold + 1}/{k_folds}")

        # Use train_test_split to split the fold's training data
        X_fold_train, X_val, y_fold_train, y_val = train_test_split(
            X_train[train_idx], y_train[train_idx], test_size=0.2, random_state=42
        )

        # Initialize model, optimizer, scheduler
        model = model_class(X_fold_train.shape[1])
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)
        criterion = nn.MSELoss()

        for epoch in range(epochs):
            # Training phase
            model.train()
            optimizer.zero_grad()
            outputs = model(X_fold_train)
            loss = criterion(outputs, y_fold_train)
            loss.backward()
            optimizer.step()

            # Step the learning rate scheduler
            scheduler.step()

            # Evaluation phase
            model.eval()
            with torch.no_grad():
                val_outputs = model(X_val)
                val_loss = criterion(val_outputs, y_val)

            # Convert MSE to RMSE for better interpretability
            rmse = torch.sqrt(loss).item()
            val_rmse = torch.sqrt(val_loss).item()

            # Print RMSE every 50 epochs
            if (epoch + 1) % 5 == 0:
                print(f"Epoch [{epoch + 1}/{epochs}], RMSE: {rmse:.4f}, Val RMSE: {val_rmse:.4f}")

        # Store final validation loss for the fold
        fold_results.append(val_loss.item())

        # Save the model state if it's the best so far
        if val_loss.item() < best_val_loss:
            best_val_loss = val_loss.item()
            best_model_state = model.state_dict()

    # Print overall results
    print("\nCross-Validation Results:")
    print(f"Fold Losses: {fold_results}")
    print(f"Mean Validation Loss: {np.mean(fold_results):.4f}")
    print(f"Standard Deviation: {np.std(fold_results):.4f}")

    # Save the best model state
    torch.save(best_model_state, "best_model.pth")
    print("Best model saved as 'best_model.pth'.")

In [8]:
# Initialize loss function
criterion = nn.MSELoss()

# Perform cross-validation
cross_val_train(RegressionNN, X_train, y_train, epochs=100, k_folds=2)

# Save the model
# Example usage: torch.save(model.state_dict(), "enhanced_regression_model.pth")





Fold 1/2
Epoch [5/100], RMSE: 0.7969, Val RMSE: 0.8387
Epoch [10/100], RMSE: 0.7181, Val RMSE: 0.7378
Epoch [15/100], RMSE: 0.6717, Val RMSE: 0.7065
Epoch [20/100], RMSE: 0.6228, Val RMSE: 0.6647
Epoch [25/100], RMSE: 0.5962, Val RMSE: 0.6311
Epoch [30/100], RMSE: 0.5724, Val RMSE: 0.6231
Epoch [35/100], RMSE: 0.5534, Val RMSE: 0.6049
Epoch [40/100], RMSE: 0.5301, Val RMSE: 0.5988
Epoch [45/100], RMSE: 0.4927, Val RMSE: 0.5731
Epoch [50/100], RMSE: 0.4864, Val RMSE: 0.5537
Epoch [55/100], RMSE: 0.4609, Val RMSE: 0.5489
Epoch [60/100], RMSE: 0.4475, Val RMSE: 0.5462
Epoch [65/100], RMSE: 0.4490, Val RMSE: 0.5318
Epoch [70/100], RMSE: 0.4400, Val RMSE: 0.5184
Epoch [75/100], RMSE: 0.4272, Val RMSE: 0.5250
Epoch [80/100], RMSE: 0.4118, Val RMSE: 0.5034
Epoch [85/100], RMSE: 0.4158, Val RMSE: 0.4968
Epoch [90/100], RMSE: 0.4126, Val RMSE: 0.5052
Epoch [95/100], RMSE: 0.3742, Val RMSE: 0.4773
Epoch [100/100], RMSE: 0.3935, Val RMSE: 0.4849

Fold 2/2
Epoch [5/100], RMSE: 0.8066, Val RMSE: 0

KeyboardInterrupt: 

In [9]:
# Load the best model and train it on the full training set
def train_on_full_data(model_class, X_train, y_train, X_test, y_test, criterion, epochs=100, patience=100):
    model = model_class(X_train.shape[1])
    optimizer = torch.optim.AdamW(model.parameters(), lr=0.01, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    
    # Load the best model state
    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0

    model.load_state_dict(torch.load("best_model.pth"))

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train)
        loss = criterion(outputs, y_train)
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Validation step
        model.eval()
        with torch.no_grad():
            val_outputs = model(X_test)
            val_loss = criterion(val_outputs, y_test)

        # Step the learning rate scheduler
        scheduler.step(val_loss)

        # Save the best model if validation loss improves
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict()
            patience_counter = 0
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= patience:
            print(f"Early stopping triggered at epoch {epoch}")
            break

        # Log progress and learning rate every 100 epochs
        if (epoch + 1) % 5 == 0:
            current_lr = optimizer.param_groups[0]['lr']
            rmse = torch.sqrt(loss).item()
            print(f"Epoch [{epoch + 1}/{epochs}], RMSE: {rmse:.4f}, Learning Rate: {current_lr:.6f}")
        current_lr = optimizer.param_groups[0]['lr']
    print("Training on full dataset completed.")

    # Evaluate on test set
    model.eval()
    with torch.no_grad():
        test_outputs = model(X_test)
        test_loss = criterion(test_outputs, y_test)
        test_rmse = torch.sqrt(test_loss).item()
        print(f"\nTest RMSE: {test_rmse:.4f}")

    # Save the final model
    torch.save(model.state_dict(), "final_model.pth")
    print("Final model saved as 'final_model.pth'.")


# Train the best model on the full training set and evaluate on test set
train_on_full_data(RegressionNN, X_train, y_train, X_test, y_test, criterion, epochs=100)

  model.load_state_dict(torch.load("best_model.pth"))


Epoch [5/100], RMSE: nan, Learning Rate: nan


KeyboardInterrupt: 

In [33]:
# Evaluate submission preparation on known test data
def evaluate_submission_pipeline(model, X_test, y_test):
    # Convert PyTorch tensor back to numpy for scaling compatibility
    X_test_numpy = X_test.numpy()
    
    # Predict using the trained model
    model.eval()
    with torch.no_grad():
        y_pred = model(X_test).numpy()

    # Descend predictions back to original scale
    y_pred = y_pred
    y_test = y_test.numpy()

    # Calculate RMSE on original scale
    rmse = np.sqrt(np.mean((y_pred - y_test) ** 2))
    print(f"RMSE on original scale: {rmse:.4f}")

    # Display some predictions vs. actuals
    comparison_df = pd.DataFrame({
        "Actual": y_test.flatten(),
        "Predicted": y_pred.flatten()
    }).head(10)
    print("\nSample Predictions vs Actuals:")
    print(comparison_df)

    return comparison_df, rmse

# Ensure the final model is loaded
final_model = RegressionNN(X_train.shape[1])
final_model.load_state_dict(torch.load("final_model.pth"))

# Evaluate submission pipeline on known data
comparison_df, rmse = evaluate_submission_pipeline(final_model, X_test, y_test)


RMSE on original scale: 0.5092

Sample Predictions vs Actuals:
     Actual  Predicted
0 -0.033324   0.180106
1 -1.398727  -0.994604
2 -1.187238  -0.954620
3 -0.948828  -0.649058
4  0.066191  -0.083653
5 -1.202062  -1.667965
6 -0.741397  -0.330585
7 -3.545803  -2.085396
8 -0.285965  -0.973162
9 -1.561886  -1.302669


  final_model.load_state_dict(torch.load("final_model.pth"))


[0.11333410441875458]
[0.11041968315839767]