# Initial setup

In [1]:
import itertools
import os
from getpass import getpass

import matplotlib.pyplot as plt
import numpy as np
import pytorch_lightning as pl
import torch
import wandb
from dotenv import load_dotenv
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from sklearn.model_selection import KFold, train_test_split
from torch.utils.data import DataLoader

from models import CNNClassifier, MatrixDataset, get_feature_maps

# Load and prepare data

In [2]:
# Load the input matrices and their K-means cluster labels from disk.
data_path = "../data/"

# os.path.join keeps the paths valid whether or not data_path ends with a separator.
all_C = np.load(os.path.join(data_path, 'all_C.npy')) # 1326 input matrices of size 30x30
# nMF_label = np.load(data_path + 'nMF_labels.npy') # 1326 output labels (0 or 1)
kmeans_label = np.load(os.path.join(data_path, 'KMeans_labels_k3.npy')) # 1326 output labels (0, 1 or 2)

# Every matrix must have exactly one label. Fail here rather than letting a
# mismatch surface later as an IndexError or as silently misaligned pairs.
if len(all_C) != len(kmeans_label):
    raise ValueError(
        f"Mismatched data: {len(all_C)} input matrices but {len(kmeans_label)} labels"
    )

print(f"Number of C and kmeans labels:\n {len(all_C)}, {len(kmeans_label)}\n")

Number of C and kmeans labels:
 1326, 1326



In [3]:
# Discard every sample whose K-means label is 0, filtering inputs and labels
# with the same indices so they stay aligned.
# NOTE: all_C and kmeans_label are rebound in place. Re-running this cell after
# the relabelling cell below (which maps 1 -> 0) would also drop that class,
# so reload the data first if this cell has to be run again.
non_zero_indices = np.nonzero(kmeans_label)[0]
all_C, kmeans_label = all_C[non_zero_indices], kmeans_label[non_zero_indices]

print(f"Number of Cs and labels after removing those with kmeans zero labels:\n {len(all_C)}, {len(kmeans_label)}\n")

Number of Cs and labels after removing those with kmeans zero labels:
 1325, 1325



In [4]:
# Map the two remaining K-means clusters onto binary targets: 1 -> 0 and 2 -> 1.
#
# The remap runs only while a label 2 is still present. Without this guard the
# cell is not idempotent: on a second run the 1s written by the first run would
# be sent to 0 as well, leaving every sample in class 0.
if np.any(kmeans_label == 2):
    kmeans_label[kmeans_label == 1] = 0  # Replace 1 with 0
    kmeans_label[kmeans_label == 2] = 1  # Replace 2 with 1

print(f"Updated kmeans_label: {np.unique(kmeans_label, return_counts=True)}")

Updated kmeans_label: (array([0, 1], dtype=int32), array([795, 530]))


In [5]:
# Partition the input matrices into one array per binary label (0 first, then 1).
class_0_C, class_1_C = (all_C[kmeans_label == target] for target in (0, 1))

print(f"Number of input matrices with kmeans label 0: {class_0_C.shape[0]}")
print(f"Number of input matrices with kmeans label 1: {class_1_C.shape[0]}")

Number of input matrices with kmeans label 0: 795
Number of input matrices with kmeans label 1: 530


# Training the Model

In [12]:
# -----------------------------------------------------------------------------
# Hyperparameter grid search with k-fold cross-validation on the train+val
# split, followed by a final fit on all of train+val and one evaluation on the
# held-out test split. Runs are tracked in Weights & Biases.
# -----------------------------------------------------------------------------

# Experiment constants (kept together so they are easy to tune)
SEED = 42                          # used for RNG seeding, the hold-out split and the fold split
TEST_FRACTION = 0.2                # share of the data held out for the final test
N_SPLITS = 10                      # number of cross-validation folds
MAX_EPOCHS = 100                   # epochs per fit (each fold and the final model)
WANDB_PROJECT = "inverse_problem"  # W&B project that receives every run

# Seed Python's `random`, NumPy and PyTorch so that weight initialisation and
# batch shuffling are repeatable across executions of this cell.
pl.seed_everything(SEED, workers=True)

# Load environment variables from .env file
load_dotenv()

# Get my API key from environment variable first
wandb_key = os.getenv('WANDB_API_KEY')

# If not found in environment, prompt user
if not wandb_key:
    wandb_key = getpass("Enter your Weights & Biases API key: ")

wandb.login(key=wandb_key)

# First split into train+val and test sets (80-20 split), preserving class ratios
X_trainval, X_test, y_trainval, y_test = train_test_split(
    all_C, kmeans_label, test_size=TEST_FRACTION, random_state=SEED, stratify=kmeans_label
)

# Hyperparameter grid
param_grid = {
    'learning_rate': [1e-3],
    'conv_channels': [[1, 32, 64, 128]],
    'fc_units': [[1152, 128, 2]],
    'dropout_rate': [0.2],
    'batch_size': [128]
}

# K-fold setup for cross-validation on training data
kfold = KFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# Best result seen so far. Starting from -inf (instead of 0) guarantees that the
# first evaluated combination is recorded even if its accuracy is exactly 0.
best_val_acc = float('-inf')
best_params = None

print("Data split sizes:")
print(f"Train + Validation: {len(X_trainval)} samples")
print(f"Test: {len(X_test)} samples\n")

print("Starting grid search with k-fold cross validation")
print(f"Parameter grid:\n{param_grid}")
print(f"Number of folds: {kfold.n_splits}\n")

# Grid search: itertools.product visits the combinations in the same order as
# five nested loops (learning rate outermost, batch size innermost).
for lr, conv, fc, dropout, batch_size in itertools.product(
    param_grid['learning_rate'],
    param_grid['conv_channels'],
    param_grid['fc_units'],
    param_grid['dropout_rate'],
    param_grid['batch_size'],
):
    print("\n" + "="*80)
    print(f"Training with parameters:")
    print(f"Learning rate: {lr}")
    print(f"Conv channels: {conv}")
    print(f"Fully connected layers: {fc}")
    print(f"Dropout rate: {dropout}")
    print(f"Batch size: {batch_size}\n")

    fold_scores = []

    # Initialize a new wandb run for this parameter combination
    run = wandb.init(
        project=WANDB_PROJECT,
        name=f"lr_{lr}_conv{conv}_fc{fc}_dropout_{dropout}_batch_{batch_size}",
        config={
            "learning_rate": lr,
            "conv_channels": conv,
            "fc_units": fc,
            "dropout_rate": dropout,
            "batch_size": batch_size
        },
        reinit=True
    )

    # try/finally so the W&B run is always closed, even if training raises.
    try:
        # K-fold cross validation
        for fold, (train_idx, val_idx) in enumerate(kfold.split(X_trainval)):
            print(f"\nFold {fold+1}/{kfold.n_splits}")
            print(f"Train size: {len(train_idx)}, Validation size: {len(val_idx)}")

            # Prepare data for this fold
            X_train, X_val = X_trainval[train_idx], X_trainval[val_idx]
            y_train, y_val = y_trainval[train_idx], y_trainval[val_idx]

            train_dataset = MatrixDataset(X_train, y_train)
            val_dataset = MatrixDataset(X_val, y_val)

            train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
            val_loader = DataLoader(val_dataset, batch_size=batch_size)

            # A fresh model per fold so no weights leak between folds
            model = CNNClassifier(
                learning_rate=lr,
                conv_channels=conv,
                fc_units=fc,
                dropout_rate=dropout
            )

            # Keep only the checkpoint with the highest 'val_acc' for this fold.
            # NOTE(review): ModelCheckpoint appends its own '.ckpt' extension, so
            # these files presumably end in '.ckpt.ckpt'. The name is left
            # untouched in case other cells load checkpoints by this path.
            checkpoint_callback = ModelCheckpoint(
                monitor='val_acc',
                dirpath='checkpoints/',
                filename=f'fold{fold}_lr{lr:.4f}_conv{conv}_fc{fc}_dropout{dropout:.2f}_batch{batch_size}.ckpt',
                save_top_k=1,
                mode='max'
            )

            # A W&B run is already active, so this logger attaches to it: all
            # folds of one parameter combination log into the same run.
            trainer = pl.Trainer(
                max_epochs=MAX_EPOCHS,
                logger=WandbLogger(project=WANDB_PROJECT),
                callbacks=[checkpoint_callback],
                accelerator='auto'
            )

            # Train model
            print("Training model...")
            trainer.fit(model, train_loader, val_loader)

            # Score of the fold = best 'val_acc' tracked by the checkpoint callback
            fold_score = checkpoint_callback.best_model_score.item()
            fold_scores.append(fold_score)
            print(f"Fold {fold+1} best validation accuracy: {fold_score:.4f}")

        # Calculate average score for this parameter combination
        avg_score = np.mean(fold_scores)
        print("\nResults for current parameters:")
        print(f"Average validation accuracy: {avg_score:.4f}")
        print(f"Standard deviation: {np.std(fold_scores):.4f}")

        # Update best parameters if necessary
        if avg_score > best_val_acc:
            best_val_acc = avg_score
            best_params = {
                'learning_rate': lr,
                'conv_channels': conv,
                'fc_units': fc,
                'dropout_rate': dropout,
                'batch_size': batch_size
            }
            # NOTE(review): `model` is the network from the LAST fold as it
            # stands after its final epoch, not the weights of the checkpoint
            # that reached the best 'val_acc'. Behaviour is kept as is.
            torch.save(model.state_dict(), f'best_model_lr{lr:.4f}_conv{conv}_fc{fc}_dropout{dropout:.2f}_batch{batch_size}.pth')  # Save the model state
            print("\n🌟 New best model found!")
            print(f"Best validation accuracy so far: {best_val_acc:.4f}")
    finally:
        wandb.finish()
        print("\nFinished wandb run")

# An empty list in param_grid yields no combinations at all; fail with a clear
# message instead of a TypeError on `best_params[...]` below.
if best_params is None:
    raise RuntimeError("Grid search evaluated no parameter combinations; check param_grid")

print("\n" + "="*80)
print("Grid search completed!")
print("\nBest parameters found:")
print(f"Learning rate: {best_params['learning_rate']}")
print(f"Conv channels: {best_params['conv_channels']}")
print(f"Fully connected layers: {best_params['fc_units']}")
print(f"Dropout rate: {best_params['dropout_rate']}")
print(f"Batch size: {best_params['batch_size']}")
print(f"Best validation accuracy: {best_val_acc:.4f}")

# Train final model with best parameters on all training data
print("\nTraining final model with best parameters...")
final_train_dataset = MatrixDataset(X_trainval, y_trainval)
final_test_dataset = MatrixDataset(X_test, y_test)

final_train_loader = DataLoader(final_train_dataset, batch_size=best_params['batch_size'], shuffle=True)
final_test_loader = DataLoader(final_test_dataset, batch_size=best_params['batch_size'])

final_model = CNNClassifier(
    learning_rate=best_params['learning_rate'],
    conv_channels=best_params['conv_channels'],
    fc_units=best_params['fc_units'],
    dropout_rate=best_params['dropout_rate']
)

# Initialize wandb with a meaningful run name
run_name = f"Final_Training_lr{best_params['learning_rate']}_conv{best_params['conv_channels']}_fc{best_params['fc_units']}_batch{best_params['batch_size']}_dropout{best_params['dropout_rate']}"
wandb.init(project=WANDB_PROJECT, name=run_name)

# try/finally so the final W&B run is closed even if fitting or testing raises.
try:
    final_trainer = pl.Trainer(
        max_epochs=MAX_EPOCHS,
        logger=WandbLogger(project=WANDB_PROJECT),
        accelerator='auto'
    )

    # Train the final model (no validation loader: all of train+val is used for fitting)
    final_trainer.fit(final_model, final_train_loader)

    # Evaluate on test set
    test_results = final_trainer.test(final_model, final_test_loader)

    # Log final test accuracy to wandb
    wandb.log({"final_test_accuracy": test_results[0]['test_acc']})

    print(f"\nFinal test accuracy: {test_results[0]['test_acc']:.4f}")
finally:
    wandb.finish()
    print("\nFinished wandb run")



Data split sizes:
Train + Validation: 1060 samples
Test: 265 samples

Starting grid search with k-fold cross validation
Parameter grid:
{'learning_rate': [0.001], 'conv_channels': [[1, 32, 64, 128]], 'fc_units': [[1152, 128, 2]], 'dropout_rate': [0.2], 'batch_size': [128]}
Number of folds: 10


Training with parameters:
Learning rate: 0.001
Conv channels: [1, 32, 64, 128]
Fully connected layers: [1152, 128, 2]
Dropout rate: 0.2
Batch size: 128



GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs



Fold 1/10
Train size: 954, Validation size: 106
Training model...


/Users/zahra/anaconda3/envs/inverse_problem/lib/python3.11/site-packages/pytorch_lightning/loggers/wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.

  | Name        | Type       | Params | Mode 
---------------------------------------------------
0 | conv_layers | Sequential | 93.1 K | train
1 | fc_layers   | Sequential | 147 K  | train
---------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.964     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/Users/zahra/anaconda3/envs/inverse_problem/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.


                                                                           

/Users/zahra/anaconda3/envs/inverse_problem/lib/python3.11/site-packages/pytorch_lightning/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=3` in the `DataLoader` to improve performance.
/Users/zahra/anaconda3/envs/inverse_problem/lib/python3.11/site-packages/pytorch_lightning/loops/fit_loop.py:298: The number of training batches (8) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Epoch 99: 100%|██████████| 8/8 [00:04<00:00,  1.61it/s, v_num=r1dv]

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 8/8 [00:04<00:00,  1.60it/s, v_num=r1dv]
Fold 1 best validation accuracy: 0.9906

Fold 2/10
Train size: 954, Validation size: 106


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/Users/zahra/anaconda3/envs/inverse_problem/lib/python3.11/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:654: Checkpoint directory /Users/zahra/Projects/ccnetlab/inverse_problem/code/checkpoints exists and is not empty.

  | Name        | Type       | Params | Mode 
---------------------------------------------------
0 | conv_layers | Sequential | 93.1 K | train
1 | fc_layers   | Sequential | 147 K  | train
---------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.964     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Training model...
Epoch 99: 100%|██████████| 8/8 [00:01<00:00,  4.69it/s, v_num=r1dv]        

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 8/8 [00:01<00:00,  4.67it/s, v_num=r1dv]
Fold 2 best validation accuracy: 0.9717

Fold 3/10
Train size: 954, Validation size: 106


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name        | Type       | Params | Mode 
---------------------------------------------------
0 | conv_layers | Sequential | 93.1 K | train
1 | fc_layers   | Sequential | 147 K  | train
---------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.964     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Training model...
Epoch 99: 100%|██████████| 8/8 [00:03<00:00,  2.63it/s, v_num=r1dv]        

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 8/8 [00:03<00:00,  2.62it/s, v_num=r1dv]
Fold 3 best validation accuracy: 1.0000

Fold 4/10
Train size: 954, Validation size: 106


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Training model...



  | Name        | Type       | Params | Mode 
---------------------------------------------------
0 | conv_layers | Sequential | 93.1 K | train
1 | fc_layers   | Sequential | 147 K  | train
---------------------------------------------------
240 K     Trainable params
0         Non-trainable params
240 K     Total params
0.964     Total estimated model params size (MB)
20        Modules in train mode
0         Modules in eval mode


Epoch 99:  25%|██▌       | 2/8 [00:00<00:01,  3.32it/s, v_num=r1dv]        