In [2]:
%load_ext autoreload
%autoreload 2
import torch
import gc
torch.cuda.empty_cache()
gc.collect()
from final_model import train_and_save

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


<h2>Notes for Trevor</h2>
If you want to load a new training_test split (or if you change parameters like augmentation or balance), then select False for use_existing_split

Balance now uses augmented images to balance the dataset (so augmentation is ignored)

Check the final_model folder for outputs. If you want to train on the entire training dataset you should first run it on the train/test split (which will save the training params) and THEN run it on the whole thing. You can adjust the full model's training params in final_model/training_params.


Every time you run a model on the train/test split it will save all of the analytical data I think we need, as well as the config settings in model_results.csv

<h2>Set config values</h2>

In [3]:
config = {
    # Files
    'output_folder': './final_model',
    'train_csv': './COMP90086_2024_Project_train/train.csv',
    'train_img_dir': './preprocessed_images/train',
    'test_csv': './final_model/full_predictions.csv',
    'test_img_dir': './preprocessed_images/test',
    'training_params_file': 'training_params.json',

    # Training parameters
    'model': 'EfficientSpatialChannelAttentionNet',
    'target_column': 'stable_height',
    'additional_columns': [], # You'll need to train these first if you want to use
    'balance_dataset': False,
    'use_augmentation': False,
    'use_quantized': False,
    'val_ratio': 0.1,
    'batch_size': 48,
    'dropout_rate': 0.3,
    'learning_rate': 0.001,
    'lr_factor': 0.5,
    'lr_patience': 2,
    'freeze_layers': False,
    'num_epochs': 1,
    'use_existing_split': True,
    'early_stopping_patience': 3,
    'weight_decay': 1e-7 # L2
}

<h2>Train with validation set</h2>

In [4]:
model_id = train_and_save(config)

Calculating Stats:   0%|          | 0/7680 [00:00<?, ?sample/s]

Dataset mean: tensor([0.4675, 0.4412, 0.4066])
Dataset std: tensor([0.2710, 0.2281, 0.1905])
Split and stats saved to ./final_model/split_and_stats.json
Model: EfficientSpatialChannelAttentionNet
Target feature: stable_height (6 categories)
Epoch 1/1


Training:   0%|          | 0/144 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 14.00 MiB. GPU 0 has a total capacity of 44.34 GiB of which 8.88 MiB is free. Process 3103895 has 19.87 GiB memory in use. Process 3907911 has 494.00 MiB memory in use. Process 1060255 has 16.63 GiB memory in use. Process 3732376 has 7.32 GiB memory in use. Of the allocated memory 6.81 GiB is allocated by PyTorch, and 209.62 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

<h2>Train on whole set and predict</h1>

In [None]:
train_and_save(config, use_full_dataset=True, do_predictions=True, model_id = model_id)

<h2>Batch testing</h2>

In [98]:
for feature in ['shapeset', 'instability_type', 'type', 'cam_angle']:
    try:
        config['target_column'] = feature
        model_id = train_and_save(config, do_predictions=False)
        train_and_save(config, use_full_dataset=True, do_predictions=True, model_id = model_id)
    except Exception as e:
        print(f"Training failed for feature '{feature}'. Error: {str(e)}")
        continue

Loading existing split and stats from ./final_model/split_and_stats.json
Model: EfficientSpatialChannelAttentionNet
Target feature: shapeset (2 categories)
Epoch 1/1


Training:   0%|          | 0/576 [00:00<?, ?it/s]

Validating:   0%|          | 0/16 [00:00<?, ?it/s]

Train Loss: 0.0903, Train Acc: 96.69%
Val Loss: 0.0588, Val Acc: 98.05%
Learning Rate: 0.001000
------------------------------------------------------------
Results recorded in ./final_model/model_results.csv
Loading existing split and stats from ./final_model/split_and_stats_full.json
Training failed for feature 'shapeset'. Error: float() argument must be a string or a real number, not 'NoneType'
Loading existing split and stats from ./final_model/split_and_stats.json
Model: EfficientSpatialChannelAttentionNet
Target feature: instability_type (3 categories)
Epoch 1/1


Training:   0%|          | 0/576 [00:00<?, ?it/s]

Exception ignored in: <function _releaseLock at 0x790fc92a1a20>
Traceback (most recent call last):
  File "/usr/lib/python3.10/logging/__init__.py", line 228, in _releaseLock
    def _releaseLock():
KeyboardInterrupt: 
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x790e50a2c550>
Traceback (most recent call last):
  File "/workspace/COMP90086_Project/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/workspace/COMP90086_Project/.venv/lib/python3.10/site-packages/torch/utils/data/dataloader.py", line 1441, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/usr/lib/python3.10/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
  File "/usr/lib/python3.10/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
  File "/usr/lib/python3.10/multiprocessing/connection.py", line 931, in wait
  

: 

: 

In [None]:

import copy
import json
from typing import Dict, Any, List

def ab_test_models(base_config: Dict[str, Any], variations: Dict[str, List[Any]]) -> Dict[str, str]:
    results = {}
    
    # Train baseline model
    print("Training baseline model...")
    try:
        baseline_model_id = train_and_save(base_config)
        results['baseline'] = baseline_model_id
    except Exception as e:
        print(f"Error training baseline model: {str(e)}")
        results['baseline'] = "Error"
    
    # Train and evaluate variations
    for param, values in variations.items():
        for value in values:
            test_config = copy.deepcopy(base_config)
            test_config[param] = value
            
            test_name = f"{param}_{value}"
            print(f"Training model with {test_name}...")
            
            try:
                model_id = train_and_save(test_config)
                results[test_name] = model_id
            except Exception as e:
                print(f"Error training model {test_name}: {str(e)}")
                results[test_name] = "Error"
    
    # Save results to JSON file
    with open('results.json', 'w') as f:
        json.dump(results, f, indent=4)
    
    return results

# Define the variations to test
variations = {
    'model': ['StabilityPredictor', 'EfficientAttentionNet', 'EfficientChannelAttentionNet', 'ConvnextPredictor'],
    'additional_columns': [
        ['shapeset', 'cam_angle'],
        ['shapeset', 'type', 'instability_type', 'cam_angle']
    ],
    'balance_dataset': [True],
    'use_augmentation': [False],
    'learning_rate': [0.01, 0.0001]
}

# Run the AB tests
results = ab_test_models(config, variations)

# Print results
for test_name, model_id in results.items():
    print(f"{test_name}: {model_id}")

print("Results have been saved to 'results.json'")

In [None]:
import json
import pandas as pd

# Load results.json
with open('results.json', 'r') as f:
    results = json.load(f)

# Load model_results.csv
df = pd.read_csv('final_model/model_results.csv')

# Create a dictionary to store the matched results
matched_results = {}

for key, model_id in results.items():
    if model_id == "Error":
        matched_results[key] = {"accuracy": "N/A", "loss": "N/A"}
    else:
        model_data = df[df['id'] == model_id]
        if not model_data.empty:
            matched_results[key] = {
                "accuracy": model_data['val_acc'].values[0],
                "loss": model_data['val_loss'].values[0]
            }
        else:
            matched_results[key] = {"accuracy": "Not found", "loss": "Not found"}

# Display results
print("Model Results:")
print("--------------")
for key, data in matched_results.items():
    print(f"{key}:")
    print(f"  Accuracy: {data['accuracy']}")
    print(f"  Loss: {data['loss']}")
    print()

# Optional: Create a sorted list of models by accuracy
sorted_models = sorted(
    [(k, v) for k, v in matched_results.items() if v['accuracy'] != 'N/A' and v['accuracy'] != 'Not found'],
    key=lambda x: float(x[1]['accuracy']),
    reverse=True
)

print("Models sorted by accuracy (highest to lowest):")
print("----------------------------------------------")
for model, data in sorted_models:
    print(f"{model}: Accuracy = {data['accuracy']}, Loss = {data['loss']}")