In [None]:
# Ensure reproducibility
# Pin the cuBLAS workspace configuration through the environment. This cell
# runs before `import torch` so the value is already in place when the CUDA
# libraries are loaded.
import os

print(os.getenv("CUBLAS_WORKSPACE_CONFIG"))  # None unless it was exported beforehand
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"
print(os.getenv("CUBLAS_WORKSPACE_CONFIG"))  # confirms the value now in effect

In [None]:
import json
import torch
import h5py
import torch.nn as nn

from torch import optim
from pprint import pprint
from utils.plant_seed import plant_seed
from torch.utils.data import DataLoader
from utils.optimizer import CosineScheduler
from utils.dataset import FullySupervisedDataset
from sklearn.model_selection import train_test_split
from utils.dataset import extract_parameters_from_datatype, check_indices_overlap
from utils.load_model import load_create_classification_model
from utils.load_data import get_fully_supervised_pretrain_indices
from utils.train_model import train_fully_supervised_pretrain_model, eval_best_model, eval_last_model


# Single seed for the whole notebook: it is passed to plant_seed here and reused
# as `random_state` for the train/validation split in the pretrain loop.
seed = 0
# plant_seed is a project helper (utils.plant_seed). The two values it returns are
# handed to every DataLoader in the pretrain loop as `worker_init_fn` and `generator`.
seed_worker, g = plant_seed(seed)

In [None]:
# Class index -> activity name. The position of each name in the tuple is its
# integer class id; len(label_map) is used as the number of classes when the
# classification model is created.
label_map = dict(enumerate((
    "BROWSE",
    "PLAY",
    "READ",
    "SEARCH",
    "WATCH",
    "WRITE",
)))

In [None]:
# Load the experiment configuration (a JSON document) and echo it so the
# settings used for this run are visible in the notebook output.
config_file_path = "utils/DesktopActivity_config.json"
with open(config_file_path) as file:  # text read mode is the default
    config = json.loads(file.read())
pprint(config)

In [None]:
# Derive the windowing parameters from the configured data type
# (extract_parameters_from_datatype is a project helper).
window_params = extract_parameters_from_datatype(config['data_type'])
overlap, window_seconds, window_length = window_params
print(f"overlap {overlap}, window seconds: {window_seconds}, window length: {window_length}")

# The model's maximum sequence length is set to the window length.
config['kdd_model'].update(max_seq_len=window_length)

In [None]:
# Both the subject index file and the HDF5 data file live under
# <data_path>/<data_type>/.
dataset_dir = os.path.join(config['data_path'], config['data_type'])

# Subject table loaded from starting_indices.json; the pretrain loop iterates
# over it to choose the left-out subject.
subjects_dict_path = os.path.join(dataset_dir, 'starting_indices.json')
with open(subjects_dict_path) as file:
    subjects = json.loads(file.read())
pprint(subjects)

# Index of the final row of the 'training_data' dataset in the HDF5 file.
data_file_path = os.path.join(dataset_dir, f"{config['data_type']}.h5")
with h5py.File(data_file_path, mode='r') as h5_file:
    n_rows = h5_file['training_data'].shape[0]
last_index = n_rows - 1

In [None]:
# Pretrain Loop
# Leave-one-subject-out evaluation: for each subject, its indices form the test
# fold, the remaining indices are split into train/validation, a fresh model is
# trained, and the test accuracy / F1 are accumulated in total_acc / total_f1.
total_acc = 0
total_f1 = 0
for leave_out_subject in subjects:
    print(f"Leave out subject: {leave_out_subject}")
    pretrain_test_indices, pretrain_train_indices = get_fully_supervised_pretrain_indices(subjects, leave_out_subject, last_index)
    # split train indices into train and validation
    # (config['pretrain_proportion'] is the fraction held out for validation)
    pretrain_train_indices, pretrain_val_indices = train_test_split(
        pretrain_train_indices,
        test_size=config['pretrain_proportion'],
        random_state=seed,
    )

    # A falsy result from check_indices_overlap is reported as an overlap.
    overlap_check = check_indices_overlap(pretrain_train_indices, pretrain_val_indices, pretrain_test_indices)
    if not overlap_check:
        print("Warning: Overlap detected between datasets. Please review the index splitting process.")
    else:
        print("No overlap detected between datasets.")

    train_dataset = FullySupervisedDataset(data_file_path, pretrain_train_indices, label_map)
    val_dataset = FullySupervisedDataset(data_file_path, pretrain_val_indices, label_map)
    test_dataset = FullySupervisedDataset(data_file_path, pretrain_test_indices, label_map)

    batch_size = config['pretrain_batch_size']
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, worker_init_fn=seed_worker, generator=g)
    # FIX: the validation loader must wrap val_dataset. It previously wrapped
    # train_dataset, so validation metrics were computed on the training data
    # and the held-out validation split was never used.
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0, worker_init_fn=seed_worker, generator=g)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=0, worker_init_fn=seed_worker, generator=g)

    loaders = (train_loader, val_loader, test_loader)

    # A new model is created for every fold so no weights carry over between subjects.
    model, model_config = load_create_classification_model(config, num_classes=len(label_map))

    criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
    # lr=1.0 because LambdaLR multiplies the optimizer's initial lr by the value
    # returned from the lambda; presumably CosineScheduler returns the absolute
    # learning rate for a given step - confirm against utils.optimizer.
    optimizer = optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.999))
    cosine_schedule = CosineScheduler(
        max_update=config['pretrain_max_update_epochs'],
        base_lr=config['pretrain_base_lr'],
        final_lr=config['pretrain_final_lr'],
        warmup_steps=config['pretrain_warmup_epochs'],
        warmup_begin_lr=0.0,
    )
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_schedule)

    tensorboard_writer = train_fully_supervised_pretrain_model(model, criterion, optimizer, scheduler, loaders, model_config, config, leave_out_subject)

    best_model_acc, best_model_f1 = eval_best_model(model, test_loader, config, label_map)

    last_model_acc, last_model_f1 = eval_last_model(model, test_loader, config, label_map)

    # NOTE(review): taking the max over the best/last checkpoints selects on the
    # test fold, and accuracy and F1 may come from different checkpoints -
    # confirm this is the intended reporting protocol.
    test_acc = max(best_model_acc, last_model_acc)
    test_f1 = max(best_model_f1, last_model_f1)

    # Accumulate accuracy and F1 score
    total_acc += test_acc
    total_f1 += test_f1

In [None]:
# Average the per-fold test metrics accumulated by the pretrain loop
# (one fold per entry in `subjects`).
n_folds = len(subjects)
average_acc, average_f1 = total_acc / n_folds, total_f1 / n_folds

print(f"Average Accuracy across all folds: {average_acc}")
print(f"Average F1 Score across all folds: {average_f1}")

# The summary file is written next to (not inside) the configured model path.
# NOTE(review): os.path.dirname returns the path itself (minus the separator)
# when model_path ends with a trailing separator - confirm the config value.
current_dir = config['model_path']
parent_dir = os.path.dirname(current_dir)
filename = "average_acc_and_f1.txt"
file_path = os.path.join(parent_dir, filename)

# Persist both averages, one per line, overwriting any previous summary.
with open(file_path, 'w') as f:
    f.writelines([
        f"Average Accuracy: {average_acc}\n",
        f"Average F1 Score: {average_f1}\n",
    ])

print(f"Saved average scores to {file_path}")