In [None]:
# Ensure reproducibility
import os
import sys
import numpy as np
import torch.nn as nn

# Pin the cuBLAS workspace configuration as part of the reproducibility setup.
# The value is printed before and after so the cell output records the change.
cublas_env_key = "CUBLAS_WORKSPACE_CONFIG"
print(os.environ.get(cublas_env_key))  # None unless the variable was already set outside the notebook
os.environ[cublas_env_key] = ":4096:8"
print(os.environ.get(cublas_env_key))

In [None]:
import json
import torch
import h5py

from torch import optim
from pprint import pprint
from utils.plant_seed import plant_seed
from torch.utils.data import DataLoader
from utils.optimizer import CosineScheduler
from utils.dataset import SelfSupervisedDataset, FullySupervisedDataset
from sklearn.model_selection import train_test_split
from utils.dataset import extract_parameters_from_datatype, check_indices_overlap
from utils.load_model import load_create_imputation_model, load_create_classification_model, load_create_original_imputation_model, load_create_original_classification_model
from utils.load_data import get_self_supervised_pretrain_indices, get_fully_supervised_finetune_indices, safe_train_test_split
from modules.loss import MaskedMSELoss
from utils.train_model import train_self_supervised_pretrain_model, eval_best_imputation_model, eval_last_imputation_model, train_self_supervised_finetune_model, eval_last_model, eval_best_model, train_self_supervised_finetune_model_no_val


# One seed is reused for every split and loader in the notebook.
seed = 0
# plant_seed hands back two objects that are later passed to every DataLoader as
# `worker_init_fn` and `generator`.
# NOTE(review): presumably it also seeds the global RNGs - confirm in utils.plant_seed.
seed_state = plant_seed(seed)
seed_worker, g = seed_state

In [None]:
# Integer class id -> activity name; the position in the list is the class id.
label_map = dict(enumerate([
    "BROWSE",
    "PLAY",
    "READ",
    "SEARCH",
    "WATCH",
    "WRITE",
]))

In [None]:
# Load the experiment configuration (a JSON document) and echo it into the cell
# output so the run records the settings it used.
config_file_path = "utils/DesktopActivity_config.json"
with open(config_file_path, "r") as config_file:
    config = json.load(config_file)
pprint(config)

In [None]:
# The windowing parameters are derived from the configured data type.
overlap, window_seconds, window_length = extract_parameters_from_datatype(config['data_type'])
print(f"overlap {overlap}, window seconds: {window_seconds}, window length: {window_length}")

# Both model configurations take the window length as their maximum sequence length.
for model_key in ('kdd_model', 'kdd_original_model'):
    config[model_key]['max_seq_len'] = window_length

In [None]:
# The subject index map and the HDF5 data file both live in <data_path>/<data_type>/.
dataset_dir = os.path.join(config['data_path'], config['data_type'])

subjects_dict_path = os.path.join(dataset_dir, 'starting_indices.json')
with open(subjects_dict_path, "r") as subjects_file:
    subjects = json.load(subjects_file)
pprint(subjects)

data_file_path = os.path.join(dataset_dir, f"{config['data_type']}.h5")
with h5py.File(data_file_path, 'r') as h5_file:
    num_rows = h5_file['training_data'].shape[0]
# Index of the last row of the 'training_data' dataset.
last_index = num_rows - 1

In [None]:
def make_seeded_loader(dataset, batch_size, shuffle):
    """Build a single-process DataLoader that uses the notebook's seeded worker init function and generator.

    Args:
        dataset: dataset to iterate over.
        batch_size: number of samples per batch.
        shuffle: whether the loader reshuffles the data every epoch.

    Returns:
        A DataLoader with num_workers=0, worker_init_fn=seed_worker and generator=g.
    """
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=0, worker_init_fn=seed_worker, generator=g)


# Running sums of the per-fold test metrics; they are averaged after the loop.
total_acc = 0
total_f1 = 0

# Leave-one-subject-out loop: every fold first pretrains an imputation model and
# then finetunes a classifier that starts from that pretrained model.
for leave_out_subject in subjects:
    print(f"Leave out subject: {leave_out_subject}")
    pretrain_test_indices, pretrain_train_indices = get_self_supervised_pretrain_indices(subjects, leave_out_subject, last_index)

    # Print the sizes of each split for pretraining
    print("Pretraining data split:")
    print(f"  Train set size: {len(pretrain_train_indices)}")
    print(f"  Test set size: {len(pretrain_test_indices)}")

    train_dataset = SelfSupervisedDataset(data_file_path, pretrain_train_indices, mean_mask_length=3, masking_ratio=0.10)
    test_dataset = SelfSupervisedDataset(data_file_path, pretrain_test_indices, mean_mask_length=3, masking_ratio=0.10)

    train_loader = make_seeded_loader(train_dataset, config['pretrain_batch_size'], shuffle=True)
    test_loader = make_seeded_loader(test_dataset, config['pretrain_batch_size'], shuffle=False)

    loaders = (train_loader, test_loader)

    model, model_config = load_create_original_imputation_model(config)

    criterion = MaskedMSELoss()
    # LambdaLR multiplies the optimizer's initial lr by the lambda's return value, so
    # with lr=1.0 the value returned by CosineScheduler is the effective learning rate.
    optimizer = optim.RAdam(model.parameters(), lr=1.0, betas=(0.9, 0.999))
    scheduler = CosineScheduler(max_update=config['pretrain_max_update_epochs'], base_lr=config['pretrain_base_lr'], final_lr=config['pretrain_final_lr'], warmup_steps=config['pretrain_warmup_epochs'], warmup_begin_lr=0.0)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler)

    tensorboard = train_self_supervised_pretrain_model(model, criterion, optimizer, scheduler, loaders, model_config, config, leave_out_subject)

    eval_last_imputation_model(model, test_loader, config, os.path.join(config['model_path'], "imputate_result"))

    # ========================================================================================================================
    # Finetune the model
    # config['model_path'] is where the model is saved; the classifier is pointed at it.
    config['pretrain_model_path'] = config['model_path']
    try:
        finetune_test_indices, finetune_train_indices = get_fully_supervised_finetune_indices(pretrain_test_indices, data_file_path, finetune_proportion=config['finetune_proportion'])
        finetune_test_indices = np.array(finetune_test_indices)

        # A configurable share of the pretraining subjects' windows is added to the
        # finetuning pool, in both the validation and the no-validation variant.
        available_pretrain_train_indices, _ = safe_train_test_split(pretrain_train_indices, train_size=config['pretrain_label_availability'], random_state=seed)
        all_train_indices = np.concatenate([finetune_train_indices, available_pretrain_train_indices])

        # A falsy 'finetune_train_proportion' means: train on the whole pool, no validation split.
        use_validation = bool(config['finetune_train_proportion'])

        if use_validation:
            finetune_train_indices, finetune_val_indices = train_test_split(all_train_indices, train_size=config['finetune_train_proportion'], random_state=seed)

            # Print the sizes of each split for finetuning
            print("Finetuning data split:")
            print(f"  Train set size: {len(finetune_train_indices)}")
            print(f"  Validation set size: {len(finetune_val_indices)}")
            print(f"  Test set size: {len(finetune_test_indices)}")

            train_dataset = FullySupervisedDataset(data_file_path, finetune_train_indices, label_map)
            val_dataset = FullySupervisedDataset(data_file_path, finetune_val_indices, label_map)
            test_dataset = FullySupervisedDataset(data_file_path, finetune_test_indices, label_map)

            train_loader = make_seeded_loader(train_dataset, config['finetune_batch_size'], shuffle=True)
            val_loader = make_seeded_loader(val_dataset, config['finetune_batch_size'], shuffle=False)
            test_loader = make_seeded_loader(test_dataset, config['finetune_batch_size'], shuffle=False)

            loaders = (train_loader, val_loader, test_loader)
        else:
            finetune_train_indices = all_train_indices

            # Print the sizes of each split for finetuning without validation
            print("Finetuning data split (no validation):")
            print(f"  Train set size: {len(finetune_train_indices)}")
            print(f"  Test set size: {len(finetune_test_indices)}")

            train_dataset = FullySupervisedDataset(data_file_path, finetune_train_indices, label_map)
            test_dataset = FullySupervisedDataset(data_file_path, finetune_test_indices, label_map)

            train_loader = make_seeded_loader(train_dataset, config['finetune_batch_size'], shuffle=True)
            test_loader = make_seeded_loader(test_dataset, config['finetune_batch_size'], shuffle=False)

            loaders = (train_loader, test_loader)

        model, model_config = load_create_original_classification_model(config, num_classes=len(label_map))

        criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
        # Same lr=1.0 + LambdaLR arrangement as in the pretraining stage.
        optimizer = optim.RAdam(model.parameters(), lr=1.0, betas=(0.9, 0.999))
        scheduler = CosineScheduler(max_update=config['finetune_max_update_epochs'], base_lr=config['finetune_base_lr'], final_lr=config['finetune_final_lr'], warmup_steps=config['finetune_warmup_epochs'], warmup_begin_lr=0.0)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler)

        if use_validation:
            tensorboard_writer = train_self_supervised_finetune_model(model, criterion, optimizer, scheduler, loaders, model_config, config)

            best_model_acc, best_model_f1 = eval_best_model(model, test_loader, config, label_map)
            last_model_acc, last_model_f1 = eval_last_model(model, test_loader, config, label_map)

            # NOTE(review): accuracy and F1 are maximised independently over the two
            # evaluations, both of which run on the test loader. The reported pair can
            # therefore come from two different models, and picking by test metrics is
            # optimistically biased - confirm this is the intended protocol.
            test_acc = max(best_model_acc, last_model_acc)
            test_f1 = max(best_model_f1, last_model_f1)
        else:
            tensorboard_writer = train_self_supervised_finetune_model_no_val(model, criterion, optimizer, scheduler, loaders, model_config, config)

            last_model_acc, last_model_f1 = eval_last_model(model, test_loader, config, label_map)

            test_acc = last_model_acc
            test_f1 = last_model_f1

        # Accumulate accuracy and F1 score
        total_acc += test_acc
        total_f1 += test_f1
    finally:
        # Clear config['pretrain_model_path'] for the next subject. Doing it in `finally`
        # also clears it when finetuning fails or is interrupted, so re-running this
        # cell does not start the first fold with the previous fold's path.
        config['pretrain_model_path'] = None

In [None]:
# Mean of the per-fold test metrics accumulated by the loop, one fold per subject.
num_folds = len(subjects)
average_acc = total_acc / num_folds
average_f1 = total_f1 / num_folds

print(f"Average Accuracy across all folds: {average_acc}")
print(f"Average F1 Score across all folds: {average_f1}")

# The summary file goes into whatever directory config['model_path'] points at now.
current_dir = config['model_path']
filename = "average_acc_and_f1.txt"
file_path = os.path.join(current_dir, filename)

summary_lines = [
    f"Average Accuracy: {average_acc}\n",
    f"Average F1 Score: {average_f1}\n",
]
with open(file_path, 'w') as f:
    f.writelines(summary_lines)

print(f"Saved average scores to {file_path}")