In [1]:
# Ensure reproducibility
import os
import sys
import numpy as np
import torch.nn as nn

# Pin the cuBLAS workspace configuration for reproducible runs, showing the
# value before and after the override.
cublas_var = "CUBLAS_WORKSPACE_CONFIG"
print(os.getenv(cublas_var))  # None unless the variable was already exported
os.environ.update({cublas_var: ":4096:8"})
print(os.getenv(cublas_var))

None
:4096:8


In [2]:
import json
import torch
import h5py

from torch import optim
from pprint import pprint
from utils.plant_seed import plant_seed
from torch.utils.data import DataLoader
from utils.optimizer import CosineScheduler
from utils.dataset import FullySupervisedDataset
from sklearn.model_selection import train_test_split
from utils.dataset import extract_parameters_from_datatype, check_indices_overlap
from utils.load_model import load_create_classification_model, load_create_original_classification_model
from utils.load_data import split_leave_out_rest_sub_sample_indices, get_fully_supervised_finetune_indices, safe_train_test_split
from utils.train_model import train_fully_supervised_model, eval_best_model, eval_last_model, train_fully_supervised_model_no_val


# Single seed reused for every split below (passed as random_state).
seed = 0
# plant_seed returns two objects that are later handed to every DataLoader as
# worker_init_fn (seed_worker) and generator (g).
# NOTE(review): presumably it also seeds the global Python/NumPy/torch RNGs —
# confirm in utils.plant_seed.
seed_worker, g = plant_seed(seed)

In [3]:
# Integer class id -> activity name. The ids follow the order of the tuple,
# and len(label_map) is used as the number of classes when building the model.
label_map = dict(enumerate(("BROWSE", "PLAY", "READ", "SEARCH", "WATCH", "WRITE")))

In [4]:
# Read the experiment configuration (paths, data type, hyper-parameters) from
# its JSON file and display it.
config_file_path = "utils/DesktopActivity_config.json"
with open(config_file_path) as config_handle:
    config = json.load(config_handle)
pprint(config)

{'V1Conv_5sec': {'dilation': 1, 'kernel_size': 20, 'padding': 0, 'stride': 10},
 'V2Conv_10sec': {'dilation': 1, 'kernel_size': 30, 'padding': 0, 'stride': 15},
 'V3Conv_15sec': {'dilation': 1, 'kernel_size': 40, 'padding': 0, 'stride': 10},
 'V4Conv_20sec': {'dilation': 1, 'kernel_size': 30, 'padding': 0, 'stride': 15},
 'V5Conv_25sec': {'dilation': 1, 'kernel_size': 40, 'padding': 0, 'stride': 20},
 'V6Conv_30sec': {'dilation': 1, 'kernel_size': 30, 'padding': 0, 'stride': 15},
 'data_path': 'dataset/training_data/DesktopActivity_std_norm',
 'data_type': 'overlap_0.8_window_15s',
 'downstream_proportion': 0.1,
 'downstream_training_proportion': None,
 'finetune_base_lr': 0.0001,
 'finetune_batch_size': 64,
 'finetune_epoch': 11,
 'finetune_final_lr': 0.0001,
 'finetune_max_update_epochs': 11,
 'finetune_proportion': 0.1,
 'finetune_train_proportion': 0.8,
 'finetune_warmup_epochs': 10,
 'fully_supervised_base_lr': 0.0001,
 'fully_supervised_batch_size': 64,
 'fully_supervised_epoch':

In [5]:
# Recover the windowing parameters encoded in the data_type string.
overlap, window_seconds, window_length = extract_parameters_from_datatype(config['data_type'])
print(f"overlap {overlap}, window seconds: {window_seconds}, window length: {window_length}")

# Both model configurations take their maximum sequence length from the window length.
for model_key in ('kdd_model', 'kdd_original_model'):
    config[model_key]['max_seq_len'] = window_length

overlap 0.8, window seconds: 15, window length: 450


In [6]:
# Everything for this data_type lives under <data_path>/<data_type>/.
dataset_dir = os.path.join(config['data_path'], config['data_type'])

# Subject id -> value read from starting_indices.json (displayed below).
subjects_dict_path = os.path.join(dataset_dir, 'starting_indices.json')
with open(subjects_dict_path) as subjects_handle:
    subjects = json.load(subjects_handle)
pprint(subjects)

# Index of the final row of the 'training_data' dataset in the HDF5 file.
data_file_path = os.path.join(dataset_dir, f"{config['data_type']}.h5")
with h5py.File(data_file_path, 'r') as h5_file:
    num_samples = h5_file['training_data'].shape[0]
    last_index = num_samples - 1

{'P01': 0,
 'P02': 582,
 'P03': 1164,
 'P04': 1746,
 'P05': 2328,
 'P06': 2910,
 'P07': 3492,
 'P08': 4074}


In [7]:
# Training Loop: leave-one-subject-out evaluation.
#
# For every subject, a model is trained on the labelled portion of the other
# subjects plus a small slice of the held-out subject, then evaluated on the
# remaining samples of the held-out subject. Test accuracy and F1 are summed in
# total_acc / total_f1 so a later cell can average them.
# NOTE(review): if this loop is interrupted, total_acc / total_f1 only cover the
# folds that completed; dividing by len(subjects) afterwards would then be wrong.


def _make_loader(indices, shuffle):
    """Build a seeded DataLoader over the rows of the HDF5 file selected by `indices`.

    Args:
        indices: sample indices handed to FullySupervisedDataset.
        shuffle: whether the loader reshuffles each epoch (True for training only).

    Returns:
        A DataLoader using the configured fully-supervised batch size and the
        shared worker-init function / generator from plant_seed.
    """
    dataset = FullySupervisedDataset(data_file_path, indices, label_map)
    return DataLoader(
        dataset,
        batch_size=config['fully_supervised_batch_size'],
        shuffle=shuffle,
        num_workers=0,
        worker_init_fn=seed_worker,
        generator=g,
    )


total_acc = 0
total_f1 = 0
for leave_out_subject in subjects:
    print(f"Leave out subject: {leave_out_subject}")

    leave_out_sub_sample_indices, rest_sub_sample_indices = split_leave_out_rest_sub_sample_indices(subjects, leave_out_subject, last_index)

    # Note the return order: test indices first, then the small training slice.
    leave_out_sub_test_indices, leave_out_sub_train_indices = get_fully_supervised_finetune_indices(leave_out_sub_sample_indices, data_file_path, finetune_proportion=config['downstream_proportion'])

    # Shared by both modes: labelled samples from the other subjects plus the
    # held-out subject's training slice.
    available_rest_sub_train_indices, _ = safe_train_test_split(rest_sub_sample_indices, train_size=config['upstream_label_availability'], random_state=seed)
    all_train_indices = np.concatenate([leave_out_sub_train_indices, available_rest_sub_train_indices])

    # Always an ndarray, so `.shape` below works in both modes even if the
    # helper returns a plain list (previously only the no-validation branch
    # converted it).
    test_indices = np.asarray(leave_out_sub_test_indices)

    # A falsy downstream_training_proportion (e.g. None) means "no validation split".
    use_validation = bool(config['downstream_training_proportion'])

    if use_validation:
        train_indices, val_indices = train_test_split(all_train_indices, train_size=config['downstream_training_proportion'], random_state=seed)

        print(f"Train indices: {train_indices.shape[0]}, Val indices: {val_indices.shape[0]}, Test indices: {test_indices.shape[0]}")

        train_loader = _make_loader(train_indices, shuffle=True)
        val_loader = _make_loader(val_indices, shuffle=False)
        test_loader = _make_loader(test_indices, shuffle=False)

        loaders = (train_loader, val_loader, test_loader)
    else:
        train_indices = all_train_indices

        print(f"Train indices: {train_indices.shape[0]}, Test indices: {test_indices.shape[0]}")

        train_loader = _make_loader(train_indices, shuffle=True)
        test_loader = _make_loader(test_indices, shuffle=False)

        loaders = (train_loader, test_loader)

    # A fresh model, loss, optimizer and schedule for every fold.
    model, model_config = load_create_original_classification_model(config, num_classes=len(label_map))
    criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
    # lr=1.0 because LambdaLR multiplies the optimizer's initial lr by the value
    # returned from lr_lambda.
    # NOTE(review): this assumes CosineScheduler is callable and returns the
    # absolute learning rate for a given epoch — confirm in utils.optimizer.
    optimizer = optim.RAdam(model.parameters(), lr=1.0, betas=(0.9, 0.999))
    scheduler = CosineScheduler(max_update=config['fully_supervised_max_update_epochs'], base_lr=config['fully_supervised_base_lr'], final_lr=config['fully_supervised_final_lr'], warmup_steps=config['fully_supervised_warmup_epochs'], warmup_begin_lr=0.0)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=scheduler)

    if use_validation:
        tensorboard_writer = train_fully_supervised_model(model, criterion, optimizer, scheduler, loaders, model_config, config, leave_out_subject)

        best_model_acc, best_model_f1 = eval_best_model(model, test_loader, config, label_map)
        last_model_acc, last_model_f1 = eval_last_model(model, test_loader, config, label_map)

        # NOTE(review): the better of the best/last checkpoints is picked using
        # test-set metrics, so the reported numbers are optimistic.
        test_acc = max(best_model_acc, last_model_acc)
        test_f1 = max(best_model_f1, last_model_f1)
    else:
        tensorboard_writer = train_fully_supervised_model_no_val(model, criterion, optimizer, scheduler, loaders, model_config, config, leave_out_subject)

        test_acc, test_f1 = eval_last_model(model, test_loader, config, label_map)

    # Accumulate accuracy and F1 score
    total_acc += test_acc
    total_f1 += test_f1

Leave out subject: P01
Distribution of indices across labels for train set:
  Label b'BROWSE': 9 indices
  Label b'PLAY': 9 indices
  Label b'READ': 9 indices
  Label b'SEARCH': 9 indices
  Label b'WATCH': 9 indices
  Label b'WRITE': 9 indices
Total train indices: 54
Total test indices: 528
Train indices: 4128, Test indices: 528
{'conv_config': {'dilation': 1, 'kernel_size': 40, 'padding': 0, 'stride': 10},
 'd_model': 64,
 'dim_feedforward': 256,
 'emb_dropout': 0.1,
 'embedding': 'convolution',
 'enc_dropout': 0.1,
 'feat_dim': 2,
 'max_len': 450,
 'n_heads': 8,
 'n_layers': 3,
 'num_classes': 6,
 'pre_norm': False}
Convolutional embedding: 42 sequence length.
Run cmd: tensorboard --logdir=results/DesktopActivity/overlap_0.8_window_15s/fully_supervised_downstream_0.1_upstream_avail_1.0/feat_dim_2_d_model_64_n_heads_8_n_layers_3_d_ff_256_emb_dropout_0.1_enc_dropout_0.1_embedding_convolution_conv_config_V3Conv_15sec/epochs_11_max_update_steps_11_warmup_steps_10_batch_size_64_base_lr_0.

KeyboardInterrupt: 

In [None]:
# Average the accumulated test metrics over the number of subjects (one fold each).
num_folds = len(subjects)
average_acc = total_acc / num_folds
average_f1 = total_f1 / num_folds

print(f"Average Accuracy across all folds: {average_acc}")
print(f"Average F1 Score across all folds: {average_f1}")

# The summary file is written one directory above config['model_path'].
current_dir = config['model_path']
parent_dir = os.path.dirname(current_dir)
filename = "average_acc_and_f1.txt"
file_path = os.path.join(parent_dir, filename)

# Persist both averages, one per line.
summary_lines = (
    f"Average Accuracy: {average_acc}\n",
    f"Average F1 Score: {average_f1}\n",
)
with open(file_path, 'w') as f:
    f.write("".join(summary_lines))

print(f"Saved average scores to {file_path}")