In [1]:
# Ensure reproducibility
import os

# Set here, ahead of the torch import in the next cell, so the value is already in the
# environment when the CUDA libraries are first loaded.
_CUBLAS_KEY = "CUBLAS_WORKSPACE_CONFIG"

print(os.environ.get(_CUBLAS_KEY))  # Default is None
os.environ[_CUBLAS_KEY] = ":4096:8"
print(os.environ.get(_CUBLAS_KEY))

None
:4096:8


In [2]:
import json
import torch
import h5py
import torch.nn as nn

from torch import optim
from pprint import pprint
from utils.plant_seed import plant_seed
from torch.utils.data import DataLoader
from utils.optimizer import CosineScheduler
from utils.dataset import FullySupervisedDataset
from sklearn.model_selection import train_test_split
from utils.dataset import extract_parameters_from_datatype
from utils.load_model import load_create_classification_model
from utils.load_data import get_fully_supervised_pretrain_indices
from utils.train_model import train_fully_supervised_pretrain_model


# Single seed for the run; also reused as `random_state` for the train/validation split.
seed = 0
# plant_seed returns a worker-init function and a generator; both are passed to every
# DataLoader (worker_init_fn=seed_worker, generator=g).
# NOTE(review): presumably it also seeds the global Python/NumPy/torch RNGs — confirm in
# utils/plant_seed.py.
seed_worker, g = plant_seed(seed)

In [3]:
# Integer class index -> label name. Passed to the dataset wrapper, and its length is
# used as the number of output classes of the classifier.
label_map = dict(enumerate((
    "BROWSE",
    "PLAY",
    "READ",
    "SEARCH",
    "WATCH",
    "WRITE",
)))

In [4]:
# Read the experiment configuration (a JSON document) into a plain dict and echo it.
config_file_path = "utils/DesktopActivity_config.json"
with open(config_file_path, "r") as config_file:
    config = json.loads(config_file.read())
pprint(config)

{'V1Conv': {'dilation': 1, 'kernel_size': 3, 'padding': 1, 'stride': 1},
 'V2Conv': {'dilation': 1, 'kernel_size': 3, 'padding': 1, 'stride': 1},
 'data_path': 'dataset/training_data/DesktopActivity_Standardized',
 'data_type': 'overlap_0.0_window_10s',
 'finetune_base_lr': 0.001,
 'finetune_batch_size': 64,
 'finetune_epoch': 100,
 'finetune_final_lr': 0.0001,
 'finetune_max_update_epochs': 100,
 'finetune_proportion': 0.8,
 'finetune_warmup_epochs': 10,
 'kdd_model': {'conv_config': None,
               'd_model': 128,
               'dim_feedforward': 512,
               'emb_dropout': 0.1,
               'embedding': 'linear',
               'enc_dropout': 0.1,
               'feat_dim': 2,
               'max_seq_len': None,
               'n_heads': 8,
               'n_layers': 6},
 'label_smoothing': 0.1,
 'pretrain_base_lr': 0.001,
 'pretrain_batch_size': 64,
 'pretrain_epoch': 100,
 'pretrain_final_lr': 0.0001,
 'pretrain_max_update_epochs': 100,
 'pretrain_model_path': None,

In [5]:
# Decode the three window parameters from the data-type string stored in the config.
(overlap,
 window_seconds,
 window_length) = extract_parameters_from_datatype(config['data_type'])
print(f"overlap {overlap}, window seconds: {window_seconds}, window length: {window_length}")

# assign the window length to the config
kdd_model_config = config['kdd_model']
kdd_model_config['max_seq_len'] = window_length

overlap 0.0, window seconds: 10, window length: 300


In [6]:
# Both input files live in <data_path>/<data_type>/.
dataset_dir = os.path.join(config['data_path'], config['data_type'])

# Subject id -> starting index, as stored in starting_indices.json.
subjects_dict_path = os.path.join(dataset_dir, 'starting_indices.json')
with open(subjects_dict_path, "r") as subjects_file:
    subjects = json.loads(subjects_file.read())
pprint(subjects)

# Last valid index along the first axis of the 'training_data' dataset.
data_file_path = os.path.join(dataset_dir, f"{config['data_type']}.h5")
with h5py.File(data_file_path, 'r') as h5_file:
    first_axis_len = h5_file['training_data'].shape[0]
    last_index = first_axis_len - 1

{'P01': 0,
 'P02': 179,
 'P03': 359,
 'P04': 539,
 'P05': 719,
 'P06': 899,
 'P07': 1079,
 'P08': 1259}


In [7]:

# Pretrain Loop
# Each subject in `subjects` is left out in turn. For every left-out subject a new model,
# optimizer and LR schedule are created and handed to the training helper together with
# the train / validation / test loaders.

# DataLoader settings that do not change between iterations. os.cpu_count() can return
# None; in that case fall back to loading in the main process (num_workers=0).
loader_kwargs = dict(
    batch_size=config['pretrain_batch_size'],
    num_workers=os.cpu_count() or 0,
    worker_init_fn=seed_worker,
    generator=g,
)

for leave_out_subject in subjects:
    print(f"Leave out subject: {leave_out_subject}")
    pretrain_test_indices, pretrain_train_indices = get_fully_supervised_pretrain_indices(subjects, leave_out_subject, last_index)
    # split train indices into train and validation
    # NOTE(review): `test_size` is the fraction that goes to the *validation* split. If
    # config['pretrain_proportion'] is meant as the training fraction (e.g. 0.8, like
    # 'finetune_proportion'), this should be `train_size=` instead — confirm against the config.
    pretrain_train_indices, pretrain_val_indices = train_test_split(pretrain_train_indices, test_size=config['pretrain_proportion'], random_state=seed)

    train_dataset = FullySupervisedDataset(data_file_path, pretrain_train_indices, label_map)
    val_dataset = FullySupervisedDataset(data_file_path, pretrain_val_indices, label_map)
    test_dataset = FullySupervisedDataset(data_file_path, pretrain_test_indices, label_map)

    # Every loader wraps its own split. Previously the validation and test loaders were
    # both built from `train_dataset`, so the held-out data was never evaluated.
    train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
    val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)
    test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

    loaders = (train_loader, val_loader, test_loader)

    model, model_config = load_create_classification_model(config, num_classes=len(label_map))

    criterion = nn.CrossEntropyLoss(label_smoothing=config['label_smoothing'])
    # LambdaLR multiplies the optimizer's initial lr by the value returned from lr_lambda,
    # so lr=1.0 makes the effective learning rate equal to whatever the schedule returns.
    optimizer = optim.Adam(model.parameters(), lr=1.0, betas=(0.9, 0.999))
    cosine_schedule = CosineScheduler(max_update=config['pretrain_max_update_epochs'], base_lr=config['pretrain_base_lr'], final_lr=config['pretrain_final_lr'], warmup_steps=config['pretrain_warmup_epochs'], warmup_begin_lr=0.0)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=cosine_schedule)

    tensorboard_writer = train_fully_supervised_pretrain_model(model, criterion, optimizer, scheduler, loaders, model_config, config, leave_out_subject)

Leave out subject: P01
{'conv_config': None,
 'd_model': 128,
 'dim_feedforward': 512,
 'emb_dropout': 0.1,
 'embedding': 'linear',
 'enc_dropout': 0.1,
 'feat_dim': 2,
 'max_len': 300,
 'n_heads': 8,
 'n_layers': 6,
 'num_classes': 6}
Number of classes: 6
Linear embedding: 300 sequence length.
Run cmd: tensorboard --logdir=results/DesktopActivity/overlap_0.0_window_10s/feat_dim_2_d_model_128_n_heads_8_n_layers_6_d_ff_512_emb_dropout_0.1_enc_dropout_0.1_embedding_linear_conv_config_None/epochs_100_max_update_steps_100_warmup_steps_10_batch_size_64_base_lr_0.001_final_lr_0.0001_label_smoothing_0.1/P01_leave_out/TensorBoard_Log then open http://localhost:6006
Leave out subject: P02
{'conv_config': None,
 'd_model': 128,
 'dim_feedforward': 512,
 'emb_dropout': 0.1,
 'embedding': 'linear',
 'enc_dropout': 0.1,
 'feat_dim': 2,
 'max_len': 300,
 'n_heads': 8,
 'n_layers': 6,
 'num_classes': 6}
Number of classes: 6
Linear embedding: 300 sequence length.
Run cmd: tensorboard --logdir=results/