# Debug

Debugging a machine learning project can be very difficult. In this project, the subject model and the object model are largely separate, so the debugging task can be divided into debugging `SubjectModel` and debugging `ObjectModel`.

Some resources on debugging ML projects:

- [How to unit test machine learning code](https://thenerdstation.medium.com/how-to-unit-test-machine-learning-code-57cf6fd81765)
- [What should I do when my neural network doesn't learn?](https://stats.stackexchange.com/questions/352036/what-should-i-do-when-my-neural-network-doesnt-learn)

## Preparation (load data etc.)

First we import necessary libraries.

In [None]:
import os
import time
from datetime import datetime
import json

from tqdm.auto import tqdm
from tqdm.auto import trange
import torch
import torch.utils.data as Data
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertTokenizer

from data_gen import BertDataGenerator, DevDataGenerator, MyDataset, MyDevDataset, collate_fn, dev_collate_fn
from model_bert_based import SubjectModel, ObjectModel
from utils import para_eval
import config

Load training and dev data and define constants.

In [None]:
# Hyper-parameters and model settings are read from the project-level config module.
BERT_MODEL_NAME = config.bert_model_name
LEARNING_RATE = config.learning_rate
WORD_EMB_SIZE = config.word_emb_size # default bert embedding size
EPOCH_NUM = config.epoch_num
BATCH_SIZE = config.batch_size
# Fall back to CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _load_json(path):
    """Parse the JSON file at *path* and close the file handle afterwards."""
    # NOTE(review): relies on the platform default text encoding, as before;
    # pass encoding='utf-8' here if the generated files are known to be UTF-8.
    with open(path) as f:
        return json.load(f)


# Generated data files are resolved relative to the current working directory.
file_dir = os.getcwd()
train_path = os.path.join(file_dir, 'generated/train_data_me.json')
dev_path = os.path.join(file_dir, 'generated/dev_data_me.json')
generated_schema_path = os.path.join(file_dir, 'generated/schemas_me.json')
generated_char_path = os.path.join(file_dir, 'generated/all_chars_me.json')

train_data = _load_json(train_path)
dev_data = _load_json(dev_path)

# The schema file holds a pair: [id -> predicate, predicate -> id].
id2predicate, predicate2id = _load_json(generated_schema_path)
# JSON object keys are always strings, so restore the integer ids.
id2predicate = {int(i): j for i, j in id2predicate.items()}
# Register id 0 under the "未分类" ("unclassified") label in both directions.
id2predicate[0] = "未分类"
predicate2id["未分类"] = 0
# The char file holds a pair: [id -> char, char -> id].
id2char, char2id = _load_json(generated_char_path)

# Number of predicate classes, counted after the id-0 entry was added.
NUM_CLASSES = len(predicate2id)
# Stored on config; presumably read by the model definitions — verify.
config.num_classes = NUM_CLASSES

In [None]:
# Set debug mode to True to only train on a small batch of data.
# NOTE(review): the flag is only written here; how it is consumed (presumably by
# the data generator) is not visible in this notebook — confirm in data_gen/config.
config.debug_mode = True

Process data

In [None]:
# Load the pretrained tokenizer named by BERT_MODEL_NAME.
bert_tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)
# BertDataGenerator is project code (data_gen); it is given the raw training
# records together with the tokenizer.
dg = BertDataGenerator(train_data, bert_tokenizer)
# pro_res() yields eight parallel collections. Presumably: T = token ids,
# S1/S2 = subject start/end 0-1 vectors, K1/K2 = subject start/end indices,
# O1/O2 = object start/end 0-1 vectors — verify against data_gen.
T, S1, S2, K1, K2, O1, O2, attention_masks = dg.pro_res()

In [None]:
# Wrap the processed collections in the project's dataset class and batch them for training.
train_dataset = MyDataset(T, S1, S2, K1, K2, O1, O2, attention_masks)
train_loader = Data.DataLoader(
    dataset=train_dataset,      # MyDataset instance built above
    batch_size=BATCH_SIZE,      # mini batch size
    shuffle=True,               # random shuffle for training
    num_workers=1,              # subprocesses for loading data
    collate_fn=collate_fn,      # batch-assembly function imported from data_gen
)

Define subject model

In [None]:
# Build the subject model and place it on the selected device.
subject_model = SubjectModel(WORD_EMB_SIZE).to(device)
# Replicate across GPUs only when more than one is visible.
n_gpus = torch.cuda.device_count()
if n_gpus > 1:
    print('Using', n_gpus, "GPUs!")
    subject_model = nn.DataParallel(subject_model)
print("word embeding size is", WORD_EMB_SIZE)

Define update related variables and metrics

In [None]:
# Optimise every parameter of the subject model with Adam at the configured learning rate.
params = list(subject_model.parameters())
optimizer = torch.optim.Adam(params, lr=LEARNING_RATE)
# Binary cross-entropy, used as a plain function: the training loop calls it
# with reduction='none' and applies the attention mask itself.
# Assumes the model outputs are already probabilities in [0, 1] — TODO confirm.
loss_fn = F.binary_cross_entropy

Use a tensorboard writer to log training stats

In [None]:
from torch.utils.tensorboard import SummaryWriter
# Name the run directory after the current local time (month_day_hour_minute)
# so each run started in a different minute gets its own log folder.
now = datetime.now()
dt_string = now.strftime("%m_%d_%H_%M")
log_dir = os.path.join('logs', 'subject', dt_string)
writer = SummaryWriter(log_dir=log_dir)
# Tell the user where the logs go and how to view them.
print("Logs are saved at:", log_dir)
print("Run this command at the current folder to launch tensorboard:")
print("tensorboard --logdir=logs/subject")

Train subject model!

In [None]:
# macOS only: work around the libomp issue (multiple copies of the library are loaded).
# The variable is set for this process before training starts.
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [None]:
# Train the subject model for EPOCH_NUM epochs, logging the loss of every
# optimisation step to tensorboard and to the progress bar.
total_step_cnt = 0 # a counter for tensorboard writer
for _ in trange(EPOCH_NUM, desc='Epoch'):
    train_tqdm = tqdm(train_loader, desc="Train")
    for batch in train_tqdm:
        tokens = batch["T"].to(device) # text (in the form of index, zero-padding)
        # The subject indices and the object vectors are not used by the subject
        # loss below; they stay bound so the last batch can still be inspected.
        subject_start_pos = batch["K1"].to(device) # subject start index
        subject_end_pos = batch["K2"].to(device) # subject end index
        subject_start = batch["S1"].to(device) # subject start in 1-0 vector (may have multiple subject)
        subject_end = batch["S2"].to(device) # subject end in 1-0 vector (may have multiple)
        object_start = batch["O1"].to(device) # object start in 1-0 vector (may have multiple object)
        object_end = batch["O2"].to(device) # object end in 1-0 vector (may have multiple objects)
        att_mask = batch['masks'].to(device)
        # predict (hidden_states is returned by the model but unused for this loss)
        subject_preds, hidden_states = subject_model(tokens)
        # calc loss: element-wise BCE, averaged over the positions kept by att_mask
        mask_total = torch.sum(att_mask) # shared normaliser for both losses
        s1_loss = loss_fn(subject_preds[:,:,0], subject_start, reduction='none') # (bsz, sent_len)
        s1_loss = torch.sum(s1_loss * att_mask) / mask_total # ()
        s2_loss = loss_fn(subject_preds[:,:,1], subject_end, reduction='none')
        s2_loss = torch.sum(s2_loss * att_mask) / mask_total
        loss_sum = s1_loss + s2_loss
        # loggings: read the scalar once, since .item() copies it back to the host
        loss_value = loss_sum.item()
        writer.add_scalar('subject/loss', loss_value, total_step_cnt)
        total_step_cnt += 1
        train_tqdm.set_postfix(loss=loss_value)
        # updates
        optimizer.zero_grad()
        loss_sum.backward()
        optimizer.step()