# This notebook trains a classifier model to separate papers that are relevant to magnetic materials research from papers that are not very relevant

In [1]:
import os
import glob
import random
import itertools
import numpy as np
import pandas as pd
import pickle as pkl
from tqdm.auto import tqdm
import matplotlib.pyplot as plt
import datasets
from datasets import Dataset
from datasets import load_dataset
from datasets import load_from_disk

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import get_scheduler
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Below are the helper functions

In [2]:
### Function to set the random seed
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (``manual_seed`` plus ``cuda.manual_seed_all``), and puts cuDNN into
    deterministic mode with benchmarking switched off.

    Args:
        seed: Integer seed passed unchanged to each library.
    """
    # Library-level generators.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN flags controlling algorithm selection.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.benchmark = False
    cudnn_backend.deterministic = True

In [3]:
### Function to concatenate two given path components, and create this directory if it doesn't exist
def join_and_create_folder(main_f, sub_f, sub_f_replace_hifen = False, ):
    """Join two path components and make sure the resulting directory exists.

    Args:
        main_f: Parent directory.
        sub_f: Sub-directory name (for example a model checkpoint id).
        sub_f_replace_hifen: When true, every "-" and "/" in ``sub_f`` is
            replaced by "_" before joining, so that a hub id such as
            "org/model-name" maps to a single folder name.

    Returns:
        The joined directory path.
    """
    if sub_f_replace_hifen:
        sub_f = sub_f.replace("-", "_").replace("/", "_")

    folder = os.path.join(main_f, sub_f)
    # makedirs also creates missing parents (mkdir would raise when `main_f`
    # does not exist yet or `sub_f` still contains a separator), and
    # exist_ok=True removes the check-then-create race.
    os.makedirs(folder, exist_ok=True)
    return folder

In [4]:
def tokenize_function(example):
    """Tokenize the ``"title_abstract"`` field of ``example``.

    Uses the notebook-level ``tokenizer``. Every sequence is padded to, and
    truncated at, 512 tokens.
    """
    text = example["title_abstract"]
    encoded = tokenizer(text, padding="max_length", truncation=True, max_length=512)
    return encoded

In [5]:
### Function to get train-, validation- and test- loaders from the dataset
def get_data_loaders(data_root_):
    """Build the train, validation and test DataLoaders from a dataset saved on disk.

    The dataset at ``data_root_`` is loaded, stripped of the metadata columns,
    tokenized with ``tokenize_function`` and reduced to the four columns fed to
    the model. Batch sizes are read from the notebook-level
    ``train_batch_size``, ``val_batch_size`` and ``test_batch_size``.

    Args:
        data_root_: Path passed to ``load_from_disk``; the dataset is indexed
            with the splits 'train', 'val' and 'test'.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. The
        train and validation loaders shuffle; the test loader does not.
    """
    metadata_columns = ['title', 'abstract', 'text', 'doi', 'abstract_length']
    model_columns = ['input_ids', 'token_type_ids', 'attention_mask', 'labels']
    split_names = ('train', 'val', 'test')

    ### Drop the metadata columns, then tokenize and drop the raw text column.
    splits = load_from_disk(data_root_).remove_columns(metadata_columns)
    tokenized = splits.map(tokenize_function, batched=True, remove_columns=["title_abstract"])

    ### Round-trip every split through pandas, keeping only the model columns
    ### in a fixed order.
    frames = {
        name: pd.DataFrame(tokenized[name])[model_columns]
        for name in split_names
    }
    for name, frame in frames.items():
        tokenized[name] = Dataset.from_pandas(frame)

    ### Return PyTorch tensors instead of Python lists.
    tokenized.set_format("torch")

    set_seed(0)
    loaders = (
        DataLoader(tokenized['train'], shuffle=True, batch_size=train_batch_size),
        DataLoader(tokenized['val'], shuffle=True, batch_size=val_batch_size),
        DataLoader(tokenized['test'], batch_size=test_batch_size),
    )
    return loaders

# train_dataloader, val_dataloader, test_dataloader = get_data_loaders(data_root)

In [6]:
### Evaluation function used for the validation
def eval_loss_acc_entr(loader, model, device):
    """Evaluate ``model`` on every batch of ``loader`` without tracking gradients.

    All three metrics are averaged per sample rather than per batch, so a
    smaller final batch does not get extra weight and the result does not
    depend on the order in which the loader yields the samples.

    Args:
        loader: Iterable of dict batches whose values are tensors; each batch
            is passed to the model as keyword arguments and must contain
            ``'labels'``.
        model: ``torch.nn.Module`` whose output exposes ``.loss`` and
            ``.logits``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(accuracy, entropy, loss)``: the fraction of correct argmax
        predictions, the mean entropy of the softmax distribution and the
        mean loss. All three are NaN when the loader yields no batch.
    """
    n_samples = 0
    n_correct = 0.0
    entropy_sum = 0.0
    loss_sum = 0.0

    # Evaluate in eval mode regardless of the caller's state, and put the
    # model back afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.inference_mode():
            for batch in loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                batch_n = outputs.logits.shape[0]

                # Assumes outputs.loss is the mean over the batch (the default
                # reduction of the Hugging Face classification heads), so
                # scaling by the batch size recovers the per-sample sum.
                loss_sum += outputs.loss.item() * batch_n

                y_hat = torch.nn.functional.softmax(outputs.logits, dim=1)
                entropy_sum += torch.sum(torch.special.entr(y_hat)).item()

                predictions = torch.argmax(y_hat, dim=1)
                n_correct += torch.sum((batch['labels'] == predictions).float()).item()
                n_samples += batch_n
    finally:
        model.train(was_training)

    if n_samples == 0:
        nan = np.float64("nan")
        return nan, nan, nan

    return (
        np.float64(n_correct / n_samples),
        np.float64(entropy_sum / n_samples),
        np.float64(loss_sum / n_samples),
    )

# Setting the parameters like batch size and device

In [7]:
# Batch sizes used by get_data_loaders for the three splits.
train_batch_size = 16
val_batch_size = 32
test_batch_size = 32

# Location of the dataset saved with `save_to_disk`.
data_root = "../Corpus/magnetics_train_val_test_by_text"

# Preferred GPU. Machines with fewer GPUs fall back to GPU 0 instead of
# failing later with an invalid device ordinal; without CUDA the CPU is used.
preferred_gpu_index = 7
if torch.cuda.is_available():
    gpu_index = preferred_gpu_index if preferred_gpu_index < torch.cuda.device_count() else 0
    device = torch.device("cuda", gpu_index)
else:
    device = torch.device("cpu")
device

device(type='cuda', index=7)

# Selecting one of the four pretrained models and loading its tokenizer (the classification head is added when the model is created in the training function)

In [8]:
# Checkpoint to fine-tune. Uncomment exactly one of the alternatives to switch model.
model_checkpoint = "bert-base-uncased"
# model_checkpoint = "m3rg-iitd/matscibert"
# model_checkpoint = "nlp-magnets/magbert"        # bert-base trained on magnet corpus
# model_checkpoint = "nlp-magnets/magmatbert"     # matscibert trained on magnet corpus

# Tokenizer belonging to the selected checkpoint. The classification model
# itself is instantiated inside classification_train.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Function to train the classification model

In [9]:
def _save_best_checkpoint(model, optimizer, lr_scheduler, epoch_marker, out_dir):
    """Write the training state plus the model and tokenizer files to ``out_dir``."""
    checkpoint = {
        'epoch': epoch_marker,
        'model_state': model.state_dict(),
        'optimizer_state': optimizer.state_dict(),
        'lr_sched': lr_scheduler.state_dict(),
    }
    torch.save(checkpoint, os.path.join(out_dir, "best_checkpoint.pth"))
    model.save_pretrained(out_dir)
    # `tokenizer` is the notebook-level tokenizer of the selected checkpoint.
    tokenizer.save_pretrained(out_dir)


def classification_train(num_epochs, learning_rate, train_dataloader, val_dataloader, model_checkpoint, train_batch_size):
    """Fine-tune a two-class sequence classifier and keep the best checkpoint.

    The model is evaluated on the validation loader about every tenth of an
    epoch and again at the end of every epoch. Whenever the validation
    accuracy exceeds the best value seen so far, the model, tokenizer,
    optimizer and scheduler state are written to the run directory
    ``./outputs/<checkpoint>/<run name>``. At the end, the metric history is
    pickled into that directory.

    Relies on the notebook-level ``device`` and ``tokenizer`` and on the
    helpers ``set_seed``, ``join_and_create_folder`` and ``eval_loss_acc_entr``.

    Args:
        num_epochs: Number of passes over ``train_dataloader``.
        learning_rate: Initial AdamW learning rate; decays linearly without warm-up.
        train_dataloader: Loader yielding the training batches.
        val_dataloader: Loader yielding the validation batches.
        model_checkpoint: Name or path passed to ``from_pretrained``.
        train_batch_size: Only used to name the run directory and the log.

    Returns:
        The best validation accuracy observed during training.
    """
    set_seed(0)

    train_loader_len = len(train_dataloader)
    num_training_steps = num_epochs * train_loader_len
    # Number of batches between two mid-epoch evaluations. It is 0 for loaders
    # with fewer than 10 batches, in which case only the end-of-epoch
    # evaluation runs (this avoids a modulo by zero).
    eval_interval = train_loader_len // 10

    LOG_DIR = './outputs'
    os.makedirs(LOG_DIR, exist_ok=True)
    MODEL_DIR = join_and_create_folder(LOG_DIR, model_checkpoint, True)

    lgfile = "numEpoch_%.0f_trainBS_%.0f_lr_%.8f"%(num_epochs, train_batch_size, learning_rate)
    lgfile_DIR = join_and_create_folder(MODEL_DIR, lgfile, False)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=learning_rate)
    lr_scheduler = get_scheduler(name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    progress_bar = tqdm(range(num_training_steps))

    max_accuracy_val = 0.
    losses_train_epoch = []; accuracy_train_epoch = []; entropy_train_epoch = []
    losses_val_epoch = []; accuracy_val_epoch = []; entropy_val_epoch = []
    losses_val_epoch_01 = []; accuracy_val_epoch_01 = []; entropy_val_epoch_01 = []

    ### Baseline metrics of the untrained classification head (epoch 0).
    model.eval()
    acc_train, ent_train, loss_train = eval_loss_acc_entr(train_dataloader, model, device)
    accuracy_train_epoch.append(acc_train); entropy_train_epoch.append(ent_train); losses_train_epoch.append(loss_train)

    acc_val, ent_val, loss_val = eval_loss_acc_entr(val_dataloader, model, device)
    accuracy_val_epoch.append(acc_val); entropy_val_epoch.append(ent_val); losses_val_epoch.append(loss_val)
    accuracy_val_epoch_01.append([0, acc_val]); entropy_val_epoch_01.append([0, ent_val]); losses_val_epoch_01.append([0, loss_val])

    for epoch in range(1, num_epochs + 1):
        # Per-batch training metrics of THIS epoch only; they are reset every
        # epoch so the logged values are per-epoch means, not running means
        # over the whole run.
        batch_losses = []; batch_accuracies = []; batch_entropies = []

        for batch_idx, batch in enumerate(train_dataloader):
            model.train()

            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()

            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            progress_bar.update(1)

            # Training metrics come from the forward pass made before the
            # update; detach so no autograd graph is kept alive.
            y_hat = torch.nn.functional.softmax(outputs.logits.detach(), dim=1)
            batch_losses.append(loss.item())
            batch_accuracies.append(torch.mean((batch["labels"] == torch.argmax(y_hat, dim=1)).float()).item())
            batch_entropies.append(torch.mean(torch.sum(torch.special.entr(y_hat), dim=1)).item())

            ### evaluating after every 0.1 epochs
            if eval_interval > 0 and batch_idx % eval_interval == 0 and batch_idx != 0:
                # Fractional epoch derived from the position in the loader, so
                # it cannot drift when an epoch holds more or fewer than ten
                # mid-epoch evaluations.
                ep = float(np.round((epoch - 1) + (batch_idx + 1) / train_loader_len, 3))

                model.eval()
                accuracy_val_01, entropy_val_01, losses_val_01 = eval_loss_acc_entr(val_dataloader, model, device)
                accuracy_val_epoch_01.append([ep, accuracy_val_01]); entropy_val_epoch_01.append([ep, entropy_val_01])
                losses_val_epoch_01.append([ep, losses_val_01])

                if accuracy_val_01 > max_accuracy_val:
                    _save_best_checkpoint(model, optimizer, lr_scheduler, ep, lgfile_DIR)
                    max_accuracy_val = accuracy_val_01

        losses_train_epoch.append(np.mean(batch_losses))
        accuracy_train_epoch.append(np.mean(batch_accuracies))
        entropy_train_epoch.append(np.mean(batch_entropies))

        ### evaluating after every epoch
        model.eval()
        accuracy_val, entropy_val, losses_val = eval_loss_acc_entr(val_dataloader, model, device)
        losses_val_epoch.append(losses_val); accuracy_val_epoch.append(accuracy_val); entropy_val_epoch.append(entropy_val)

        if accuracy_val > max_accuracy_val:
            _save_best_checkpoint(model, optimizer, lr_scheduler, epoch, lgfile_DIR)
            max_accuracy_val = accuracy_val

    progress_bar.close()

    training_log = {"train_accuracy": accuracy_train_epoch, "train_loss": losses_train_epoch,
                    "train_entr": entropy_train_epoch, "val_accuracy": accuracy_val_epoch,
                    "val_loss": losses_val_epoch, "val_entr": entropy_val_epoch,
                    "val_accuracy_01": accuracy_val_epoch_01, "val_loss_01": losses_val_epoch_01,
                    "val_entr_01": entropy_val_epoch_01, "lr": learning_rate, "train_BS": train_batch_size, "num_of_epochs": num_epochs}

    logfile = lgfile_DIR +"/numEpoch_%.0f_trainBS_%.0f_lr_%.6f_max_%.8f.log"%(num_epochs, train_batch_size, learning_rate, max_accuracy_val)
    # Context manager so the log file is flushed and closed deterministically.
    with open(logfile, "wb") as log_handle:
        pkl.dump(training_log, log_handle)

    print("saved at:", lgfile_DIR)

    return max_accuracy_val

# Training the classification model

In [None]:
num_epochs = 15
lr_lst = [1e-5, 2e-5, 5e-5]

# The loaders do not depend on the learning rate, so the dataset is loaded and
# tokenized once instead of once per run. classification_train re-seeds all
# random number generators itself before each run.
train_dataloader, val_dataloader, test_dataloader = get_data_loaders(data_root)

# Best validation accuracy reached with each learning rate.
best_val_accuracy_by_lr = {}
for learning_rate in lr_lst:
    best_val_accuracy_by_lr[learning_rate] = classification_train(
        num_epochs, learning_rate, train_dataloader, val_dataloader, model_checkpoint, train_batch_size
    )

best_val_accuracy_by_lr

Map:   0%|          | 0/15859 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/14880 [00:00<?, ?it/s]