In [1]:
import os
import pandas as pd
import numpy as np
import json
import re
from nltk.tokenize import sent_tokenize 
from transformers import BertTokenizer, AutoTokenizer
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import transformers
from tqdm import tqdm
import glob
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

# Config

In [2]:
platform = 'Azure'
model_name = 'albert_base_uncased_cleaned_extra_label_100per_data.bin'

if platform == 'Azure':
    bert_path = '/home/thanish/transformer_models/bert_base_uncased'
    test_path = '../test/*'
    model_path = '../output/'
elif platform == 'Kaggle':
    bert_path = '../input/bertlargeuncasedpytorch'
    test_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/test/*'
    model_path = '../input/coleridgemodels/'
else:
    bert_path = 'C:/Users/thanisb/Documents/transformer_models/bert_base_uncased/'
    test_path = '../test/*'
    model_path = '../output/'
    
config = {'MAX_LEN':128,
          'tokenizer': AutoTokenizer.from_pretrained('albert-base-v2' , do_lower_case=True),
          'batch_size':64,
          'Epoch': 4,
          'test_path':test_path, 
          'device': 'cuda' if torch.cuda.is_available() else 'cpu',
          'model_path':model_path,
          'model_name':model_name
         }

In [3]:
def clean_text(txt):
    return re.sub('[^A-Za-z0-9]+', ' ', str(txt).lower())

In [4]:
def data_joining(data_dict_id):
    '''
    This function is to join all the text data from different sections in the json to a single
    text file. 
    '''
    data_length = len(data_dict_id)

    #     temp = [clean_text(data_dict_id[i]['text']) for i in range(data_length)]
    temp = [data_dict_id[i]['text'] for i in range(data_length)]
    temp = '. '.join(temp)
    
    return temp

# Reading the dataset

In [5]:
unique_df = pd.read_csv("../labelled_data/unique_train_df_5_len_128_cleaned_extra_labels_Albert.csv")
unique_df

Unnamed: 0,id,train_sentences,kword,label,sent_len
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,in fact organizations are now identifying digi...,['program for the international assessment of ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",45
1,0008656f-0ba2-4632-8602-3017b44c2e90,besides not enough young people are entering s...,['trends in international mathematics and scie...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",94
2,000e04d6-d6ef-442f-b070-4309493221ba,1 manages access to results of the agricultura...,['agricultural resources management survey'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'B', ...",26
3,000e04d6-d6ef-442f-b070-4309493221ba,the agricultural resources management survey a...,['agricultural resources management survey'],"['O', 'B', 'B', 'B', 'B', 'O', 'O', 'O', 'O', ...",29
4,000e04d6-d6ef-442f-b070-4309493221ba,the resulting statistics provide the fulcrum f...,['farm income and wealth statistics'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",19
...,...,...,...,...,...
58673,ffd4d86a-0f26-44cc-baed-f0e209cc22af,data used in the preparation of this article w...,['alzheimer s disease neuroimaging initiative ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",23
58674,ffe7f334-245a-4de7-b600-d7ff4e28bfca,interestingly the genome sequences of sars cov...,['genome sequences of sars cov 2'],"['O', 'O', 'O', 'B', 'B', 'B', 'B', 'B', 'B', ...",38
58675,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,as part of the program for international stude...,['trends in international mathematics and scie...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",66
58676,ffee2676-a778-4521-b947-e1e420b126c5,analysis considered first time beginning posts...,['beginning postsecondary students'],"['O', 'O', 'O', 'O', 'B', 'B', 'B', 'B', 'O', ...",37


# Dropping all the sentences less than 512

In [6]:
unique_df = unique_df.loc[unique_df.sent_len<512, :].reset_index(drop=True)
unique_df

Unnamed: 0,id,train_sentences,kword,label,sent_len
0,0007f880-0a9b-492d-9a58-76eb0b0e0bd7,in fact organizations are now identifying digi...,['program for the international assessment of ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",45
1,0008656f-0ba2-4632-8602-3017b44c2e90,besides not enough young people are entering s...,['trends in international mathematics and scie...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",94
2,000e04d6-d6ef-442f-b070-4309493221ba,1 manages access to results of the agricultura...,['agricultural resources management survey'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B', 'B', ...",26
3,000e04d6-d6ef-442f-b070-4309493221ba,the agricultural resources management survey a...,['agricultural resources management survey'],"['O', 'B', 'B', 'B', 'B', 'O', 'O', 'O', 'O', ...",29
4,000e04d6-d6ef-442f-b070-4309493221ba,the resulting statistics provide the fulcrum f...,['farm income and wealth statistics'],"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",19
...,...,...,...,...,...
58673,ffd4d86a-0f26-44cc-baed-f0e209cc22af,data used in the preparation of this article w...,['alzheimer s disease neuroimaging initiative ...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",23
58674,ffe7f334-245a-4de7-b600-d7ff4e28bfca,interestingly the genome sequences of sars cov...,['genome sequences of sars cov 2'],"['O', 'O', 'O', 'B', 'B', 'B', 'B', 'B', 'B', ...",38
58675,ffeb3568-7aed-4dbe-b177-cbd7f46f34af,as part of the program for international stude...,['trends in international mathematics and scie...,"['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ...",66
58676,ffee2676-a778-4521-b947-e1e420b126c5,analysis considered first time beginning posts...,['beginning postsecondary students'],"['O', 'O', 'O', 'O', 'B', 'B', 'B', 'B', 'O', ...",37


# Take unqiue of the dataset

In [7]:
unique_df = unique_df.drop_duplicates()
unique_df.shape

(56987, 5)

# Taking the sample of the dataset

In [8]:
# unique_df = unique_df.sample(int(unique_df.shape[0]*0.5)).reset_index(drop=True)
unique_df.shape

(56987, 5)

# Train and validation split

In [9]:
np.random.seed(100)
train_df, valid_df = train_test_split(unique_df, test_size=0.05)

train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print(train_df.shape, valid_df.shape)

(54137, 5) (2850, 5)


# Converting the DataFrame back to list

In [10]:
tags_2_idx = {'O': 0 , 'B': 1, 'P': 2}

def dataset_2_list(df):
    id_list = df.id.values.tolist()
    sentences_list = df.train_sentences.values.tolist()
    keywords_list = df.kword.apply(lambda x : eval(x)).values.tolist()
    
    labels_list = df.label.apply(lambda x : eval(x)).values.tolist()    
    labels_list = [list(map(tags_2_idx.get, lab)) for lab in labels_list]
    
    return id_list, sentences_list, keywords_list, labels_list

final_train_id_list, final_train_sentences, final_train_keywords, final_train_labels = dataset_2_list(df=unique_df)
final_valid_id_list, final_valid_sentences, final_valid_keywords, final_valid_labels = dataset_2_list(df=valid_df)

In [11]:
# # verification
# ind = 800
# final_train_sentences[ind], config['tokenizer'].tokenize(final_train_sentences[ind]), final_train_keywords[ind], final_train_labels[ind]

# Forming the input

In [12]:
class form_input():
    def __init__(self, ID, sentence, kword, label, data_type='test'):
        self.id = ID
        self.sentence = sentence
        self.kword = kword
        self.label = label
        self.max_length = config['MAX_LEN']
        self.tokenizer = config['tokenizer']
        self.data_type = data_type
    
    def __len__(self):
        return len(self.sentence)
    
    def __getitem__(self, item):
        toks = config['tokenizer'].tokenize(self.sentence[item])
        
        if len(toks)>self.max_length:
            toks = toks[:self.max_length]
            
        ########################################
        # Forming the inputs
        ids = config['tokenizer'].convert_tokens_to_ids(toks)
        tok_type_id = [0] * len(ids)
        att_mask = [1] * len(ids)
        
        # Padding
        pad_len = self.max_length - len(ids)        
        ids = ids + [2] * pad_len
        tok_type_id = tok_type_id + [0] * pad_len
        att_mask = att_mask + [0] * pad_len
        
        ########################################
        # Forming the label
        if self.data_type != 'test':
            label = self.label[item] 
            if len(label)>self.max_length:
                label = label[:self.max_length]
            else:
                label = label + [2] * pad_len

        else:
            label = 1
        ########################################
        
#         print(item, len(ids), len(tok_type_id), len(att_mask), len(label))
        
        return {'pub_id': self.id[item],
                'item': item,
                #'sentence': self.sentence[item],
                #'kword' : self.kword[item],
                'ids': torch.tensor(ids, dtype = torch.long),
                'tok_type_id': torch.tensor(tok_type_id, dtype = torch.long),
                'att_mask': torch.tensor(att_mask, dtype = torch.long),
                'target': torch.tensor(label, dtype = torch.long)
               }
            

# Define the dataloader

In [14]:
train_prod_input = form_input(ID=final_train_id_list, 
                              sentence=final_train_sentences, 
                              kword=final_train_keywords, 
                              label=final_train_labels, 
                              data_type='train')

valid_prod_input = form_input(ID=final_valid_id_list, 
                              sentence=final_valid_sentences, 
                              kword=final_valid_keywords, 
                              label=final_valid_labels, 
                              data_type='valid')

train_prod_input_data_loader = DataLoader(train_prod_input, 
                                          batch_size= config['batch_size'], 
                                          shuffle=True)

valid_prod_input_data_loader = DataLoader(valid_prod_input, 
                                          batch_size= config['batch_size'], 
                                          shuffle=True)


In [15]:
# ind = 8443
# train_prod_input[ind]

In [16]:
ind = 360
len(train_prod_input[ind]['ids']), len(train_prod_input[ind]['att_mask']), len(train_prod_input[ind]['tok_type_id']), len(train_prod_input[ind]['target'])

(128, 128, 128, 128)

In [18]:
def flat_accuracy(preds, labels):
    actual_flat = labels.flatten()
    pred_flat = preds.flatten()
    
    # Get all non-padded index
    non_padded_index = np.where(actual_flat != tags_2_idx["P"])[0]
    
    # Only non-padded positions for the actual and the predicted
    actual_flat = actual_flat[non_padded_index]
    pred_flat = pred_flat[non_padded_index]
    
    print(confusion_matrix(actual_flat, pred_flat))
    
    # Accuracy
    Accuracy = np.sum(actual_flat == pred_flat ) / len(pred_flat )
    
    # Recall calculator
    condition = (actual_flat == 1)
    actual_index = np.where(condition)[0]
    actual_flat_rec = actual_flat[actual_index]
    pred_flat_rec = pred_flat[actual_index]
    Recall = np.sum(actual_flat_rec == pred_flat_rec) / len(pred_flat_rec)
    
    # Precision calculator
    condition = (pred_flat == 1)
    pred_index = np.where(condition)[0]
    actual_flat_prc = actual_flat[pred_index]
    pred_flat_prc = pred_flat[pred_index]
    Precision = np.sum(actual_flat_prc == pred_flat_prc) / len(pred_flat_prc)

    # Jaccard similarity
    common_index = np.intersect1d(actual_index, pred_index)
    jaccard_similarity = float(len(common_index))/( len(actual_index) + len(pred_index) - len(common_index))
    
    return Accuracy, Recall, Precision, jaccard_similarity



In [19]:
def train_fn(data_loader, model, optimizer):
    
    model.train()
    train_loss = 0
    for index, dataset in enumerate(tqdm(data_loader, total = len(data_loader))):
        batch_input_ids = dataset['ids'].to(config['device'], dtype = torch.long)
        batch_att_mask = dataset['att_mask'].to(config['device'], dtype = torch.long)
        batch_tok_type_id = dataset['tok_type_id'].to(config['device'], dtype = torch.long)
        batch_target = dataset['target'].to(config['device'], dtype = torch.long)
                
        model.zero_grad()
        output = model(batch_input_ids, 
                       token_type_ids=None,
                       attention_mask=batch_att_mask,
                       labels=batch_target)
        
        step_loss = output[0]
        prediction = output[1]
        
        step_loss.sum().backward()
        optimizer.step()        
        train_loss += step_loss
        optimizer.zero_grad()
        
    tr_loss = train_loss.sum()/len(data_loader)
    
    return tr_loss

In [20]:
def eval_fn(data_loader, model):
    
    model.eval()
    
    eval_loss = 0
    predictions = np.array([], dtype = np.int64).reshape(0, config['MAX_LEN'])
    true_labels = np.array([], dtype = np.int64).reshape(0, config['MAX_LEN'])
    
    with torch.no_grad():
        for index, dataset in enumerate(tqdm(data_loader, total = len(data_loader))):
            batch_input_ids = dataset['ids'].to(config['device'], dtype = torch.long)
            batch_att_mask = dataset['att_mask'].to(config['device'], dtype = torch.long)
            batch_tok_type_id = dataset['tok_type_id'].to(config['device'], dtype = torch.long)
            batch_target = dataset['target'].to(config['device'], dtype = torch.long)

            output = model(batch_input_ids, 
                           token_type_ids=None,
                           attention_mask=batch_att_mask,
                           labels=batch_target)

            step_loss = output[0]
            eval_prediction = output[1]

            eval_loss += step_loss
            
            eval_prediction = np.argmax(eval_prediction.detach().to('cpu').numpy(), axis = 2)
            actual = batch_target.to('cpu').numpy()
            
            predictions = np.concatenate((predictions, eval_prediction), axis = 0)
            true_labels = np.concatenate((true_labels, actual), axis = 0)
           
        ev_loss = eval_loss.sum()/len(data_loader)
        Accuracy, Recall, Precision, jaccard_similarity = flat_accuracy(preds=predictions , labels=true_labels)
        
        print(f'Eval Accuracy: {Accuracy}, Recall: {Recall}, Precision: {Precision}, jaccard_similarity: {jaccard_similarity}')
            
    return ev_loss, Precision, predictions, true_labels

In [21]:
def train_engine(epoch, train_data, valid_data):
    model = transformers.AlbertForTokenClassification.from_pretrained('albert-base-v2',  num_labels = len(tags_2_idx))
    model = nn.DataParallel(model)
    model = model.to(config['device'])
    
    params = model.parameters()
    optimizer = torch.optim.Adam(params, lr= 3e-5)
    
    best_eval_loss = 1000000
    best_Precision = 0
    for i in range(epoch):
        train_loss = train_fn(data_loader = train_data, 
                              model=model, 
                              optimizer=optimizer)
        eval_loss, Precision, eval_predictions, true_labels = eval_fn(data_loader = valid_data, 
                                                                      model=model)
        
        print(f"Epoch {i} , Train loss: {train_loss}, Eval loss: {eval_loss}")

#         if Precision > best_Precision:
#             best_Precision = Precision           
        
        saving_name = config['model_path'] + f"epoch_{i}_" + config['model_name']
        print("Saving the model:", saving_name)
        torch.save(model.state_dict(), saving_name)
            
    return model, eval_predictions, true_labels 

In [22]:
model, eval_predictions, eval_true_labels = train_engine(epoch=config['Epoch'], 
                                                         train_data=train_prod_input_data_loader, 
                                                         valid_data=valid_prod_input_data_loader)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForTokenClassification: ['predictions.dense.weight', 'predictions.decoder.bias', 'predictions.bias', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForTokenClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably

[[107639      8]
 [ 14844     19]]
Eval Accuracy: 0.8787690800750959, Recall: 0.0012783421920204534, Precision: 0.7037037037037037, jaccard_similarity: 0.0012776544953264743
Epoch 0 , Train loss: 0.3496091961860657, Eval loss: 0.34251123666763306
Saving the model: ../output/epoch_0_albert_base_uncased_cleaned_extra_label_100per_data.bin


100%|██████████| 891/891 [06:55<00:00,  2.14it/s]
100%|██████████| 45/45 [00:08<00:00,  5.50it/s]
  0%|          | 0/891 [00:00<?, ?it/s]

[[107376    271]
 [   182  14681]]
Eval Accuracy: 0.9963023426659048, Recall: 0.987754827423804, Precision: 0.9818753344034243, jaccard_similarity: 0.9700673979119863
Epoch 1 , Train loss: 0.04204551503062248, Eval loss: 0.01248400378972292
Saving the model: ../output/epoch_1_albert_base_uncased_cleaned_extra_label_100per_data.bin


100%|██████████| 891/891 [06:55<00:00,  2.14it/s]
100%|██████████| 45/45 [00:08<00:00,  5.55it/s]
  0%|          | 0/891 [00:00<?, ?it/s]

[[107468    179]
 [   114  14749]]
Eval Accuracy: 0.9976083585013469, Recall: 0.9923299468478772, Precision: 0.9880091103965702, jaccard_similarity: 0.9805212072862651
Epoch 2 , Train loss: 0.010658901184797287, Eval loss: 0.007601642981171608
Saving the model: ../output/epoch_2_albert_base_uncased_cleaned_extra_label_100per_data.bin


100%|██████████| 891/891 [06:55<00:00,  2.14it/s]
100%|██████████| 45/45 [00:08<00:00,  5.47it/s]


[[107489    158]
 [   131  14732]]
Eval Accuracy: 0.9976410088972328, Recall: 0.991186166991859, Precision: 0.9893888515782404, jaccard_similarity: 0.9807602689567938
Epoch 3 , Train loss: 0.008217979222536087, Eval loss: 0.007283493876457214
Saving the model: ../output/epoch_3_albert_base_uncased_cleaned_extra_label_100per_data.bin
