In [1]:
!pip install transformers
!pip install datasets
!pip install seqeval
!pip install wandb

Collecting transformers
  Downloading transformers-4.16.1-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 7.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 18.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 47.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.10.1
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 58.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
  

In [2]:
# huggingface library
from transformers import (RobertaForTokenClassification, 
                          RobertaTokenizerFast, RobertaConfig, 
                          DataCollatorForTokenClassification, 
                          get_cosine_schedule_with_warmup, AdamW)

from datasets import load_metric

#pytorch library
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast

# pandas/numpy
import pandas as pd
import numpy as np

# python libraries
import random
import os
import glob
import json

# visualization and timer
from tqdm.notebook import tqdm

# sklearn
from sklearn.model_selection import train_test_split

## Logging on to wandb

In [3]:
import wandb


In [4]:
class CONFIG:
    seed = 24
    DEBUG = False #will only use 5% of the data for code debugging

    #base model
    base_hf_model = "roberta-large"

    #training params
    train_batchsize= 4
    val_batchsize= 4
    epochs = 8
    gradient_accumulation = 1 ## As we are using a very big model, we want to accumulate the gradient before we update the weights

    #token mapping
    id2label={
        0:"O",
        1:"B-ORG",
        2:"I-ORG",
        3:"B-SEG",
        4:"I-SEG",
        5:"B-SEGNUM",
        6:"I-SEGNUM"
    }
    label2id = {v:k for k,v in id2label.items()} #reverse mapping
    num_labels = len(id2label)

    #optimizer config (AdamW for this project)
    learning_rate = 5e-6

    #scheduler config (cosine annealing without restart for this project)
    warm_up_ratio = 0.1 


    # tokenizer setting
    tokenizer_max_length = 512  # this is consistent with model max length - do not change
    tokenizer_truncation = True #  do not change
    tokenizer_return_offsets_mapping = True # do not change


In [5]:
## converting to dictionary for weights and biases
config_dict = {}
for name, value in CONFIG.__dict__.items():
    if not name.startswith("__"):
        config_dict[name] = value

wandb.init(project="company_segment_ner", entity="zuozhe", config=config_dict)

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


## Seeding as much as possible

In [6]:
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

seed_everything(CONFIG.seed)

## Preparing the dataset

In [7]:
!gdown --id 1dYju9zo96ddl4EfcCv858ZxSHUZbAWmc -O training_df.csv

Downloading...
From: https://drive.google.com/uc?id=1dYju9zo96ddl4EfcCv858ZxSHUZbAWmc
To: /content/training_df.csv
100% 5.82M/5.82M [00:00<00:00, 43.3MB/s]


In [8]:
training_df = pd.read_csv("training_df.csv", index_col=0)
training_df.head()

Unnamed: 0,text,label
0,KAM HING INTERNATIONAL HOLDINGS LIMITED is an ...,"[{""start"": 0, ""end"": 39, ""text"": ""KAM HING INT..."
1,Peking University Resources (Holdings) Company...,"[{""start"": 0, ""end"": 54, ""text"": ""Peking Unive..."
2,Rare Earth Magnesium Technology Group Holdings...,"[{""start"": 0, ""end"": 54, ""text"": ""Rare Earth M..."
3,Chu Kong Shipping Enterprises (Group) Company ...,"[{""start"": 0, ""end"": 53, ""text"": ""Chu Kong Shi..."
4,Superland Group Holdings Ltd is an investment ...,"[{""start"": 0, ""end"": 28, ""text"": ""Superland Gr..."


In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained(CONFIG.base_hf_model, use_fast=True, add_prefix_space=True)

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/482 [00:00<?, ?B/s]

In [10]:
def convert_single_example(text, json_labels, tokenizer, config):
    """
    This function expects the use of huggingface fasttokenizers
    This function converts the labels into class tokens. It will add in -100 in the first and the last index as hf will ignore the loss for -100 classes
    This function will also cross check the word alignment so that we know that are getting the correct text
    """
    if len(text) == 0:
        raise ValueError("input text is empty!")

    model_inputs = tokenizer(text, padding=False, max_length=config.tokenizer_max_length, 
                             return_offsets_mapping=config.tokenizer_return_offsets_mapping, 
                             truncation=config.tokenizer_truncation)
    tokenized_length = len(model_inputs['input_ids'])
    tokenized_labels= np.zeros(tokenized_length, dtype=np.int64)

    labels = json.loads(json_labels)
    try:
        for label in labels:
            # expected format from labelstudio csv 
            # {"start": 0, "end": 39, "text": "KAM HING INTERNATIONAL HOLDINGS LIMITED", "labels": ["ORG"]}

            ## due to mislabelling sometimes there are spaces at the front or the back, we want to remove those
            # TODO an improvement is to make it handle more than 1 space
            if label["text"][0]==' ':
                label['start'] +=1
            if label["text"][-1]==' ':
                label['end'] -=1

            token_begin = model_inputs.char_to_token(label['start'])
            token_end = model_inputs.char_to_token(label['end']-1) 
            label_text = label['labels'][0]

            # assigning all the I-XXX first
            tokenized_labels[token_begin:token_end] = config.label2id["I-"+label_text]
            # assigning the first token as begin
            tokenized_labels[token_begin] = config.label2id["B-"+label_text]

            ## checking if the mapping is correct
            token_start_idx = model_inputs['offset_mapping'][token_begin][0]
            token_end_idx = model_inputs['offset_mapping'][token_end][1]

            if text[token_start_idx:token_end_idx] != label['text'].strip():
                # Due to the heurestic tokenation, some of the tokens a combination of punctuation and hence we we are not able to seperate them unless 
                # train our own tokenizer
                if tokenizer.convert_ids_to_tokens(model_inputs['input_ids'][token_end]) not in [":,",".,",").","),",");",".),",".)",".;",".),"]:
                    print("Misalignment detected")
                    print("tokenized text")
                    print(text[token_start_idx:token_end_idx])
                    print("original text")
                    print(label['text'])
                    print(tokenizer.convert_ids_to_tokens(model_inputs['input_ids'][token_end]))
                    print(text)

    except Exception as e:
        print(text)
        print(label['text'])
        print(label['start'])
        print(model_inputs['offset_mapping'])
        raise ValueError()

    # first and last labels are -100 so that we will include them loss calculation
    tokenized_labels[0] = -100
    tokenized_labels[-1] = -100

    return tokenized_labels

In [11]:
sample_list = []

num_training_examples = len(training_df)

for i in tqdm(range(num_training_examples)):
    row = training_df.iloc[i]
    single_sample= convert_single_example(row['text'], row['label'],tokenizer, CONFIG)
    sample_list.append(single_sample)


  0%|          | 0/4121 [00:00<?, ?it/s]

In [12]:
training_df['tokenized_labels'] = sample_list
training_df.head()

Unnamed: 0,text,label,tokenized_labels
0,KAM HING INTERNATIONAL HOLDINGS LIMITED is an ...,"[{""start"": 0, ""end"": 39, ""text"": ""KAM HING INT...","[-100, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, ..."
1,Peking University Resources (Holdings) Company...,"[{""start"": 0, ""end"": 54, ""text"": ""Peking Unive...","[-100, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, ..."
2,Rare Earth Magnesium Technology Group Holdings...,"[{""start"": 0, ""end"": 54, ""text"": ""Rare Earth M...","[-100, 1, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, ..."
3,Chu Kong Shipping Enterprises (Group) Company ...,"[{""start"": 0, ""end"": 53, ""text"": ""Chu Kong Shi...","[-100, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, ..."
4,Superland Group Holdings Ltd is an investment ...,"[{""start"": 0, ""end"": 28, ""text"": ""Superland Gr...","[-100, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Train Test split

In [13]:
if CONFIG.DEBUG:
    train_df, val_df = train_test_split(training_df.sample(len(training_df)//20), test_size=0.1, shuffle=True, random_state=CONFIG.seed)
else: 
    train_df, val_df = train_test_split(training_df, test_size=0.1, shuffle=True, random_state=CONFIG.seed)

## For simplicity, we will be using Huggingface NER model (linear at the top)
The advantage is that we will be able to use huggingface pipeline for postprocessing


In [14]:
model = RobertaForTokenClassification.from_pretrained(CONFIG.base_hf_model, num_labels=CONFIG.num_labels)
model.config.id2label=CONFIG.id2label # changing the default label mapping 
model.config.label2id=CONFIG.label2id # changing the default label mapping 

Downloading:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be ab

## Creating the dataset

In [15]:
class SegmentDataSet(Dataset):
    def __init__(self, df, tokenizer):
        super().__init__()
        self.texts = df['text'].tolist()
        self.labels = df['tokenized_labels'].tolist()
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        model_inputs = self.tokenizer(self.texts[idx], padding=False,
                                      truncation=CONFIG.tokenizer_truncation)
        
        model_inputs['labels'] =self.labels[idx].tolist()
        return model_inputs



In [16]:
train_ds = SegmentDataSet(train_df, tokenizer)
val_ds = SegmentDataSet(val_df, tokenizer)

# dynamic batching to reduce training time
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer,
                                                   pad_to_multiple_of=8)

train_loader = DataLoader(train_ds, collate_fn=data_collator, shuffle=True, batch_size=CONFIG.train_batchsize, pin_memory=True)
val_loader = DataLoader(val_ds, collate_fn=data_collator, shuffle=False, batch_size=CONFIG.val_batchsize, pin_memory=True)

## Training

In [17]:
class AvgMeter:
    """
    This is a utility class to store the loss and metrics
    """
    def __init__(self):
        self.total = 0
        self.count = 0

    def update(self, value, count):
        self.total += value
        self.count += count

    def get_average(self):
        return self.total/self.count

def calculate_seq_eval(logits_batches, labels_batches, seqeval_metric):
    """ This function calculate the seqeval between list of logits and list of true labels """
    
    cleaned_predictions = []
    cleaned_labels = []
    
    for logits, labels in zip(logits_batches,labels_batches):
        predictions = np.argmax(logits, axis=2) # logits shape (batch, seq_length, classes)

        assert len(logits) == len(labels), "logits and labels are of different shape!"

        # we will need to remove the padding and special tokens
        
        for pred, label in zip(predictions, labels):
            cleaned_prediction = []
            cleaned_label = []
            for pred_token, label_token in zip(pred, label):
                if label_token != -100:
                    cleaned_prediction.append(CONFIG.id2label[pred_token])
                    cleaned_label.append(CONFIG.id2label[label_token])

            cleaned_predictions.append(cleaned_prediction)
            cleaned_labels.append(cleaned_label)

    results = seqeval_metric.compute(predictions=cleaned_predictions, references=cleaned_labels)   
    return {
            "precision": results["overall_precision"],
            "recall": results["overall_recall"],
            "f1": results["overall_f1"],
            "accuracy": results["overall_accuracy"],
        }
        


def train_step(train_loader, model, optimizer, scheduler, scaler, device):
    """ This function define a single training loop thru the entire dataset """
    model.train()
    loss_meter = AvgMeter()
    progbar = tqdm(train_loader, total=len(train_loader))
    for step, model_inputs in enumerate(progbar):
        with autocast():
            model_inputs = {k:v.to(device) for k,v in model_inputs.items()}
            model_output = model(**model_inputs)
            loss = model_output.loss
            # loss = loss / CONFIG.gradient_accumulation

        # logging the loss
        loss_meter.update(loss.detach().item(), len(model_inputs['input_ids']))

        # if step % CONFIG.gradient_accumulation == 0 or step == len(train_loader) -1: #after last step
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        scheduler.step()
        optimizer.zero_grad()

        progbar.set_postfix({'running_loss':loss_meter.get_average()})
    
    return {"train_loss" :loss_meter.get_average()}

def eval_step(val_loader, model, seqeval_metric, device):
    model.eval()
    loss_meter = AvgMeter()
    progbar = tqdm(val_loader, total=len(val_loader))

    logits_list = []
    labels_list = []

    for model_input in progbar:
        with torch.no_grad():
            model_input = {k:v.to(device) for k, v in model_input.items()}
            model_output = model(**model_input)
            logits = model_output.logits.cpu().detach().numpy()
            labels = model_input['labels'].cpu().detach().numpy()
            loss = model_output.loss

            # storing all the logits and labesl
            logits_list.append(logits)
            labels_list.append(labels)

            # logging the loss
            loss_meter.update(loss.detach().item(), len(model_input['input_ids']))

        progbar.set_postfix({'running_loss':loss_meter.get_average()})
    
    ## calculating the seqeval scores
    result = calculate_seq_eval(logits_list, labels_list, seqeval_metric)
    print(result) # to be logged to wandb

    return {"val_loss" :loss_meter.get_average(), **result}



## Main training loop

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

model.to(device)
optimizer = AdamW(model.parameters(), lr=CONFIG.learning_rate)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(len(train_loader)*CONFIG.epochs*CONFIG.warm_up_ratio), num_training_steps=len(train_loader)*CONFIG.epochs)
scaler = GradScaler()
seqeval_metric = load_metric("seqeval")



Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

In [19]:
best_evalscore = float('inf')

for epoch_step in range(CONFIG.epochs):
    print(f"EPOCH {epoch_step+1}/{CONFIG.epochs}")
    train_metrics = train_step(train_loader, model, optimizer, scheduler, scaler, device)
    eval_metrics = eval_step(val_loader, model, seqeval_metric, device)

    wandb.log({**train_metrics, ** eval_metrics})

    if eval_metrics["val_loss"] < best_evalscore:
        best_evalscore = eval_metrics["val_loss"]
        model.save_pretrained(f"best_epoch_{epoch_step}")

EPOCH 1/8


  0%|          | 0/927 [00:00<?, ?it/s]



  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.908641975308642, 'recall': 0.9328263624841572, 'f1': 0.9205753595997498, 'accuracy': 0.99521344436942}
EPOCH 2/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9264282778462803, 'recall': 0.9522602450359104, 'f1': 0.9391666666666667, 'accuracy': 0.9960087028284192}
EPOCH 3/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9343853820598007, 'recall': 0.9505703422053232, 'f1': 0.9424083769633508, 'accuracy': 0.9962187711006077}
EPOCH 4/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9311340206185567, 'recall': 0.9539501478664977, 'f1': 0.9424040066777963, 'accuracy': 0.9957986345562307}
EPOCH 5/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9387417218543046, 'recall': 0.9581749049429658, 'f1': 0.948358770646038, 'accuracy': 0.9963688198664566}
EPOCH 6/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9398090493980905, 'recall': 0.9564850021123785, 'f1': 0.9480737018425461, 'accuracy': 0.9964738540025508}
EPOCH 7/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9417394923012901, 'recall': 0.9560625264047318, 'f1': 0.948846960167715, 'accuracy': 0.9965638832620601}
EPOCH 8/8


  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/104 [00:00<?, ?it/s]

{'precision': 0.9413721413721414, 'recall': 0.9564850021123785, 'f1': 0.9488683989941324, 'accuracy': 0.9965488783854752}


In [20]:
wandb.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0,1
accuracy,▁▅▆▄▇███
f1,▁▆▆▆████
precision,▁▅▆▆▇███
recall,▁▆▆▇██▇█
train_loss,█▂▁▁▁▁▁▁
val_loss,█▃▂▂▁▁▁▁

0,1
accuracy,0.99655
f1,0.94887
precision,0.94137
recall,0.95649
train_loss,0.00226
val_loss,0.00347
