# Comet ml

In [1]:
# $ export COMET_API_KEY="ZNgNJ1VVgmaAbL0ga1t4mw3JI"

In [2]:
# !comet check

In [3]:
# import comet_ml

# comet_ml.init(project_name="roberta-base-ner")

# Arguments

In [4]:
model_checkpoint = "roberta-base"
SEED = 42

# Tokenizer

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


# Data loader

In [6]:
ner_tags = [
"MAT",
"NMAT",
"DIMENSION",
"WEIGHT",
"TARGET_USER",
"PROPERTY",
"COLOR",
"SHAPE",
"SIZE",
]

processed_ner_tags = ['O']
for tag in ner_tags:
        processed_ner_tags.extend([f"B-{tag}", f"I-{tag}"])

print(processed_ner_tags)
print(len(processed_ner_tags))

ner_tags_2_number = {t: i for (i, t) in enumerate(processed_ner_tags)}
number_2_ner_tags = {t: i for (t, i) in enumerate(ner_tags_2_number)}

['O', 'B-MAT', 'I-MAT', 'B-NMAT', 'I-NMAT', 'B-DIMENSION', 'I-DIMENSION', 'B-WEIGHT', 'I-WEIGHT', 'B-TARGET_USER', 'I-TARGET_USER', 'B-PROPERTY', 'I-PROPERTY', 'B-COLOR', 'I-COLOR', 'B-SHAPE', 'I-SHAPE', 'B-SIZE', 'I-SIZE']
19


In [7]:
import pandas as pd

raw_excel_dataset = pd.read_excel("../data/raw_data_restore_uppercase.xlsx")
raw_excel_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3952 entries, 0 to 3951
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   sentence  3952 non-null   object
 1   locs      3952 non-null   object
 2   words     3952 non-null   object
dtypes: object(3)
memory usage: 92.8+ KB


In [8]:
import ast

def assign_ner_tags_roberta(example):
    token_input = tokenizer(example['sentence'])
    example['tokens'] = tokenizer.convert_ids_to_tokens(
        token_input['input_ids'])

    ner_tags = [0 for token in example['tokens']]
    if str(type(example['locs'])) == "<class 'list'>":
        locs = example['locs']
    else:
        locs = ast.literal_eval(example['locs'])

    locs = [(int(loc[0]), int(loc[1]), loc[2]) for loc in locs]
    locs = sorted(locs)
    bg_id = 1
    pre_loc = 0
    text = example['sentence']
    for loc in locs:
        loc0 = int(loc[0])
        loc1 = int(loc[1])

        pre_text = text[pre_loc:loc0]
        if 0 < loc0 and len(pre_text) > 0 and pre_text[-1] == ' ':
            pre_text = text[pre_loc:loc0 - 1]
        token_input = tokenizer(pre_text)
        pre_token = tokenizer.convert_ids_to_tokens(
            token_input['input_ids'])
        bg_id = bg_id + len(pre_token) - 2
        pre_loc = loc1

        word = example['sentence'][loc0: loc1].strip()
        if loc0 > 0 and example['sentence'][loc0 - 1] == ' ':
            word = " " + word
        token_input = tokenizer(word)
        word_token = tokenizer.convert_ids_to_tokens(token_input['input_ids'])

        label_number = ner_tags_2_number[f"B-{loc[2]}"]
        ner_tags[bg_id] = label_number
        bg_id += 1
        for idx in range(bg_id, bg_id + len(word_token) - 3):
            ner_tags[idx] = label_number + 1
        bg_id = bg_id + len(word_token) - 3

        # visualize_ner_tags(example['tokens'], ner_tags)

    ner_tags[0] = -100
    ner_tags[-1] = -100
    return ner_tags

In [9]:
from datasets import Dataset, DatasetDict, ClassLabel, Features, Value, Sequence

list_input_ids = []
list_attention_mask = []
list_labels = []

for index, row in raw_excel_dataset.iterrows():
    try:
        label = assign_ner_tags_roberta(row)
        token_input = tokenizer(row['sentence'])
        list_input_ids.append(token_input['input_ids'])
        list_attention_mask.append(token_input['attention_mask'])
        list_labels.append(label)
    except Exception as error:
        print(error)
        break
        print(index)
        print(row['sentence'])

tokenized_datasets = pd.DataFrame()
tokenized_datasets['input_ids'] = pd.Series(list_input_ids)
tokenized_datasets['attention_mask'] = pd.Series(list_attention_mask)
tokenized_datasets['labels'] = pd.Series(list_labels)

train_dataset = Dataset.from_pandas(tokenized_datasets)
val_dataset = Dataset.from_pandas(tokenized_datasets)
test_dataset = Dataset.from_pandas(tokenized_datasets)

tokenized_datasets = DatasetDict(
    {"train": train_dataset, "val": val_dataset, "test": test_dataset})
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3952
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3952
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3952
    })
})

In [10]:
from torch.utils.data import DataLoader
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=4,
)
eval_dataloader = DataLoader(
    tokenized_datasets["test"], collate_fn=data_collator, batch_size=4
)

# Train model

In [11]:
def postprocess(predictions, labels):
    predictions = predictions.detach().cpu().clone().numpy()
    labels = labels.detach().cpu().clone().numpy()

    # Remove ignored index (special tokens) and convert to labels
    true_labels = [[label_names[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    return true_labels, true_predictions

In [12]:
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import numpy as np

def get_class_weight(train_dataset, num_class):
    list_label = []
    for label in train_dataset['labels']:
        for cl in label:
            if cl != -100:
                list_label.append(cl)
    class_weight = compute_class_weight(class_weight='balanced', classes=np.arange(num_class), y=list_label)
    return class_weight

class_weight = get_class_weight(train_dataset, len(processed_ner_tags))
print(class_weight)
print(len(class_weight))

[6.43653645e-02 4.08587006e+00 5.34026128e+00 1.58049209e+01
 4.86108108e+01 6.38254081e+00 1.04679316e+00 2.35418848e+01
 1.31862170e+01 6.54989075e+00 1.75988258e+01 1.85269881e+00
 1.18735147e+00 2.25388471e+01 9.56702128e+01 3.84316239e+01
 4.75820106e+01 5.76474359e+01 1.45048387e+02]
19


## Model classes

In [13]:
# https://www.kaggle.com/code/wuharlem/simple-bert-w-hinge-loss

from transformers import RobertaForTokenClassification, DebertaV2ForTokenClassification
from typing import List, Optional, Tuple, Union
from transformers.modeling_outputs import TokenClassifierOutput
from torch.nn import CrossEntropyLoss
import torch
 
class CustomRoberta(RobertaForTokenClassification):
  def __init__(self, config, class_weight=None):
    super().__init__(config)
    self.class_weight = torch.tensor(class_weight, dtype=torch.float32).to('cuda' if torch.cuda.is_available() else 'cpu')
    
  def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        sequence_output = outputs[0]
        
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.class_weight is None:
                loss_fct = CrossEntropyLoss()
            else:
                loss_fct = CrossEntropyLoss(weight=self.class_weight)
                
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            
        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )

## Setup training

In [14]:
from transformers import AutoModelForTokenClassification
from transformers.models.bert import modeling_bert
from transformers import RobertaForTokenClassification, DebertaV2ForTokenClassification

label_names = processed_ner_tags
id2label = {i: label for i, label in enumerate(label_names)}
label2id = {v: k for k, v in id2label.items()}

model = CustomRoberta.from_pretrained(
    model_checkpoint,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
    class_weight=class_weight
)

print(model.config)

Some weights of the model checkpoint at roberta-base were not used when initializing CustomRoberta: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing CustomRoberta from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing CustomRoberta from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of CustomRoberta were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-MAT",
    "2": "I-MAT",
    "3": "B-NMAT",
    "4": "I-NMAT",
    "5": "B-DIMENSION",
    "6": "I-DIMENSION",
    "7": "B-WEIGHT",
    "8": "I-WEIGHT",
    "9": "B-TARGET_USER",
    "10": "I-TARGET_USER",
    "11": "B-PROPERTY",
    "12": "I-PROPERTY",
    "13": "B-COLOR",
    "14": "I-COLOR",
    "15": "B-SHAPE",
    "16": "I-SHAPE",
    "17": "B-SIZE",
    "18": "I-SIZE"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-COLOR": 13,
    "B-DIMENSION": 5,
    "B-MAT": 1,
    "B-NMAT": 3,
    "B-PROPERTY": 11,
    "B-SHAPE": 15,
    "B-SIZE": 17,
    "B-TARGET_USER": 9,
    "B-WEIGHT": 7,
    "I-COLOR": 14,
    "I-DIMENSION": 

In [15]:
import evaluate
import numpy as np
from torch.optim import AdamW
from transformers import get_scheduler

metric = evaluate.load("seqeval")
optimizer = AdamW(model.parameters(), lr=2e-5)

num_train_epochs = 10
num_update_steps_per_epoch = len(train_dataloader)
num_training_steps = num_train_epochs * num_update_steps_per_epoch
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [16]:
from accelerate import Accelerator

accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader
)

In [17]:
from comet_ml import Experiment, ExistingExperiment
from comet_ml.integration.pytorch import log_model
import comet_ml

experiment = Experiment(
    api_key="ZNgNJ1VVgmaAbL0ga1t4mw3JI",
    project_name="roberta-base-ner-attribution-extraction",
    workspace="luunvt",
    log_code=True
)

hyper_params = {
    "model": model_checkpoint, 
    "num_epochs": num_train_epochs, 
    "optimizer": "adamW",
    "use_class_weight": True,
    "num_train_sample": len(raw_excel_dataset),
}
experiment.log_parameters(hyper_params)
experiment.add_tags(["roberta-ner-classweight"])

[1;38;5;39mCOMET INFO:[0m Couldn't find a Git repository in '/home/smooth/luunvt/attribute_extraction/notebooks' nor in any parent directory. Set `COMET_GIT_DIRECTORY` if your Git Repository is elsewhere.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/luunvt/roberta-base-ner-attribution-extraction/6a50fac91dd94653851024f12f69b059



In [None]:
import comet_ml
from tqdm.auto import tqdm
import torch
from datetime import datetime
import os
import logging

date_time = datetime.now()
format_date = date_time.strftime('%Y-%m-%d')
format_time = date_time.strftime('%H:%M:%S')

print(f"Date: {format_date}")
print(f"Time: {format_time}")

output_dir = f"../models/model_from_{format_date}/deberta-base_{format_time}"

best_f1_score = 0
    
for epoch in range(num_train_epochs):
    # Training
    model.train()
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

    # Evaluation
    model.eval()
    for batch in eval_dataloader:
        with torch.no_grad():
            outputs = model(**batch)

        predictions = outputs.logits.argmax(dim=-1)
        labels = batch["labels"]

        # Necessary to pad predictions and labels for being gathered
        predictions = accelerator.pad_across_processes(
            predictions, dim=1, pad_index=-100)
        labels = accelerator.pad_across_processes(
            labels, dim=1, pad_index=-100)

        predictions_gathered = accelerator.gather(predictions)
        labels_gathered = accelerator.gather(labels)

        true_predictions, true_labels = postprocess(
            predictions_gathered, labels_gathered)
        metric.add_batch(predictions=true_predictions, references=true_labels)

    results = metric.compute()
    print(
        f"epoch {epoch}:",
        {
            key: results[f"overall_{key}"]
            for key in ["precision", "recall", "f1", "accuracy"]
        },
    )

    if experiment:
        experiment.set_epoch(epoch)
        experiment.log_metric("precision", results["overall_precision"])
        experiment.log_metric("recall", results["overall_recall"])
        experiment.log_metric("f1", results["overall_f1"])
        experiment.log_metric("accuracy", results["overall_accuracy"])

    # Save and upload
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
    if accelerator.is_main_process and results[f"overall_f1"] > best_f1_score:
        output_ckpt = os.path.join(output_dir, f'best_f1')
        best_f1_score = results[f"overall_f1"]
        print(
            f"Save best f1 model at epoch {epoch} with better f1 score {best_f1_score}")
        tokenizer.save_pretrained(output_dir)
        unwrapped_model.save_pretrained(
            output_dir, save_function=accelerator.save)
    if (epoch + 1) % 5 == 0:
        output_ckpt = os.path.join(output_dir, f'epoch_{epoch + 1}')
        print(f"Save model at epoch {epoch + 1}")
        tokenizer.save_pretrained(output_ckpt)
        unwrapped_model.save_pretrained(
            output_ckpt, save_function=accelerator.save)
experiment.end()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Date: 2023-11-01
Time: 16:37:04
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
epoch 0: {'precision': 0.5900492355532522, 'recall': 0.19347438185062452, 'f1': 0.2914000511901715, 'accuracy': 0.7495010739346978}
Save best f1 model at epoch 0 with better f1 score 0.2914000511901715
epoch 1: {'precision': 0.6998358814891595, 'recall': 0.31530199252801994, 'f1': 0.4347382824028117, 'accuracy': 0.8375929816758063}
Save best f1 model at epoch 1 with better f1 score 0.434738282402811