In [1]:
%%capture
# Install the libraries used by stage 1 (entity extraction).
# The %%capture cell magic above keeps pip's output out of the notebook.
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install seqeval

# Training stage 1: Entity extraction
The training code is the same as in approach 1; the only differences are the input data and the post-processing of the predictions that are passed on to stage 2.

## Create dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# Load the stage-1 training sentences and their per-token tag strings.
data = pd.read_csv("data/subtask1_train.csv")
text = data['Text'].astype(str)
tag = data['Tags'].astype(str)

# Rewrite the numeric tag codes as letters: 0 -> O, 1 -> B, 2 -> I.
tag = tag.map(lambda codes: codes.replace('0', "O").replace('1', 'B').replace('2', 'I'))

# Split every sentence into tokens on single spaces.
text = [sentence.split(" ") for sentence in text]
print("Max length", max(len(tokens) for tokens in text))

# Collect the distinct tag letters that occur; 'O' is always included.
set_label = {'O'}
for tag_string in tag:
    set_label.update(tag_string.split(' '))
print("Total of tag:", len(set_label))

# Lookup tables between tag letters and integer class ids.
index2tag = {0: 'O', 1: 'B', 2: 'I'}
tag2index = {'O': 0, 'B': 1, 'I': 2}

# Encode each tag string as a list of class ids, one id per token.
tag = [[tag2index[letter] for letter in tag_string.split(" ")] for tag_string in tag]
train_dataset = Dataset.from_dict({'tokens': text, 'ner_tags': tag})

# Wrap the single training split in a DatasetDict.
data_dict = {'train': train_dataset}
data = DatasetDict(data_dict)

# Information about the dataset
print(data)


## Creating a Custom Model for Token Classification

In [None]:
import torch.nn as nn
from transformers import XLMRobertaConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers.models.roberta.modeling_roberta import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel

class XLMRobertaForTokenClassification(RobertaPreTrainedModel):
    """RoBERTa encoder body with a dropout + linear head applied to every token.

    The head maps each token's hidden state to ``config.num_labels`` logits.
    When ``labels`` are passed to ``forward`` a cross-entropy loss over all
    token positions is returned together with the logits.
    """
    config_class = XLMRobertaConfig

    def __init__(self, config):
        """Build the encoder and the classification head from ``config``."""
        super().__init__(config)
        self.num_labels = config.num_labels
        # Encoder body, created without the pooling layer
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        # Per-token classification head
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize the weights
        self.init_weights()

    def forward(self, input_ids=None, attention_mask=None, token_type_ids=None,
                labels=None, **kwargs):
        """Run the encoder and the head; compute the loss when labels are given.

        Returns a ``TokenClassifierOutput`` holding the loss (``None`` without
        labels), the logits, and the encoder's hidden states and attentions.
        """
        # Encoder representations; extra keyword arguments go to the encoder
        encoder_outputs = self.roberta(input_ids, attention_mask=attention_mask,
                                       token_type_ids=token_type_ids, **kwargs)
        # The first element of the encoder output feeds the head
        token_states = self.dropout(encoder_outputs[0])
        logits = self.classifier(token_states)
        if labels is None:
            loss = None
        else:
            # Flatten all dimensions except the class one before scoring
            flat_logits = logits.view(-1, self.num_labels)
            flat_labels = labels.view(-1)
            loss = nn.CrossEntropyLoss()(flat_logits, flat_labels)
        return TokenClassifierOutput(loss=loss, logits=logits,
                                     hidden_states=encoder_outputs.hidden_states,
                                     attentions=encoder_outputs.attentions)

## Loading a pretrained Model

In [None]:
# hide_output
from transformers import AutoConfig, AutoTokenizer
# Checkpoint name on the Hugging Face Hub for the pretrained xlm-roberta-base model
xlmr_model_name = "xlm-roberta-base"
xlmr_tokenizer = AutoTokenizer.from_pretrained(xlmr_model_name)
# Model configuration carrying the stage-1 tag set (O / B / I)
xlmr_config = AutoConfig.from_pretrained(
    xlmr_model_name,
    num_labels=len(index2tag),
    id2label=index2tag,
    label2id=tag2index,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [None]:
# hide_output
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Instantiate the custom token classifier from the pretrained checkpoint,
# then move it to the selected device.
xlmr_model = XLMRobertaForTokenClassification.from_pretrained(
    xlmr_model_name, config=xlmr_config)
xlmr_model = xlmr_model.to(device)

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of XLMRobertaForTokenClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tag_text(text, tags, model, tokenizer):
    """Tag one raw string with ``model`` and show the tokens next to their tags.

    Args:
        text: The raw input string.
        tags: Unused; kept so existing calls keep working.
        model: Callable returning a sequence whose first element is the
            logits tensor for ``input_ids``.
        tokenizer: Tokenizer used for BOTH the displayed tokens and the model
            input, so the two rows of the result always line up.

    Returns:
        A DataFrame with a "Tokens" row (sub-word tokens including special
        tokens) and a "Tags" row (predicted tag per token, via ``index2tag``).
    """
    # Tokenize once so the displayed tokens are exactly the ones fed to the model
    encoding = tokenizer(text, return_tensors="pt")
    tokens = encoding.tokens()
    input_ids = encoding.input_ids.to(device)
    # Inference only: no gradients are needed
    with torch.no_grad():
        # One score per tag in index2tag for every token
        outputs = model(input_ids)[0]
    # Take argmax to get most likely class per token
    predictions = torch.argmax(outputs, dim=2)
    # Convert to DataFrame
    preds = [index2tag[p] for p in predictions[0].cpu().numpy()]
    return pd.DataFrame([tokens, preds], index=["Tokens", "Tags"])


## Tokenizing Texts for NER

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-word tokens.

    ``examples["tokens"]`` holds the sentences as word lists and
    ``examples["ner_tags"]`` the matching tag ids. Only the first sub-word
    token of each word keeps the word's tag; special tokens and the remaining
    sub-word tokens are labelled -100. The aligned labels are stored under
    "labels" on the tokenizer output, which is returned.
    """
    tokenized_inputs = xlmr_tokenizer(examples["tokens"], truncation=True, max_length=256,
                                      is_split_into_words=True)
    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=batch_idx)
        # Pair every position with the word id of the position before it
        preceding = [None] + word_ids[:-1]
        aligned.append([
            word_labels[current] if current is not None and current != before else -100
            for before, current in zip(preceding, word_ids)
        ])
    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

In [None]:
def encode_dataset(corpus):
    """Apply ``tokenize_and_align_labels`` to ``corpus`` in batches.

    The raw 'ner_tags' and 'tokens' columns are removed from the result.
    """
    raw_columns = ['ner_tags', 'tokens']
    return corpus.map(tokenize_and_align_labels, batched=True,
                      remove_columns=raw_columns)

In [None]:
# hide_output
# Tokenize the dataset and attach the aligned label ids (see encode_dataset).
data_encoded = encode_dataset(data)

  0%|          | 0/40 [00:00<?, ?ba/s]

## Performance Measures

In [None]:
from seqeval.metrics import classification_report

# Toy example for the entity-level report: two sentences with gold tags ...
first_gold = ["O", "O", "O", "B-MISC", "I-MISC", "I-MISC", "O"]
second_gold = ["B-PER", "I-PER", "O"]
# ... and predictions where the MISC entity starts one token too early.
first_pred = ["O", "O", "B-MISC", "I-MISC", "I-MISC", "I-MISC", "O"]
second_pred = ["B-PER", "I-PER", "O"]

y_true = [first_gold, second_gold]
y_pred = [first_pred, second_pred]
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

        MISC       0.00      0.00      0.00         1
         PER       1.00      1.00      1.00         1

   micro avg       0.50      0.50      0.50         2
   macro avg       0.50      0.50      0.50         2
weighted avg       0.50      0.50      0.50         2



In [None]:
import numpy as np

def align_predictions(predictions, label_ids):
    """Turn raw scores and label ids into per-example lists of tag strings.

    ``predictions`` is reduced with argmax over its last (third) axis.
    Positions whose label id is -100 are dropped from both outputs; the rest
    are mapped through ``index2tag``.

    Returns:
        ``(preds_list, labels_list)`` - predicted tags first, gold tags second.
    """
    preds = np.argmax(predictions, axis=2)
    batch_size, seq_len = preds.shape
    preds_list, labels_list = [], []

    for row in range(batch_size):
        # Positions that carry a real label (label id is not -100)
        kept = [col for col in range(seq_len) if label_ids[row, col] != -100]
        labels_list.append([index2tag[label_ids[row][col]] for col in kept])
        preds_list.append([index2tag[preds[row][col]] for col in kept])

    return preds_list, labels_list

## Fine-Tuning XLM-RoBERTa

In [None]:
# hide_output
from transformers import TrainingArguments

# Training schedule
num_epochs = 25
batch_size = 16
# Number of full batches in one pass over the training split
# (not passed to TrainingArguments below).
logging_steps = len(data_encoded["train"]) // batch_size
# Name of the model repository that will be pushed to Hugging Face
model_name = "your_model_name"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="epoch",
    save_strategy="no",
    logging_strategy="no",
    log_level="error",
    disable_tqdm=False,
    push_to_hub=True,
)

In [None]:
# Log in with the access token of your Hugging Face account.
# The training arguments above set push_to_hub=True, so the trained model is uploaded to that account.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from seqeval.metrics import f1_score

def compute_metrics(eval_pred):
    """Return the F1 score of an evaluation pass as ``{"f1": value}``.

    ``eval_pred.predictions`` and ``eval_pred.label_ids`` are converted to tag
    sequences with ``align_predictions`` before being scored by ``f1_score``.
    """
    aligned = align_predictions(eval_pred.predictions, eval_pred.label_ids)
    y_pred, y_true = aligned
    score = f1_score(y_true, y_pred)
    return {"f1": score}

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator built from xlmr_tokenizer; handed to the Trainer to assemble batches
# (presumably pads inputs and labels per batch - see the transformers docs).
data_collator = DataCollatorForTokenClassification(xlmr_tokenizer)

In [None]:
def model_init():
    """Create a fresh token classifier from the pretrained checkpoint.

    The model is built from ``xlmr_model_name`` with ``xlmr_config`` and moved
    to ``device`` before being returned.
    """
    fresh_model = XLMRobertaForTokenClassification.from_pretrained(
        xlmr_model_name, config=xlmr_config)
    return fresh_model.to(device)

In [None]:
from transformers import Trainer

# NOTE(review): eval_dataset is the training split itself, so the reported F1
# is measured on data the model was trained on.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=data_encoded["train"],
    eval_dataset=data_encoded["train"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=xlmr_tokenizer,
)

In [None]:
# Train the custom token-classification model, then upload the result to
# Hugging Face with the commit message below.
trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

# Training stage 2: Entity classification

In [None]:
# Install / upgrade the libraries used by stage 2 (entity classification).
!pip install --upgrade transformers
!pip install transformers[sentencepiece]
!pip install transformers[torch]
!pip install datasets

Collecting transformers
  Downloading transformers-4.38.1-py3-none-any.whl (8.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.5/8.5 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.37.2
    Uninstalling transformers-4.37.2:
      Successfully uninstalled transformers-4.37.2
Successfully installed transformers-4.38.1
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
Collecting di

## Create dataset

In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset
import torch

# Load the stage-2 examples: one text and one integer class id per row.
stage2_frame = pd.read_csv("data/subtask1_approach2_stage2.csv")
text = stage2_frame['Text'].astype(str).tolist()
label = stage2_frame['Tags'].astype(int).tolist()

# Wrap the examples as the single 'train' split of a DatasetDict.
train_dataset = Dataset.from_dict({'text': text, 'label': label})
data = DatasetDict({'train': train_dataset})
data


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3241
    })
})

## Fine-tuning XLM-RoBERTa for Entity Classification

In [None]:
# Log in with the access token of your Hugging Face account.
# The stage-2 training arguments below set push_to_hub=True, so the trained model is uploaded to that account.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
if torch.cuda.is_available() == True:
  !nvidia-smi
else:
  print("Not found GPU")
def CSI_tokenize(batch):
  """Tokenize the 'text' column of ``batch`` with padding and truncation enabled.

  Uses the module-level ``CSI_tokenizer``, which must exist by the time this
  function is called.
  """
  texts = batch['text']
  return CSI_tokenizer(texts, truncation=True, padding=True)

Wed Feb 28 04:46:24 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   46C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import random
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
from sklearn.metrics import classification_report, f1_score
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import tensorflow as tf
import torch
# Set random seed for every random number generator in use, so runs are repeatable.
random_seed = 42
random.seed(random_seed)
# NumPy is imported above but was previously left unseeded.
np.random.seed(random_seed)
tf.random.set_seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(random_seed)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained checkpoint with a sequence-classification head.
model_id = "xlm-roberta-base"
# Number of entity classes predicted in stage 2
num_labels = 13
CSI_model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=num_labels)
CSI_model = CSI_model.to(device)
CSI_tokenizer = AutoTokenizer.from_pretrained(model_id)
# batch_size=None hands each split to CSI_tokenize as a single batch.
data_encoded = data.map(CSI_tokenize, batched=True, batch_size=None)

# Set up parameters
batch_size = 16
# Number of full batches in one pass over the training split; reused below as
# the evaluation and checkpoint interval.
logging_steps = len(data_encoded['train']) // batch_size
model_name = "your_model_name_stage2"

# Training configuration; the result is pushed to Hugging Face
training_args = TrainingArguments(
    output_dir=model_name,
    seed=random_seed,
    num_train_epochs=20,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    disable_tqdm=False,
    evaluation_strategy="steps",
    eval_steps=logging_steps,
    save_strategy="steps",
    save_steps=logging_steps,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    push_to_hub=True,
)

# NOTE(review): eval_dataset is the training split itself, so early stopping
# and best-model selection are driven by the loss on the training data.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
trainer = Trainer(
    model=CSI_model,
    args=training_args,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['train'],
    tokenizer=CSI_tokenizer,
    callbacks=[early_stopping],
)
trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

# Combine the two stages for end-to-end prediction

## Load model and create process function for stage 1

In [None]:
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import torch
# Load the fine-tuned stage-1 tokenizer and token-classification model from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained("your_user_name/your_model_name") # replace your_user_name with the user name of your Hugging Face account
model = AutoModelForTokenClassification.from_pretrained("your_user_name/your_model_name")

def Entity_detection(text):
    """Predict one stage-1 tag per space-separated word of ``text``.

    The sentence is split on single spaces and tokenized ONCE as pre-split
    words (the same way the training data was tokenized). The same encoding
    supplies both the model input and the word ids, so every prediction is
    read at the position of the word it belongs to.

    Args:
        text: The sentence, with words separated by single spaces.

    Returns:
        The predicted tags joined by single spaces, exactly one tag per word.
        Each word takes the prediction of its first sub-word token. Words that
        produce no token (e.g. empty strings from consecutive spaces) or that
        fall beyond the truncation limit are tagged 'O'.
    """
    words = text.split(" ")
    encoding = tokenizer(words, is_split_into_words=True, truncation=True,
                         return_tensors="pt")
    word_ids = encoding.word_ids()

    with torch.no_grad():
        logits = model(**encoding).logits
    predictions = torch.argmax(logits, dim=2)[0]

    # Default to 'O' so the result always has one tag per input word
    final_tag = ["O"] * len(words)
    previous_word_idx = None
    for position, word_idx in enumerate(word_ids):
        # Skip special tokens and the continuation pieces of a word
        if word_idx is None or word_idx == previous_word_idx:
            continue
        final_tag[word_idx] = model.config.id2label[predictions[position].item()]
        previous_word_idx = word_idx
    return " ".join(final_tag)
# Entity_detection(sequences[0]) == 'O O O O O O O O B O O O O O O O O O O O O'

## Load model and create process function for stage 2

In [None]:
from transformers import pipeline
# Fine-tuned stage-2 model on Hugging Face, wrapped in a text-classification pipeline.
model_id = "your_user_name/your_model_name_stage2"
classifier = pipeline("text-classification", model=model_id) # replace your_user_name with the user name of your Hugging Face account

# Stage-2 entity classes, listed in class-index order.
# NOTE: index2tag / tag2index replace the stage-1 dictionaries of the same name.
stage2_tags = [
    'B-Application_Creation',
    'B-Application_Deposition',
    'B-Application_Mention',
    'B-Application_Usage',
    'B-OperatingSystem_Mention',
    'B-OperatingSystem_Usage',
    'B-PlugIn_Creation',
    'B-PlugIn_Deposition',
    'B-PlugIn_Mention',
    'B-PlugIn_Usage',
    'B-ProgrammingEnvironment_Mention',
    'B-ProgrammingEnvironment_Usage',
    'B-SoftwareCoreference_Deposition',
]
# Dictionaries to convert class index -> tag and tag -> class index
index2tag = dict(enumerate(stage2_tags))
tag2index = {tag_name: class_index for class_index, tag_name in index2tag.items()}
def Entity_predict(text, classifier):
  """Classify ``text`` and return the matching tag from ``index2tag``.

  The class index is read from the part of the first result's 'label' that
  follows the last underscore.
  """
  top_label = classifier(text)[0]['label']
  class_index = int(top_label.split("_")[-1])
  return index2tag[class_index]


## Predict

In [None]:
def _find_entity_spans(tags):
    """Return ``(start, end)`` pairs (end exclusive) for every entity run in ``tags``.

    A run opens at a 'B' tag, or at an 'I' tag that does not continue a run,
    and extends over the 'I' tags that directly follow it.
    """
    spans = []
    idx = 0
    while idx < len(tags):
        if tags[idx] in ('B', 'I'):
            end = idx + 1
            while end < len(tags) and tags[end] == 'I':
                end += 1
            spans.append((idx, end))
            idx = end
        else:
            idx += 1
    return spans


def Pipeline_predict(sequence):
    """Run both stages on one sentence and return its final tag string.

    Stage 1 (``Entity_detection``) marks entity spans with O/B/I tags. Each
    span is then classified by stage 2 (``Entity_predict``): the first token
    of the span receives the predicted tag and the remaining tokens the same
    tag with its leading 'B' replaced by 'I'.

    Args:
        sequence: The sentence, with tokens separated by single spaces.

    Returns:
        The final tags joined by single spaces, one tag per token.
    """
    seq_token = sequence.split(" ")
    detect = Entity_detection(sequence).split(" ")
    if len(detect) != len(seq_token):
        print("Length error")
        # Keep exactly one tag per token: drop surplus tags, pad missing ones with 'O'
        detect = detect[:len(seq_token)] + ['O'] * (len(seq_token) - len(detect))

    # All spans are collected before any tag is overwritten. An 'I' without a
    # preceding 'B' opens its own span, so a bare 'I' never reaches the output.
    for start, end in _find_entity_spans(detect):
        entity_text = " ".join(seq_token[start:end])
        classify_text = f"What is {entity_text} in sentence: {sequence}"
        entity_tag = Entity_predict(classify_text, classifier)
        # Continuation tag: same class, leading 'B' turned into 'I'
        inside_tag = entity_tag[0].replace('B', 'I') + entity_tag[1:]
        detect[start] = entity_tag
        for position in range(start + 1, end):
            detect[position] = inside_tag
    return " ".join(detect)


In [None]:
# Predict tags for every sentence of the test file and write them to predictions.txt.
# The encoding is given explicitly so reading and writing do not depend on the platform default.
with open("data/subtask1_test.data.txt", 'r', encoding="utf-8") as f:
    data = f.read()

sequences = data.split("\n")
# Drop the final element only when it is the empty string left by a trailing
# newline; a file without a trailing newline keeps its last sentence.
if sequences[-1] == "":
    sequences = sequences[:-1]
print("Number of sentence in the test set", len(sequences))

list_result = []
for idx, s in enumerate(sequences):
    tag = Pipeline_predict(s)
    list_result.append(tag)
    # Progress report every 200 sentences
    if idx % 200 == 0:
        print("[INFO] processing", idx)
        print(idx, list_result[-1])

# One line of tags per input sentence
text_result = "\n".join(list_result)
with open("predictions.txt", 'w', encoding="utf-8") as f:
    f.write(text_result)