# Stage 1: Binary classification


In [None]:
!pip install --upgrade transformers
!pip install transformers[sentencepiece]
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.27.2
Collecting datasets
  Downloading datasets-2.17.1-py3-none-any.whl (536 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m536.7/536.7 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill,

## Create dataset

In [None]:
import pandas as pd
from datasets import DatasetDict, Dataset
import torch

# Read the stage-1 CSV into its own frame so the name `data` only ever refers
# to the DatasetDict built below.
stage1_frame = pd.read_csv("data/subtask1_approach3_stage1.csv")

# Plain Python lists of the two columns, coerced to str / int.
text = list(stage1_frame['text'].astype(str))
label = list(stage1_frame['label'].astype(int))

# Wrap the columns in a single-split DatasetDict ("train").
train_dataset = Dataset.from_dict({'text': text, 'label': label})
data = DatasetDict({'train': train_dataset})
data


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 39768
    })
})

# Fine-tuning

In [None]:
# Log in to the Hugging Face Hub with an access token from your account.
# The training cell below pushes the fine-tuned model to the Hub
# (push_to_hub=True), so the login has to happen first.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
if torch.cuda.is_available() == True:
  !nvidia-smi
else:
  print("Not found GPU")
def CSI_tokenize(batch):
  """Tokenize the 'text' entries of `batch` with the global `CSI_tokenizer`.

  Padding and truncation are both enabled. `CSI_tokenizer` is looked up at
  call time, so it only has to exist before this function is first used.
  """
  texts = batch['text']
  return CSI_tokenizer(texts, padding=True, truncation=True)

Wed Feb 21 04:31:31 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P8               9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [None]:
import random
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoModel, AutoTokenizer
from sklearn.metrics import classification_report, f1_score
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
import tensorflow as tf
import torch
# Set random seed
# Seed every random number generator imported by this cell so that model
# initialisation and any shuffling are repeatable across runs.
random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)  # numpy was imported but previously left unseeded
tf.random.set_seed(random_seed)
torch.manual_seed(random_seed)
if torch.cuda.is_available():
    # Also seed the generators of all visible CUDA devices.
    torch.cuda.manual_seed_all(random_seed)

# Load pre-trained model
model_id = "xlm-roberta-base"
num_labels = 2

# Train on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

CSI_tokenizer = AutoTokenizer.from_pretrained(model_id)
CSI_model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=num_labels)
CSI_model = CSI_model.to(device)

# batch_size=None hands the whole split to CSI_tokenize as a single batch.
data_encoded = data.map(CSI_tokenize, batched=True, batch_size=None)

# Set up parameter
import inspect

batch_size = 16
# Number of full batches in the training split; used below as the evaluation
# and checkpoint interval. Clamped to 1 so a tiny dataset cannot yield an
# interval of 0 steps.
logging_steps = max(1, len(data_encoded['train']) // batch_size)
model_name = "your_model_name_approach3"

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`.
# The install cell pulls the latest release, so pick whichever keyword the
# installed TrainingArguments actually accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=25,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    disable_tqdm=False,
    seed=random_seed,
    # Evaluate and checkpoint on the same schedule, and reload the checkpoint
    # with the lowest eval_loss once training ends.
    load_best_model_at_end=True,
    eval_steps=logging_steps,
    metric_for_best_model="eval_loss",
    save_strategy="steps",
    save_steps=logging_steps,
    save_total_limit=1,
    push_to_hub=True,
    **{_eval_strategy_key: "steps"},
)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` keyword; use whichever name the installed
# Trainer accepts.
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# NOTE(review): eval_dataset is the training split itself, so early stopping
# and best-checkpoint selection track training loss rather than held-out
# performance. Consider evaluating on a validation split instead.
trainer = Trainer(
    model=CSI_model,
    args=training_args,
    train_dataset=data_encoded['train'],
    eval_dataset=data_encoded['train'],
    # Stop once eval_loss has failed to improve for 2 consecutive evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    **{_tokenizer_key: CSI_tokenizer},
)

# Fine-tune the model, then upload the result to the Hugging Face Hub.
trainer.train()
trainer.push_to_hub(commit_message="Training completed!")

# Stage 2 and Stage 3: identical to Approach 2

# Combine 3 stages to predict

## Stage 1: Binary classification

In [None]:
# Load model and create process function
from transformers import pipeline
# Hub id of the fine-tuned stage-1 (binary) checkpoint.
# replace your_user_name by user name of your hugging face account
model_id = "your_user_name/your_model_name_approach3"
classifier = pipeline(task="text-classification", model=model_id)

def Classify_predict(text, classifier):
    """Return the integer class index that `classifier` predicts for `text`.

    Args:
        text: Input passed unchanged to `classifier`.
        classifier: Callable returning a list whose first item is a mapping
            with a 'label' entry ending in the class index (e.g. 'LABEL_1').

    Returns:
        The class index parsed from the trailing digits of the top label.

    Raises:
        ValueError: If the label does not end in a digit.
    """
    import re

    pred = classifier(text)
    label = str(pred[0]['label'])
    # Parse every trailing digit rather than only the last character, so
    # multi-digit indices such as 'LABEL_10' are read correctly.
    trailing_digits = re.search(r"\d+$", label)
    if trailing_digits is None:
        raise ValueError(f"Cannot parse a class index from label {label!r}")
    return int(trailing_digits.group())


## Stage 2: Entity extraction

In [None]:
# Load model and create process function for stage 2
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer
import torch
# Tokenizer and token-classification model come from the same Hub checkpoint.
# replace your_user_name by user name of your hugging face account
ner_checkpoint = "your_user_name/your_model_name"
tokenizer = AutoTokenizer.from_pretrained(ner_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(ner_checkpoint)

def Entity_detection(text):
    """Tag every space-separated word of `text` with the stage-2 model.

    Uses the global `tokenizer` and `model`. Each word receives the label
    predicted for its first sub-token.

    Args:
        text: Sentence whose words are separated by single spaces.

    Returns:
        A space-joined string with exactly one tag per element of
        `text.split(" ")`. Words that produce no sub-token (for example the
        empty strings created by consecutive spaces, or words cut off by
        truncation) are tagged 'O'.
    """
    words = text.split(" ")

    # Encode once and use the same encoding for both the model input and the
    # word alignment. Encoding the raw text separately can yield a different
    # sub-token sequence, which misaligns predictions with word_ids.
    encoding = tokenizer(words, is_split_into_words=True, truncation=True, return_tensors="pt")
    word_ids = encoding.word_ids()

    with torch.no_grad():
        logits = model(**encoding.to(model.device)).logits
    predicted_ids = torch.argmax(logits, dim=2)[0].tolist()

    # Start from all-'O' so the output length always equals the word count.
    # Assumes 'O' is the model's outside label, as the callers of this
    # function do.
    tags = ["O"] * len(words)
    previous_word_idx = None
    for position, word_idx in enumerate(word_ids):
        # Skip special tokens (None) and non-initial sub-tokens of a word.
        if word_idx is None or word_idx == previous_word_idx:
            continue
        tags[word_idx] = model.config.id2label[predicted_ids[position]]
        previous_word_idx = word_idx
    return " ".join(tags)
# Entity_detection(sequences[0]) == 'O O O O O O O O B O O O O O O O O O O O O'

## Stage 3: Entity classification

In [None]:
#Load model and create process function for stage 3
from transformers import pipeline
model_id = "your_user_name/your_model_name"
classifier = pipeline("text-classification", model=model_id) # # replace your_user_name by user name of your hugging face account

# Dictionary to convert index2tag and tag2index
index2tag =  {0: 'B-Application_Creation', 1: 'B-Application_Deposition', 2: 'B-Application_Mention', 3: 'B-Application_Usage', 4: 'B-OperatingSystem_Mention', 5: 'B-OperatingSystem_Usage', 6: 'B-PlugIn_Creation', 7: 'B-PlugIn_Deposition', 8: 'B-PlugIn_Mention', 9: 'B-PlugIn_Usage', 10: 'B-ProgrammingEnvironment_Mention', 11: 'B-ProgrammingEnvironment_Usage', 12: 'B-SoftwareCoreference_Deposition'}
tag2index = {'B-Application_Creation': 0, 'B-Application_Deposition': 1, 'B-Application_Mention': 2, 'B-Application_Usage': 3, 'B-OperatingSystem_Mention': 4, 'B-OperatingSystem_Usage': 5, 'B-PlugIn_Creation': 6, 'B-PlugIn_Deposition': 7, 'B-PlugIn_Mention': 8, 'B-PlugIn_Usage': 9, 'B-ProgrammingEnvironment_Mention': 10, 'B-ProgrammingEnvironment_Usage': 11, 'B-SoftwareCoreference_Deposition': 12}
def Entity_predict(text, classifier):
  pred = classifier(text)
  return index2tag[int(pred[0]['label'].split("_")[-1])]


## Predict

In [None]:
def Pipeline_predict(sequence):
  if Classify_predict(sequence, classifier) == 0: # not contain entity
    temp_tag = ['O']*len(sequence.split(" "))
    tag = " ".join(temp_tag)
    return tag # tag like "O O O O ... length of sentence"
  else: #contain entity
    seq_token = sequence.split(" ")
    detect = Entity_detection(sequence).split(" ")
    if len(detect) != len(seq_token):
        print("Length error")

    list_index = []
    for idx, tok in enumerate(detect):
        if tok == 'B':
          start = idx
          end = idx + 1
          while end < len(detect):
            if detect[end] == 'I':
                end +=1
            else:
              break
          list_index.append((start, end))
    for s, e in list_index:
        entity_text = " ".join(seq_token[s:e])
        classify_text = f"What is {entity_text} in sentence: {sequence}"
        entity_tag = Entity_predict(classify_text, classifier)
        detect[s] = entity_tag

        for i in range(s+1, e):
            detect[i] = entity_tag[0].replace('B', 'I') + entity_tag[1:]
    return " ".join(detect)


In [None]:
# predict for test data
# Read the test file under its own name so the `data` DatasetDict used for
# training is not overwritten.
with open("data/subtask1_test.data.txt", 'r', encoding="utf-8") as f:
    test_text = f.read()

# One sentence per line. Drop only a genuinely empty final line, so the last
# sentence survives when the file has no trailing newline.
sequences = test_text.split("\n")
if sequences and sequences[-1] == "":
    sequences.pop()
print("Number of sentence in the test set", len(sequences))

list_result = []
for idx, s in enumerate(sequences):
    list_result.append(Pipeline_predict(s))
    # Show progress and a sample prediction every 200 sentences.
    if idx % 200 == 0:
        print("[INFO] processing", idx)
        print(idx, list_result[-1])

# One line of tags per input sentence, in the original order.
text_result = "\n".join(list_result)
with open("predictions.txt", 'w', encoding="utf-8") as f:
    f.write(text_result)