In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!pip install transformers
!pip install sentencepiece
!pip install wandb

In [17]:
import os
import re
from collections import defaultdict
import pandas as pd
import torch
import random
import numpy as np
import json

from transformers import (
    AutoTokenizer, 
    RobertaTokenizerFast, 
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer
)

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset

import nltk
nltk.download('punkt')

import wandb
wandb.login()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
%env WANDB_PROJECT=Sentiment_Aspect

env: WANDB_PROJECT=Sentiment_Aspect


In [4]:
import random

def set_random_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    random.seed(seed)

In [5]:
RANDOM_SEED = 1234
set_random_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [6]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NLP_project/train_aspects.csv', index_col=0)

In [7]:
MODEL_NAME = 'sberbank-ai/ruRoberta-large'

In [8]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

In [9]:
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [12]:
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=4)

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.bias', 'classifier.weight']
You should prob

In [14]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2).reshape((-1,))
    labels = labels.reshape((-1,))

    true_predictions = [p for (p, l) in zip(predictions, labels) if l != -100]
    true_labels = [l for (p, l) in zip(predictions, labels) if l != -100]

    accuracy = accuracy_score(true_labels, true_predictions)
    return {
        "accuracy": accuracy,
    }

In [30]:
label_all_tokens = True

class CustomDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.class_to_int = {"positive": 0, "negative": 1, "neutral": 2, "both": 3}

    def __len__(self):
        return len(self.dataset["tokens"])

    def __getitem__(self, idx):
        tokenized_inputs = tokenizer(self.dataset["tokens"].iloc[idx], truncation=True, is_split_into_words=True, max_length=512)
        label = self.dataset["sentiment"].iloc[idx]
        cls = self.dataset["class"].iloc[idx]
        word_ids = tokenized_inputs.word_ids()
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None or cls[word_idx] == "no":
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(self.class_to_int[label[word_idx]])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(self.class_to_int[label[word_idx]] if label_all_tokens else -100)
            previous_word_idx = word_idx
        tokenized_inputs["labels"] = label_ids
        return tokenized_inputs

In [31]:
df = df.groupby("idx").agg(list)

In [32]:
df_train, df_test = train_test_split(df, test_size=0.1, random_state=RANDOM_SEED)

In [33]:
ds_train = CustomDataset(df_train)
ds_dev = CustomDataset(df_test)

In [22]:
for name,param in model.named_parameters():
    if not re.search("classifier|23|22|21|20", name):
        param.requires_grad = False

In [35]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="wandb",
    
    run_name="aspects/experiment_1",
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

PyTorch: setting up devices
***** Running training *****
  Num examples = 255
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 1280
  Number of trainable parameters = 50388996
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.8179,0.712275,0.725823
2,0.7119,0.623854,0.767697
3,0.7793,0.60452,0.788634
4,0.3791,0.685268,0.778664
5,0.4422,0.578783,0.771685
6,0.4169,0.607803,0.770688
7,0.2886,0.656799,0.781655
8,0.4321,0.593664,0.785643
9,0.3314,0.635931,0.788634
10,0.355,0.642019,0.788634


***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Model weights saved in ./results/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
***** Running Evaluation *****
  Num examples = 29
  Batch size = 2
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Model weights saved in ./results/checkpoint-1000/pytorch_model.bin
tokenizer

TrainOutput(global_step=1280, training_loss=0.5571612227708101, metrics={'train_runtime': 246.8729, 'train_samples_per_second': 10.329, 'train_steps_per_second': 5.185, 'total_flos': 1111359632076792.0, 'train_loss': 0.5571612227708101, 'epoch': 10.0})

In [None]:
labels = ["positive", "negative", "neutral", "both"]

def inference(tokens, classes, model, tokenizer):
    model.eval()
    tokenized = tokenizer(tokens, truncation=True, is_split_into_words=True, return_tensors="pt", max_length=512)
    pred = model(tokenized["input_ids"].to(device), attention_mask=tokenized["attention_mask"].to(device)).logits.argmax(dim=2)[0]
    word_ids = tokenized.word_ids()
    res = []
    prev = None
    for k, j in enumerate(word_ids):
        if j != None and prev != j:
            res.append(labels[pred[k].item()])
        prev = j
    ds["tags"][i] = [4 if t == 1 else 0 for t in ds["tags"][i]]
    for k, new in enumerate(res):
        if new == 1 and ds["tags"][i][k] == 0:
            ds["tags"][i][k] = 1
        elif new == 2 and ds["tags"][i][k] == 0:
            ds["tags"][i][k] = 2
        elif new == 3 and ds["tags"][i][k] == 0 and re.search("^[xvil]+$", ds["tokens"][i][k]):
            ds["tags"][i][k] = 3
    return ds