# Install & imports

In [1]:
# Authenticate this kernel with Weights & Biases up front; the Trainer
# configured further down uses report_to="wandb" for its metric logging.
import wandb
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mjulia_kor[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [42]:
import pandas as pd
import numpy as np
import re
from dataclasses import dataclass
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn, optim

import transformers
from transformers import (
    AutoTokenizer, 
    RobertaTokenizerFast, 
    DataCollatorForTokenClassification,
    AutoModelForTokenClassification, 
    TrainingArguments, 
    Trainer
)

from string import punctuation

In [3]:
import random

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus all CUDA devices), and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all GPUs rather than only the current one; this call is silently
    # ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes convolution algorithms per run, which can
    # make results differ between otherwise identical executions.
    torch.backends.cudnn.benchmark = False

In [4]:
# Notebook-wide seed, applied before the tokenizer, model and datasets are
# created in the cells below.
RANDOM_SEED = 1234
set_random_seed(RANDOM_SEED)
# First CUDA device when one is available, otherwise the CPU.
# NOTE(review): `device` is not referenced by any cell visible in this
# notebook section; confirm whether it is still needed.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [5]:
# Set the WANDB_PROJECT environment variable for this kernel so that runs
# reported to wandb are filed under the "Sentiment_Aspect" project.
%env WANDB_PROJECT=Sentiment_Aspect

env: WANDB_PROJECT=Sentiment_Aspect


# Train

In [6]:
# Load the train / validation splits; the first CSV column is used as the
# DataFrame index. Later cells rely on the "idx", "tokens" and "class" columns.
df_train = pd.read_csv('data/aspects_train.csv', index_col=0)
df_val = pd.read_csv('data/aspects_val.csv', index_col=0)

In [7]:
class AspectDataset(Dataset):
    """Token-classification dataset that tokenizes one row per item.

    Every row of ``dataset`` holds a list of words in the ``"tokens"`` column
    and a parallel list of BIO tag strings in the ``"class"`` column. Rows are
    tokenized lazily in ``__getitem__`` and the word-level tags are projected
    onto the resulting sub-word tokens.

    Args:
        dataset: pandas DataFrame with ``"tokens"`` and ``"class"`` columns
            (rows are read positionally through ``.iloc``).
        tokenizer: Callable tokenizer accepting pre-split words; its result
            must expose ``word_ids()`` and support item assignment.
        label_all_tokens: When True (default, the original behavior) every
            sub-word token receives the tag of the word it belongs to. When
            False only the first sub-word token of each word is tagged and
            the remaining ones are set to ``IGNORE_INDEX``.
    """

    # Tag order defines the integer class ids (index == id).
    LABELS = (
        "O",
        "B-Food",
        "I-Food",
        "B-Interior",
        "I-Interior",
        "B-Price",
        "I-Price",
        "B-Whole",
        "I-Whole",
        "B-Service",
        "I-Service",
    )
    # Label id for positions that carry no tag (special tokens); it is also
    # the value compute_metrics filters out.
    IGNORE_INDEX = -100

    def __init__(self, dataset, tokenizer, label_all_tokens=True):
        self.dataset = dataset
        self.cat_to_int = {label: i for i, label in enumerate(self.LABELS)}
        self.tokenizer = tokenizer
        self.label_all_tokens = label_all_tokens

    def __len__(self):
        """Return the number of rows (sequences) in the dataset."""
        return len(self.dataset["tokens"])

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and attach per-token ``"labels"``.

        Returns:
            The tokenizer output for the row with an extra ``"labels"`` entry:
            a list of integer class ids aligned with the sub-word tokens.

        Raises:
            KeyError: If a tag in the row is not one of ``LABELS``.
        """
        tokenized_inputs = self.tokenizer(
            self.dataset["tokens"].iloc[idx],
            truncation=True,
            is_split_into_words=True,
            max_length=512,
        )
        word_labels = self.dataset["class"].iloc[idx]

        label_ids = []
        previous_word_idx = None
        for word_idx in tokenized_inputs.word_ids():
            if word_idx is None:
                # Special tokens do not map to any input word.
                label_ids.append(self.IGNORE_INDEX)
            elif self.label_all_tokens or word_idx != previous_word_idx:
                label_ids.append(self.cat_to_int[word_labels[word_idx]])
            else:
                # Continuation sub-token of an already tagged word.
                label_ids.append(self.IGNORE_INDEX)
            previous_word_idx = word_idx

        tokenized_inputs["labels"] = label_ids
        return tokenized_inputs

In [8]:
# Identifier of the pretrained checkpoint loaded by both the tokenizer and
# the token-classification model below.
MODEL_NAME = 'sberbank-ai/ruRoberta-large'

In [9]:
# AspectDataset calls the tokenizer on pre-split words
# (is_split_into_words=True); RoBERTa-style tokenizers need
# add_prefix_space=True for that input mode.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)

In [10]:
# Collator handed to the Trainer below to assemble the variable-length
# tokenized items into batches.
# NOTE(review): presumably pads "labels" together with the inputs; confirm
# the pad value against the transformers documentation.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [11]:
# num_labels=11 matches the 11 tags enumerated in AspectDataset.cat_to_int.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, num_labels=11)

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaForTokenClassification: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should prob

In [12]:
def compute_metrics(p):
    """Compute token-level accuracy for an evaluation pass.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token
            class scores with the class dimension on axis 2; ``labels`` holds
            integer class ids, with -100 marking positions to skip.

    Returns:
        dict with a single ``"accuracy"`` entry computed over all positions
        whose label is not -100.
    """
    scores, gold = p
    flat_pred = np.argmax(scores, axis=2).ravel()
    flat_gold = gold.ravel()

    # Keep only positions that carry a real label.
    keep = flat_gold != -100
    return {"accuracy": accuracy_score(flat_gold[keep], flat_pred[keep])}

In [13]:
# Collapse the rows sharing an "idx" value into a single row whose remaining
# columns hold the collected values as Python lists.
# Note: this rebinds df_train / df_val, so reload the CSVs before running
# this cell a second time.
df_train, df_val = (
    frame.groupby("idx").agg(list) for frame in (df_train, df_val)
)

In [14]:
# Wrap the grouped frames in lazily tokenizing datasets that share one tokenizer.
ds_train, ds_dev = (
    AspectDataset(frame, tokenizer) for frame in (df_train, df_val)
)

In [15]:
# Freeze every parameter except the classification head and encoder layers
# 20-23 (presumably the top four layers of this checkpoint; confirm against
# the model config).
# The layer numbers are anchored to "encoder.layer.<n>." so the digits cannot
# accidentally match elsewhere in a parameter name.
TRAINABLE_PARAMS = re.compile(r"classifier|encoder\.layer\.2[0-3]\.")

for name, param in model.named_parameters():
    if TRAINABLE_PARAMS.search(name) is None:
        param.requires_grad = False

In [16]:
# Fine-tuning configuration: evaluate after every epoch and log to wandb
# every 10 optimization steps.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_steps=10,
    report_to="wandb",
    run_name="aspects_cat/experiment_3",
    # Reuse the notebook-wide seed: the Trainer seeds the RNGs from this
    # argument and would otherwise fall back to its own default value.
    seed=RANDOM_SEED,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.1,
    warmup_ratio=0.06,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_dev,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Runs the full fine-tuning loop; the returned TrainOutput is displayed as
# the cell result.
trainer.train()

***** Running training *****
  Num examples = 227
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 290
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"




Epoch,Training Loss,Validation Loss,Accuracy
1,0.8159,0.55453,0.85223
2,0.4292,0.353754,0.892247
3,0.3503,0.311179,0.907045
4,0.3059,0.284415,0.911005
5,0.2795,0.286117,0.907045
6,0.25,0.275298,0.911421
7,0.2474,0.267144,0.91288
8,0.2435,0.261825,0.916007
9,0.2411,0.259323,0.915381
10,0.2291,0.253737,0.916424


***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8
***** Running Evaluation *****
  Num examples = 28
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=290, training_loss=0.37273594510966335, metrics={'train_runtime': 196.5783, 'train_samples_per_second': 11.548, 'train_steps_per_second': 1.475, 'total_flos': 1208319130845414.0, 'train_loss': 0.37273594510966335, 'epoch': 10.0})

In [17]:
# Serialize the whole fine-tuned model object (not just its state_dict) to disk.
# NOTE(review): a fully pickled model can only be loaded where compatible
# class definitions are importable; model.save_pretrained(...) or saving
# model.state_dict() would be more portable. Left unchanged because other
# code may expect this exact file format.
torch.save(model, "model_aspect_cat.pt")