In [None]:
!git clone https://github.com/yuliya1324/Aspect_Sent_project.git

In [None]:
!pip install transformers
!pip install sentencepiece
!pip install wandb

In [3]:
import wandb
# Log in to Weights & Biases (the recorded run prompted interactively for an API key).
wandb.login()

ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW, get_linear_schedule_with_warmup, AutoModel
import torch
import numpy as np
from torch.utils.data import Dataset, DataLoader
import re
from torch import nn, optim
from sklearn.metrics import f1_score, classification_report, accuracy_score

In [5]:
import random

def set_random_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every GPU, not just the current one; it is a
    # no-op when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning can pick different kernels from run to run, which defeats
    # the deterministic flag above.
    torch.backends.cudnn.benchmark = False

In [6]:
# Seed all random number generators before any data or model code runs.
RANDOM_SEED = 1234
set_random_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [9]:
# Read the training and validation splits; the first CSV column is used as the index.
_SPLIT_FILES = (
    '/content/Aspect_Sent_project/data/cats_train.csv',
    '/content/Aspect_Sent_project/data/cats_val.csv',
)
df_train, df_val = (pd.read_csv(path, index_col=0) for path in _SPLIT_FILES)

In [None]:
# Preview the training frame (the recorded output shows the middle rows elided).
df_train

Unnamed: 0,idx,text,Food,Interior,Price,Whole,Service
0,3976,"День 8-го марта прошёл, можно и итоги подвести...",positive,positive,absence,positive,positive
1,30808,Отмечали в этом ресторане день рождение на пер...,positive,positive,positive,positive,positive
2,14031,Хочу поделиться своим впечатлением от посещени...,absence,positive,absence,positive,positive
3,2495,Добрый день! Были вчера с друзьями в этом кафе...,positive,positive,absence,positive,positive
4,38835,Отметили с мужем годовщину свадьбы 6 ноября в ...,both,absence,negative,negative,positive
...,...,...,...,...,...,...,...
279,6962,Очаровательная Виктория просила об отзыве и я ...,both,absence,absence,positive,both
280,9878,Пришли в данное заведение 4 июня 2014 года пок...,negative,absence,absence,negative,negative
281,28258,Заехали с мужем поужинать в пятницу ( 17.01.14...,positive,positive,neutral,positive,positive
282,33043,Мне так там нравитсяяяя!!!!!!!!! Интерьер модн...,positive,positive,absence,positive,both


In [10]:
class CatDataset(Dataset):
  """Dataset of reviews with one sentiment label per aspect category.

  Each item is a dict holding the review id and raw text, the tokenized
  ``input_ids`` / ``attention_mask`` (padded or truncated to ``max_len``) and
  one integer class tensor for each of the five aspect columns.

  Args:
    data: Table with the columns ``text``, ``idx``, ``Food``, ``Interior``,
      ``Price``, ``Whole`` and ``Service``; rows are accessed positionally.
    tokenizer: Object exposing ``encode_plus``.
    max_len: Length passed to the tokenizer as ``max_length``.
  """

  # Aspect columns, in the order they appear in every returned item.
  _ASPECTS = ("Food", "Interior", "Price", "Whole", "Service")

  def __init__(self, data, tokenizer, max_len):
    self.texts = data["text"]
    # Keep each aspect column as an attribute named after the column.
    for aspect in self._ASPECTS:
      setattr(self, aspect, data[aspect])
    self.id = data["idx"]
    self.tokenizer = tokenizer
    self.max_len = max_len
    # String label -> integer class id.
    self.class_to_int = dict(absence=0, positive=1, negative=2, both=3, neutral=4)

  def __len__(self):
    """Return the number of reviews."""
    return len(self.texts)

  def __getitem__(self, item):
    """Return the tokenized review and its label tensors at position ``item``."""
    review = self.texts.iloc[item]
    # Look the labels up first so an unknown label fails before tokenization.
    class_ids = [
      self.class_to_int[getattr(self, aspect).iloc[item]]
      for aspect in self._ASPECTS
    ]
    review_id = self.id.iloc[item]

    tokens = self.tokenizer.encode_plus(
      review,
      add_special_tokens=True,
      max_length=self.max_len,
      return_token_type_ids=False,
      padding='max_length',
      truncation=True,
      return_attention_mask=True,
      return_tensors='pt',
    )

    sample = {
      'id': review_id,
      'text': review,
      # The tokenizer returns a leading batch dimension; drop it.
      'input_ids': tokens['input_ids'].flatten(),
      'attention_mask': tokens['attention_mask'].flatten(),
    }
    for aspect, class_id in zip(self._ASPECTS, class_ids):
      sample[aspect] = torch.tensor(class_id, dtype=torch.long)
    return sample

In [11]:
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False):
  """Wrap a review table in a ``CatDataset`` and return a ``DataLoader`` over it.

  Args:
    df: Table of reviews and aspect labels, passed to ``CatDataset`` as ``data``.
    tokenizer: Tokenizer handed to ``CatDataset``.
    max_len: Token length each review is padded or truncated to.
    batch_size: Number of reviews per batch.
    shuffle: Whether the loader reshuffles the data every epoch. Defaults to
      ``False``, which keeps the previous behaviour; pass ``True`` for a
      training loader.

  Returns:
    A ``DataLoader`` yielding batches of ``CatDataset`` items.
  """
  dataset = CatDataset(data=df, tokenizer=tokenizer, max_len=max_len)

  return DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=shuffle,
    num_workers=1
  )

In [12]:
# Token length passed to the tokenizer as max_length (reviews are padded/truncated to it).
MAX_LEN = 512
# Number of reviews per DataLoader batch.
BATCH_SIZE = 4
# Checkpoint id loaded by both AutoTokenizer and AutoModel.
MODEL_NAME = 'sberbank-ai/ruRoberta-large'

In [13]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.81M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

In [14]:
# Build the batch iterators for the training and validation splits.
# NOTE(review): no shuffling is requested here, so training batches are presumably
# served in DataFrame order on every epoch - confirm this is intended.
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_val, tokenizer, MAX_LEN, BATCH_SIZE)

In [15]:
class CatClassifier(nn.Module):
    """Pretrained encoder with one linear classification head per aspect.

    The encoder's token embeddings are mean-pooled over non-padding positions,
    passed through dropout and fed to five independent linear layers
    (``Food``, ``Interior``, ``Price``, ``Whole``, ``Service``).

    Args:
        n_classes: Number of output classes of every head.
    """

    # Head names, in the order they are created and returned by ``forward``.
    _ASPECTS = ("Food", "Interior", "Price", "Whole", "Service")

    def __init__(self, n_classes):
        super().__init__()
        self.bert = AutoModel.from_pretrained(MODEL_NAME).to(device)
        self.drop = nn.Dropout(p=0.5)
        hidden_size = self.bert.config.hidden_size
        # Register each head under its aspect name so parameter names stay
        # "<Aspect>.weight" / "<Aspect>.bias".
        for aspect in self._ASPECTS:
            setattr(self, aspect, nn.Linear(hidden_size, n_classes).to(device))

    def forward(self, batch):
        """Return a dict mapping each aspect name to its logits.

        Args:
            batch: Mapping with ``input_ids`` and ``attention_mask`` tensors.
        """
        ids = batch["input_ids"].to(device)
        mask = batch["attention_mask"].to(device)
        encoded = self.bert(input_ids=ids, attention_mask=mask)
        pooled = self.drop(self.mean_pooling(encoded, mask))
        return {aspect: getattr(self, aspect)(pooled) for aspect in self._ASPECTS}

    @staticmethod
    def mean_pooling(model_output, attention_mask):
        """Average token embeddings over the positions selected by the mask.

        Args:
            model_output: Sequence whose first element holds the token embeddings.
            attention_mask: Mask with one entry per token; zeros are ignored.

        Returns:
            The masked mean along dimension 1 of the token embeddings.
        """
        hidden_states = model_output[0]
        # Broadcast the mask across the embedding dimension.
        weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
        totals = (hidden_states * weights).sum(1)
        # Clamp so a fully masked row does not divide by zero.
        counts = weights.sum(1).clamp(min=1e-9)
        return totals / counts

In [16]:
# Five output classes per head, matching the five labels in CatDataset.class_to_int.
model = CatClassifier(5)

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of the model checkpoint at sberbank-ai/ruRoberta-large were not used when initializing RobertaModel: ['lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at sberbank-ai/ruRoberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to b

In [17]:
# Freeze every parameter whose name does not match the pattern below. What stays
# trainable: the five aspect heads, the pooler, and any parameter whose name
# contains "20", "21", "22" or "23" (presumably the last four encoder layers -
# the pattern is unanchored, so verify against model.named_parameters()).
trainable_name = re.compile("Service|Whole|Price|Food|Interior|pooler|23|22|21|20")
for param_name, tensor in model.named_parameters():
    if trainable_name.search(param_name) is None:
        tensor.requires_grad = False

In [18]:
EPOCHS = 10

# The scheduler is stepped once per training batch, so the schedule spans
# every batch of every epoch.
total_steps = EPOCHS * len(train_data_loader)

# Only parameters that are still trainable are handed to the optimizer.
trainable_params = (p for p in model.parameters() if p.requires_grad)
optimizer = AdamW(
  trainable_params,
  lr=1e-5,
  correct_bias=False,
  weight_decay=0.06,
)

# Linear learning-rate schedule without a warm-up phase.
scheduler = get_linear_schedule_with_warmup(
  optimizer, num_warmup_steps=0, num_training_steps=total_steps
)
loss_fn = nn.CrossEntropyLoss().to(device)



In [19]:
def count_loss(batch, pred):
    """Return the sum of the per-aspect losses for one batch.

    Args:
        batch: Mapping from aspect name to the target class tensor.
        pred: Mapping from aspect name to the predicted logits.

    Returns:
        ``loss_fn`` applied to every aspect, added up in the order
        Food, Interior, Price, Whole, Service.
    """
    per_aspect = [
        loss_fn(pred[aspect], batch[aspect].to(device))
        for aspect in ("Food", "Interior", "Price", "Whole", "Service")
    ]
    return sum(per_aspect)

In [20]:
def compute_metrics(batch, pred, sub="train"):
    """Compute per-aspect accuracy and their unweighted mean.

    Args:
        batch: Mapping from aspect name to the true class ids.
        pred: Mapping from aspect name to logits; the predicted class is the
            index of the largest value along dimension 1.
        sub: Prefix for the metric names (e.g. ``"train"`` or ``"eval"``).

    Returns:
        Dict with ``"{sub}/accuracy"`` (mean over the five aspects) followed by
        ``"{sub}/accuracy_<Aspect>"`` for each aspect.
    """
    aspects = ("Food", "Interior", "Price", "Whole", "Service")
    predicted = {name: torch.max(pred[name], dim=1).indices for name in aspects}
    accuracy = {
        name: accuracy_score(batch[name], predicted[name].cpu())
        for name in aspects
    }

    metrics = {f"{sub}/accuracy": np.mean([accuracy[name] for name in aspects])}
    for name in aspects:
        metrics[f"{sub}/accuracy_{name}"] = accuracy[name]
    return metrics

In [21]:
def train_epoch(
  model, 
  data_loader, 
  optimizer, 
  device, 
  scheduler, 
):
  """Run one training pass over ``data_loader``.

  For every batch the loss and accuracy metrics are logged to wandb, then the
  optimizer and the learning-rate scheduler each take one step.

  Args:
    model: Model called as ``model(batch)``; switched to training mode.
    data_loader: Iterable of training batches.
    optimizer: Optimizer stepped once per batch.
    device: Unused here; kept so existing call sites keep working.
    scheduler: Learning-rate scheduler stepped once per batch.
  """
  model = model.train()

  for batch in data_loader:
    # Clear gradients before the forward/backward pass rather than after the
    # step: if a previous run was interrupted between backward() and the
    # clean-up, the stale gradients would otherwise be added to this step.
    optimizer.zero_grad()

    pred = model(batch)
    loss = count_loss(batch, pred)
    score = compute_metrics(batch, pred)
    score["train/loss"] = loss.item()
    wandb.log(score)

    loss.backward()
    optimizer.step()
    scheduler.step()

def eval_model(model, data_loader, device):
    """Evaluate ``model`` on ``data_loader`` and log the metrics to wandb.

    Targets and logits of all batches are gathered per aspect, accuracy is
    computed once over the whole set and logged under the ``eval/`` prefix
    together with the mean batch loss.

    Args:
        model: Model called as ``model(batch)``; switched to eval mode.
        data_loader: Iterable of evaluation batches.
        device: Unused here; kept so existing call sites keep working.
    """
    model = model.eval()

    aspects = ("Food", "Interior", "Price", "Whole", "Service")
    losses = []
    # Collect per-batch tensors in lists and concatenate once at the end. This
    # keeps the integer dtype of the labels (concatenating onto an empty float
    # tensor promoted them to float) and avoids re-copying the accumulated
    # tensor for every batch.
    true_batches = {k: [] for k in aspects}
    pred_batches = {k: [] for k in aspects}

    with torch.no_grad():
        for batch in data_loader:
            pred = model(batch)
            losses.append(count_loss(batch, pred).item())

            for k in aspects:
                true_batches[k].append(batch[k].cpu())
                pred_batches[k].append(pred[k].cpu())

    true = {k: torch.cat(chunks) for k, chunks in true_batches.items()}
    predictions = {k: torch.cat(chunks) for k, chunks in pred_batches.items()}

    score = compute_metrics(true, predictions, "eval")
    score["eval/loss"] = np.mean(losses)
    wandb.log(score)

In [22]:
wandb.init(
    project="Sentiment_Aspect",
    name=f"experiment_{2}",
    )

# Always close the wandb run, even when training raises or the cell is
# interrupted; otherwise the run stays open in the kernel.
try:
    for epoch in range(EPOCHS):
        train_epoch(
            model, 
            train_data_loader, 
            optimizer, 
            device, 
            scheduler, 
            )
        eval_model(model, val_data_loader, device)
        # torch.save(model, f"checkpoints/roberta_sent_{epoch}")
except BaseException:
    # Mark the run as failed, then let the original error propagate.
    wandb.finish(exit_code=1)
    raise
else:
    wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33mjulia_kor[0m. Use [1m`wandb login --relogin`[0m to force relogin


0,1
eval/accuracy,▁▄▄▅██▇██▆
eval/accuracy_Food,▁▁▁▆█▆▆██▆
eval/accuracy_Interior,▅█▅▅▅▅▁▁▁▁
eval/accuracy_Price,▁▁▁▂▅█▇██▇
eval/accuracy_Service,▁▆▇█▇▆▇▇▆▅
eval/accuracy_Whole,▁▄▄▄██▇▆▇▆
eval/loss,█▄▃▂▁▁▁▁▁▁
train/accuracy,▂▁▆▆▁▆▃▄▆▂▅▅▄▂▄▄▄▄▃▅▇▇▆▆█▃▅▆▆▇▆▇▆▅▇▇▅▃▆▅
train/accuracy_Food,▆▃█▆▁▆█▆█▃▆▆█▁▆█▆▃▁█▆█▆██▆▃▆█▆▆▆█▁█▆▁▁█▆
train/accuracy_Interior,▃▁██▆▆▃▆▆▁▆▆█▁█▃██▆▃▆███▆▁▆▃▆▆█▆▃▆▆█▃▆▆▁

0,1
eval/accuracy,0.78571
eval/accuracy_Food,0.78571
eval/accuracy_Interior,0.71429
eval/accuracy_Price,0.82143
eval/accuracy_Service,0.71429
eval/accuracy_Whole,0.89286
eval/loss,2.35935
train/accuracy,0.93333
train/accuracy_Food,1.0
train/accuracy_Interior,1.0


In [23]:
# Class id -> label string; the order mirrors CatDataset.class_to_int.
labels = ["absence", "positive", "negative", "both", "neutral"]

def inference(tokenizer, model, text):
    """Predict the sentiment label of every aspect for a single review.

    Args:
        tokenizer: Tokenizer exposing ``encode_plus``.
        model: Model returning a mapping from aspect name to logits.
        text: Review text.

    Returns:
        Dict mapping ``Food``, ``Interior``, ``Price``, ``Whole`` and
        ``Service`` to the predicted label string.
    """
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=MAX_LEN,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    model = model.eval()
    with torch.no_grad():
        logits = model(encoded)

    aspects = ("Food", "Interior", "Price", "Whole", "Service")
    # Index of the largest logit per aspect (the batch holds a single review).
    best = {name: torch.max(logits[name], dim=1).indices for name in aspects}
    return {name: labels[best[name].item()] for name in aspects}

In [24]:
# Predict every review in the dev file and write one
# "<id>\t<aspect>\t<label>" line per aspect.
with open("/content/dev_pred_cats.txt", "w", encoding="utf-8") as file_write:
    with open("/content/Aspect_Sent_project/data/dev_reviews.txt", encoding="utf-8") as file_read:
        for line in file_read:
            # Strip only the newline: slicing off the last character would
            # drop real text when the final line has no trailing newline.
            line = line.rstrip("\n")
            if not line:
                # Skip blank lines instead of failing on the unpacking below.
                continue
            # Split on the first tab only, so tabs inside the review are kept.
            idx, text = line.split("\t", 1)
            pred = inference(tokenizer, model, text)
            for cat, label in pred.items():
                file_write.write(f"{idx}\t{cat}\t{label}\n")

In [None]:
# Serialize the whole model object (not just its state dict) to disk.
# NOTE(review): the target is under /content/drive, which presumably requires
# Google Drive to be mounted first - no mount call is visible in this notebook.
torch.save(model, "/content/drive/MyDrive/Colab Notebooks/NLP_project/model_cat.pt")