Вам нужно получить доступ к чекпоинту, который хранится у меня на Гугл диске. Нужно сделать следующее (я проверяла, это работает):
1. Go to Shared with me in Google Drive.
2. Select the folder or file you want to access.
3. Right click on it and choose Add shortcut to drive.
4. A pop-up window will appear; select My Drive, then click Add shortcut.

Ссылка на папку с чекпоинтом: https://drive.google.com/drive/folders/1mUKfgYDcfzQQglhbTowFtmCFe0lxdgFT?usp=sharing

In [None]:
# Download the reviews file (wget -nc skips the download if the file already exists)
!wget -nc https://github.com/named-entity/hse-nlp/raw/master/4th_year/Project/train_reviews.txt

In [None]:
# Mount Google Drive under /content/gdrive; checkpoint_path below points into this mount
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
# Lightning checkpoint of the trained classifier, located on the mounted Google Drive
checkpoint_path = "/content/gdrive/MyDrive/gorshkova_checkpoints/CategorySentiment.ckpt"

# Tab-separated input file with the reviews to run prediction on
pred_data_filepath = "/content/train_reviews.txt"
# File the predicted category labels are written to
out_filename = "pred_categories.txt"

#Это можно просто запустить, оно само сработает и выдаст предсказание в указанный файл

##imports and constants

In [None]:
# Aspect categories; the order matches the order of the per-task model outputs
LABEL_COLUMNS = ["Food", "Interior", "Price", "Whole", "Service"]

# Class index -> label name (the index is the position in this list)
id2label = dict(enumerate(["absence", "negative", "neutral", "positive", "both"]))

In [None]:
!pip install transformers --upgrade --quiet
!pip install pytorch_lightning --quiet

import pandas as pd
import numpy as np

from collections import defaultdict
import os
import torch
import torchmetrics
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split
import pytorch_lightning as pl

from transformers import BertModel, BertTokenizer
from transformers import get_polynomial_decay_schedule_with_warmup, get_linear_schedule_with_warmup

%load_ext tensorboard

## Dataset and Datamodule

In [None]:
class CatSentDataset(Dataset):
  """Labelled dataset of review texts used for training and validation.

  Every row of ``data`` must provide a ``"text"`` column and one integer
  column per entry of the module-level ``LABEL_COLUMNS`` list.
  """

  def __init__(self,
               data: pd.DataFrame,
               tokenizer: BertTokenizer,
               max_token_len: int = 128):
    """Store the frame, the tokenizer and the fixed sequence length.

    Args:
      data: frame with a ``"text"`` column and the ``LABEL_COLUMNS`` columns.
      tokenizer: tokenizer applied to each text in ``__getitem__``.
      max_token_len: length every example is padded / truncated to.
    """
    self.data = data
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len

  def __len__(self):
    """Return the number of rows in the underlying frame."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Tokenize row ``index`` and return it together with its labels.

    Returns:
      dict with ``text`` (the raw string), ``input_ids`` and
      ``attention_mask`` (flattened tokenizer outputs) and ``labels``
      (int64 tensor with one entry per ``LABEL_COLUMNS`` item, in order).
    """
    item_data = self.data.iloc[index]
    text = item_data["text"]

    encoding = self.tokenizer(text, max_length = self.max_token_len,
                              padding = "max_length", truncation = True,
                              return_attention_mask = True, return_tensors = "pt")

    # The row holds the text next to the labels, so it is typically an
    # object-dtype Series. Convert the labels to an int64 array explicitly
    # instead of letting torch walk the Series through the generic sequence
    # protocol (slow, and dependent on positional Series[int] lookup).
    label_values = item_data[LABEL_COLUMNS].to_numpy(dtype="int64")
    labels = torch.tensor(label_values, dtype=torch.long)

    return dict(
        text=text,
        input_ids = encoding["input_ids"].flatten(),
        attention_mask = encoding["attention_mask"].flatten(),
        labels=labels)

class CatSentPredictDataset(Dataset):
  """Unlabelled dataset of review texts used for inference.

  Only the ``"text"`` column of ``data`` is read.
  """

  def __init__(self,
               data: pd.DataFrame,
               tokenizer: BertTokenizer,
               max_token_len: int = 128):
    """Keep references to the frame, the tokenizer and the sequence length."""
    self.max_token_len = max_token_len
    self.tokenizer = tokenizer
    self.data = data

  def __len__(self):
    """Number of rows available for prediction."""
    return len(self.data)

  def __getitem__(self, index: int):
    """Return the raw text of row ``index`` with its flattened token ids and mask."""
    row = self.data.iloc[index]
    review = row["text"]

    # Pad / truncate every example to the same fixed length
    tokenizer_options = {
        "max_length": self.max_token_len,
        "padding": "max_length",
        "truncation": True,
        "return_attention_mask": True,
        "return_tensors": "pt",
    }
    encoded = self.tokenizer(review, **tokenizer_options)

    # flatten() drops the leading dimension of the "pt" tensors
    token_ids = encoded["input_ids"].flatten()
    mask = encoded["attention_mask"].flatten()

    return {
        "text": review,
        "input_ids": token_ids,
        "attention_mask": mask,
    }

In [None]:
class CatSentDataModule(pl.LightningDataModule):
  """Lightning data module providing train / validation / predict loaders.

  ``data`` is split into a train and an evaluation part in ``setup``;
  ``predict_data`` is served as-is by ``predict_dataloader``.
  """

  def __init__(self,
               data: pd.DataFrame,
               predict_data: pd.DataFrame = None,
               batch_size: int = 32,
               eval_fraction = 0.3,
               max_token_len: int = 128,
               ):
    """Store the configuration and load the ruBERT tokenizer.

    Args:
      data: labelled frame that is split into train and evaluation parts.
      predict_data: frame used for prediction; ``None`` (the default) is
        replaced by a fresh empty ``DataFrame``.
      batch_size: batch size of every data loader.
      eval_fraction: share of ``data`` that goes to the evaluation split.
      max_token_len: sequence length passed on to the datasets.
    """
    super().__init__()
    self.data = data
    # A DataFrame() default in the signature would be created once and shared
    # by every instance, so the empty frame is built per instance instead.
    self.predict_data = pd.DataFrame() if predict_data is None else predict_data
    self.batch_size = batch_size
    self.eval_fraction = eval_fraction
    self.tokenizer = BertTokenizer.from_pretrained("sberbank-ai/ruBert-base", never_split=["USER"])
    self.max_token_len = max_token_len

  def setup(self, stage: str = None):
    """Create the train / evaluation split of ``data``.

    The split is built for every ``stage`` value (``stage`` is not inspected);
    the fixed generator seed makes it reproducible across runs.
    """
    data_full = CatSentDataset(self.data,
                              self.tokenizer,
                              self.max_token_len)
    eval_len = int(self.data.shape[0] * self.eval_fraction)
    # The train part takes the remainder so both lengths sum to the dataset size
    train_len = self.data.shape[0] - eval_len
    self.data_train, self.data_eval = random_split(data_full,
                                         [train_len, eval_len],
                                         generator=torch.Generator().manual_seed(1000))

  def train_dataloader(self):
    """Shuffled loader over the training split."""
    return DataLoader(self.data_train, batch_size = self.batch_size,
                      shuffle = True, num_workers = 2)

  def val_dataloader(self):
    """Loader over the evaluation split."""
    return DataLoader(self.data_eval, batch_size = self.batch_size, num_workers = 2)

  def predict_dataloader(self):
    """Unshuffled loader over ``predict_data``, so outputs keep the row order."""
    dataset = CatSentPredictDataset(self.predict_data, self.tokenizer, self.max_token_len)
    return DataLoader(dataset, batch_size = self.batch_size, num_workers = 2, shuffle=False)

## Model

In [None]:
class CatSentClassifier(pl.LightningModule):
  """Multi-task classifier: a frozen ruBERT encoder with one linear head per task.

  All tasks share a label set of size ``num_labels``; the training objective
  is the sum of the per-task cross-entropy losses.
  """

  def __init__(self,
               num_labels: int,
               num_tasks: int,
               learning_rate : float = 2e-5):
    """Build the encoder, the per-task heads and the per-task losses.

    Args:
      num_labels: number of classes of every task.
      num_tasks: number of classification heads.
      learning_rate: learning rate handed to AdamW in ``configure_optimizers``.
    """
    super().__init__()

    # Record the constructor arguments with the module (Lightning stores them
    # in checkpoints, which is what lets load_from_checkpoint run without args)
    self.save_hyperparameters()

    self.num_labels = num_labels  # same labels for every task: [0, 4]
    self.num_tasks = num_tasks
    self.learning_rate = learning_rate

    self.bert = BertModel.from_pretrained("sberbank-ai/ruBert-base",
                                          output_attentions=True,
                                          output_hidden_states=True,
                                          )
    # The encoder is frozen: only the linear heads below receive gradients
    for param in self.bert.parameters():
      param.requires_grad = False

    self.classifiers = nn.ModuleList([nn.Linear(self.bert.config.hidden_size, self.num_labels) for _ in range(self.num_tasks)])

    self.criterions = nn.ModuleList([nn.CrossEntropyLoss() for _ in range(self.num_tasks)])

    # Per-batch validation outputs, consumed and cleared in on_validation_epoch_end
    self._val_outputs = []

  def forward(self, input_ids, attention_mask, labels=None):
    """Return the logits of every head stacked along dim 0.

    The result has one slice per task: ``(num_tasks, batch, num_labels)``.
    ``labels`` is accepted for interface compatibility and is not used.
    """
    bert_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
    h_cls = bert_output.pooler_output
    outputs = [head(h_cls) for head in self.classifiers]
    return torch.stack(outputs, dim=0)

  def _compute_loss(self, batch):
    """Run the model on ``batch``; return (summed loss, logits, task-major labels)."""
    # Transpose the collated labels so that dim 0 indexes tasks, like the logits
    labels = batch["labels"].T
    outputs = self(batch["input_ids"], batch["attention_mask"])
    losses = [criterion(output, label)
              for criterion, output, label in zip(self.criterions, outputs, labels)]
    return sum(losses), outputs, labels

  def training_step(self, batch, batch_idx):
    """Compute and log the summed per-task loss for one training batch."""
    total_loss, outputs, labels = self._compute_loss(batch)
    self.log("train_loss", total_loss, prog_bar=True, logger=True)
    return {"loss": total_loss, "predictions": outputs.transpose(0,1), "labels": labels.transpose(0,1)}

  def validation_step(self, batch, batch_idx):
    """Log the validation loss and keep the batch outputs for the epoch accuracy."""
    total_loss, outputs, labels = self._compute_loss(batch)
    self.log("val_loss", total_loss, prog_bar=True, logger=True, sync_dist=True,
             on_step=True, on_epoch=True)
    step_output = {"predictions" : outputs.transpose(0,1), "labels" : labels.transpose(0,1)}
    self._val_outputs.append(step_output)
    return step_output

  def on_validation_epoch_end(self):
    """Log the accuracy over all (example, task) pairs seen in this validation epoch.

    Replaces the ``validation_epoch_end(outputs)`` hook, which Lightning 2.0
    removed; the outputs are collected by ``validation_step`` instead.
    """
    if not self._val_outputs:
      return
    y_pred = torch.cat([out["predictions"] for out in self._val_outputs], dim=0)
    y_true = torch.cat([out["labels"] for out in self._val_outputs], dim=0)
    # Release the stored batches before the next validation run
    self._val_outputs.clear()

    y_pred = y_pred.softmax(dim=2).argmax(dim=2).flatten()
    y_true = y_true.flatten()
    accuracy = (torch.eq(y_pred, y_true).sum()/y_true.shape[0]).item()
    self.log("val_accuracy", accuracy, logger=True)

  def predict_step(self, batch, batch_idx):
    """Return the predicted class index of every task, one row per example."""
    input_ids = batch["input_ids"]
    attention_mask = batch["attention_mask"]

    outputs = self(input_ids, attention_mask)
    outputs = outputs.softmax(dim=2)
    # argmax yields (num_tasks, batch); transpose to (batch, num_tasks)
    return outputs.argmax(dim=2).T

  def configure_optimizers(self):
    """Return an AdamW optimizer over the module parameters."""
    optimizer = torch.optim.AdamW(self.parameters(), lr=self.learning_rate)
    return optimizer



## prediction

In [None]:
# Read the "<text_id>\t<text>" lines of the input file into a DataFrame.
review_ids = []
review_texts = []
# Explicit encoding so the result does not depend on the platform default
with open(pred_data_filepath, encoding="utf-8") as f:
  for line in f:
    line = line.strip()
    if not line:
      continue  # skip blank lines instead of failing on the unpacking below
    # Split on the first tab only, so a tab inside the review text is kept
    review_id, review_text = line.split("\t", 1)
    review_ids.append(review_id)
    review_texts.append(review_text)
pred_data = pd.DataFrame({"text_id": review_ids, "text": review_texts})
pred_data.head(2)

In [None]:
# Restore the trained classifier from the checkpoint file on the mounted drive
model = CatSentClassifier.load_from_checkpoint(checkpoint_path)

In [None]:
# Batch size and sequence length used for inference
BATCH_SIZE = 64
MAX_TOKEN_LEN = 128

# Trainer with default settings
pred_trainer = pl.Trainer()

# The same frame is passed both as `data` and as `predict_data`
pred_data_module = CatSentDataModule(
    data=pred_data,
    predict_data=pred_data,
    batch_size=BATCH_SIZE,
    max_token_len=MAX_TOKEN_LEN,
)
pred_data_module.setup("predict")

In [None]:
# trainer.predict returns one tensor per batch (rows = examples, columns = tasks)
predictions = pred_trainer.predict(model, pred_data_module.predict_dataloader())
# Join the batches along the example dimension in one call instead of
# unpacking every row into a Python list and stacking them again
preds = torch.cat(predictions, dim=0)
preds.size()

In [None]:
# Write one "<text_id>\t<category>\t<label>" line per (review, category) pair
with open(out_filename, "w") as outf:
  for text_id, pred in zip(pred_data.text_id, preds):
    # Map the predicted class indices of this review to their label names
    label_names = (id2label[label_id.item()] for label_id in pred)
    outf.writelines(
        "\t".join([text_id, category, label_name]) + "\n"
        for category, label_name in zip(LABEL_COLUMNS, label_names)
    )