In [9]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from torch import nn
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import datasets, transforms
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertForSequenceClassification

# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes, so edits to local project code are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Parent of the current working directory; it is put on the import path so
# that project packages (e.g. the `src` package imported further down) resolve.
repo_dir = Path().resolve().parent

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(repo_dir) not in sys.path:
    sys.path.append(str(repo_dir))

ModuleNotFoundError: No module named 'numpy'

In [None]:
class TextDataset(Dataset):
  """Map-style dataset that tokenizes one text per item for sequence classification.

  Args:
    texts: Indexable collection of raw texts; each item is passed through
      ``str()`` before tokenization.
    targets: Indexable collection of integer labels, aligned with ``texts``.
    tokenizer: Hugging Face tokenizer (or any callable accepting the same
      keyword arguments) used to encode each text.
    max_len: Length every encoded sequence is padded or truncated to.
  """

  def __init__(self, texts, targets, tokenizer, max_len=512):
    self.texts = texts
    self.targets = targets
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    """Return the number of texts in the dataset."""
    return len(self.texts)

  def __getitem__(self, idx):
    """Encode the sample at ``idx``.

    Returns:
      dict with keys:
        'text': the raw text as ``str``;
        'input_ids': 1-D tensor of token ids (flattened tokenizer output);
        'attention_mask': 1-D tensor marking real tokens vs. padding;
        'targets': scalar ``torch.long`` tensor holding the label.
    """
    # NOTE(review): indexing is delegated to the containers as-is; if `texts`
    # / `targets` are pandas Series, `[idx]` is a label lookup — confirm the
    # index is the default 0..N-1 range.
    text = str(self.texts[idx])
    target = self.targets[idx]

    # Call the tokenizer directly: `encode_plus` is deprecated in favour of
    # `__call__`, which accepts the same keyword arguments.
    encoding = self.tokenizer(
        text,
        add_special_tokens=True,
        max_length=self.max_len,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt',
        truncation=True
    )

    return {
      'text': text,
      # return_tensors='pt' yields a leading batch dimension; flatten drops it
      # so the DataLoader can stack samples into (batch, max_len).
      'input_ids': encoding['input_ids'].flatten(),
      'attention_mask': encoding['attention_mask'].flatten(),
      'targets': torch.tensor(target, dtype=torch.long)
    }

In [None]:
# Load the raw table from the Excel workbook; the path is resolved relative
# to the current working directory.
df = pd.read_excel(io='data.xlsx')

In [None]:
# Tokenizer for the same pretrained checkpoint name that the classifier
# is loaded from later in the notebook.
tokenizer = BertTokenizer.from_pretrained(
    'DeepPavlov/rubert-base-cased',
)

In [None]:
# Validation dataset: hold out a fixed fraction of the training data.
val_ratio = 0.2
split_seed = 42  # fixed seed so the train/validation split is identical across runs

# NOTE(review): `train_dataset` is rebound to the training subset below, so
# re-running this cell alone splits the already-reduced subset again; re-run
# from the cell that builds the full dataset instead.
train_dataset, val_dataset = random_split(
    dataset=train_dataset,
    lengths=[1 - val_ratio, val_ratio],
    # Without an explicit generator the split depends on the global RNG state.
    generator=torch.Generator().manual_seed(split_seed),
)

In [7]:
from src.engine import Trainer
from src.optim import WarmupScheduler

# `train_dataset` is the Subset produced by random_split; `.dataset` is the
# underlying full dataset.
base_dataset = train_dataset.dataset
if hasattr(base_dataset, 'classes'):
    num_classes = len(base_dataset.classes)
else:
    # TextDataset exposes `targets` but no `classes`, so count distinct labels.
    # assumes labels are encoded as 0..K-1 so the count equals num_labels — TODO confirm
    num_classes = len(np.unique(base_dataset.targets))

model = BertForSequenceClassification.from_pretrained(
    'DeepPavlov/rubert-base-cased',
    num_labels=num_classes,
)

# NOTE(review): lr=1e-3 is far above the 2e-5..5e-5 range usually used when
# fine-tuning BERT; left unchanged because the effect of WarmupScheduler on
# the rate is not visible here — confirm.
optimizer = torch.optim.Adam(model.parameters(), lr=.001, weight_decay=1e-3)
criterion = nn.CrossEntropyLoss()
batch_size = 64
scheduler = WarmupScheduler(optimizer)

trainer = Trainer(
    model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    batch_size=batch_size,
    scheduler=scheduler,
    path='model.pt'
)

# presumably the argument is the number of epochs — verify against src.engine.Trainer
trainer.train(10)

ModuleNotFoundError: No module named 'numpy'