In [1]:
import gc
from functools import partial
from operator import methodcaller

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.utils import gen_batches, shuffle
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast, AutoModelForSequenceClassification

from nya_ml.models import *
from nya_ml_research.config import DATA_PATH
from nya_ml_research.src.data.dataset import RuSentimentDataset
from nya_ml_research.src.evaluation.datasets import get_rusentiment, get_kaggle_news, \
    get_russian_language_toxic_comments, fix_toxic_russian_comments, get_toxic_russian_comments
from nya_utils.datatools import supplier

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%reload_ext autoreload

In [58]:
BATCH_SIZE = 64

In [87]:
before = torch.cuda.memory_allocated()
before

512

In [88]:
t = torch.tensor([1., 2.], requires_grad=True, device='cuda')

In [89]:
with torch.no_grad():
    t2 = torch.asin(t)
    t2 = t2.cuda()

In [90]:
t3 = t2.to('cpu')

In [91]:
del t, t2
gc.collect()

In [94]:
after = torch.cuda.memory_allocated()
after

512

In [93]:
# del t
gc.collect()

2989

In [32]:
torch.cuda.synchronize()

In [31]:
t2

NameError: name 't2' is not defined

In [59]:
sentiment_model_suppliers = [
    partial(TatyanaRuBertSentiment.load, 'cuda'),
    partial(BlanchefortRuBertSentiment.load, 'cuda'),
]
toxic_model_suppliers = [
    partial(SismetaninRuBertToxic.load, 'cuda'),
    partial(SkolkovoRuToxicityClassifier.load, 'cuda'),
]

In [60]:
sentiment_dataset_suppliers = [get_rusentiment, get_kaggle_news]
toxic_dataset_suppliers = [get_russian_language_toxic_comments, get_toxic_russian_comments]

In [61]:
def clear(assert_cuda=False):
    gc.collect()

    torch.cuda.synchronize()
    torch.cuda.empty_cache()

    if assert_cuda:
        assert torch.cuda.memory_allocated() == 0, 'Can`t clear memory'


In [113]:
# del sentiment_model
gc.collect()
clear()

In [114]:
torch.cuda.memory_allocated()

0

In [98]:
def evaluate(dataset, model):
    dataset = dataset[:120]
    y_pred = torch.empty(0)
    y_true = []

    for batch in tqdm(gen_batches(len(dataset), BATCH_SIZE), total=int(len(dataset) / BATCH_SIZE)):
        predicted = model._predict(dataset.text[batch].tolist())
        predicted = predicted.to('cpu')

        y_pred = torch.cat([y_pred, torch.argmax(predicted, dim=1)])

        del predicted
        gc.collect()

        y_true += dataset.label[batch].tolist()

    y_pred = y_pred.tolist()
    clear()

    return y_true, y_pred


In [96]:
for sentiment_model_supplier in sentiment_model_suppliers:
    sentiment_model = sentiment_model_supplier()

    for sentiment_dataset_supplier in sentiment_dataset_suppliers:
        print('Model:', sentiment_model.__class__.__name__, 'Dataset:', sentiment_dataset_supplier.__name__)
        ds = sentiment_dataset_supplier()
        y_true, y_pred = evaluate(ds, sentiment_model)
        print(classification_report(y_true, y_pred, digits=2))
        # del y_true, y_pred

    print('before:', torch.cuda.memory_allocated())
    del sentiment_model
    clear(assert_cuda=True)
    print('after:', torch.cuda.memory_allocated())


Model: TatyanaRuBertSentiment Dataset: get_rusentiment


  0%|          | 0/1 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       0.74      0.85      0.79        46
           1       0.89      0.69      0.78        45
           2       0.78      0.86      0.82        29

    accuracy                           0.79       120
   macro avg       0.80      0.80      0.79       120
weighted avg       0.80      0.79      0.79       120

Model: TatyanaRuBertSentiment Dataset: get_kaggle_news


  0%|          | 0/1 [00:00<?, ?it/s]

RuntimeError: CUDA out of memory. Tried to allocate 768.00 MiB (GPU 0; 3.00 GiB total capacity; 1.79 GiB already allocated; 117.37 MiB free; 1.84 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# for toxic_dataset_supplier in toxic_dataset_suppliers:
#     for toxic_model in map(methodcaller('__call__'), toxic_model_suppliers):
#         print('Model:', toxic_model.__class__.__name__, 'Dataset:', toxic_dataset_supplier.__name__)
#         print(classification_report(*evaluate(toxic_dataset_supplier(), toxic_model), digits=2))
#         del toxic_model
#         torch.cuda.empty_cache()
#         gc.collect()
