In [1]:
import requests
import json
from typing import Any, List, Tuple

import numpy as np
import torch
from numpy import ndarray
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
from transformers import (  # type: ignore
    AutoModelForSequenceClassification,
    BatchEncoding,
    BertTokenizerFast,
    PreTrainedTokenizerBase,
)

In [2]:
class MyTextDataset(Dataset):  # type: ignore
    """Map-style dataset that pairs every sentence with its position in the input list."""

    def __init__(self, sentence_list: List[str]) -> None:
        # The list is stored by reference; no copy is made.
        self.sentences = sentence_list

    def __len__(self) -> int:
        """Return the number of stored sentences."""
        sentence_count = len(self.sentences)
        return sentence_count

    def __getitem__(self, idx: int) -> Tuple[int, str]:
        """Return ``(idx, sentence)`` so the position travels together with the text."""
        sentence = self.sentences[idx]
        return (idx, sentence)


class MyCollateBatch:
    """Collate callable that tokenizes a batch of ``(index, sentence)`` pairs.

    Args:
        tokenizer: Callable tokenizer invoked on the list of sentences.
        max_length: Value forwarded to the tokenizer as ``max_length``
            (default 512, the previously hard-coded value).
        padding: Value forwarded to the tokenizer as ``padding``. The default
            ``"max_length"`` keeps the previous behaviour; see the tokenizer
            documentation for other strategies such as ``"longest"``.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int = 512,
        padding: str = "max_length",
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.padding = padding

    def __call__(self, batch: List[Tuple[int, str]]) -> BatchEncoding:
        """Tokenize the sentences of ``batch`` and attach their indices.

        Args:
            batch: List of ``(index, sentence)`` tuples.

        Returns:
            The tokenizer output with an extra ``"idx"`` entry holding the
            list of indices, in the same order as the sentences.
        """
        idx = [item[0] for item in batch]
        sentences = [item[1] for item in batch]

        text = self.tokenizer(
            sentences,
            max_length=self.max_length,
            padding=self.padding,
            truncation=True,
            return_tensors="pt",
        )
        # The indices ride along as a plain list; consumers must strip this
        # key before passing the encoding to a model.
        text["idx"] = idx
        return text


class ModelSentiment:
    """Sequence-classification model wrapper that scores lists of sentences.

    Args:
        model_folder: Path or identifier passed to ``from_pretrained`` for both
            the tokenizer and the model.
        device: Device the model and every batch are moved to.
        batch_size: Number of sentences per forward pass (default 128, the
            previously hard-coded value).
    """

    def __init__(self, model_folder: str, device: torch.device, batch_size: int = 128) -> None:
        self.device = device
        self.model_folder = model_folder
        self.batch_size = batch_size

        self.tokenizer = BertTokenizerFast.from_pretrained(model_folder)  # type: ignore
        self.model = AutoModelForSequenceClassification.from_pretrained(model_folder, return_dict=True)  # type: ignore
        self.collate_fn = MyCollateBatch(self.tokenizer)

        self.model.to(device)
        # Inference only: switch the model to evaluation mode.
        self.model.eval()

    def __call__(self, sentence_list: List[str]) -> np.ndarray:
        """Return softmax class probabilities for every sentence.

        Args:
            sentence_list: Sentences to classify.

        Returns:
            Array of shape ``(len(sentence_list), number_of_classes)``; row
            ``i`` holds the probabilities for ``sentence_list[i]``. An empty
            input yields an array with zero rows.
        """
        data_ds = MyTextDataset(sentence_list)
        loader = DataLoader(data_ds, batch_size=self.batch_size, collate_fn=self.collate_fn)
        result = np.zeros((len(sentence_list), len(self.class_names())))
        for batch in loader:
            # "idx" is a plain list of row positions, not a tensor, so it is
            # pulled out before the remaining entries are moved to the device.
            idx = batch["idx"]
            model_inputs = {k: v.to(self.device) for k, v in batch.items() if k != "idx"}
            with torch.no_grad():
                outputs = self.model(**model_inputs)
                predictions = torch.softmax(outputs.logits, dim=-1)

                # Write each row back at the position of its source sentence.
                result[idx, :] = predictions.to("cpu").numpy()

        return result

    def class_names(self) -> Any:
        """Return the model config's ``id2label`` mapping."""
        return self.model.config.id2label

In [3]:
# Base URL of the HTTP API queried by the cells below
# (/source, /model, /text/sentences, /text_result/).
URL = "http://172.18.211.203:8000"

In [4]:
# Request the list of text sources from the API. Fail immediately on an HTTP
# error so a later cell does not try to parse an error body as the source list.
r = requests.get(URL + "/source")
r.raise_for_status()

In [5]:
# Display the parsed JSON body of the /source response.
r.json()

{'items': [{'id': 10,
   'site': 'vk.com/other',
   'source_type_id': 2,
   'parser_state': None,
   'last_update': '2022-11-22T16:48:56.239229'},
  {'id': 5,
   'site': 'banki.ru/news',
   'source_type_id': 3,
   'parser_state': None,
   'last_update': '2022-11-22T18:14:27.994098'},
  {'id': 2,
   'site': 'sravni.ru',
   'source_type_id': 1,
   'parser_state': None,
   'last_update': '2022-11-22T18:15:08.523091'},
  {'id': 7,
   'site': 'banki.ru/mfo',
   'source_type_id': 1,
   'parser_state': None,
   'last_update': '2022-11-22T17:50:24.478308'},
  {'id': 8,
   'site': 'banki.ru/broker',
   'source_type_id': 1,
   'parser_state': None,
   'last_update': '2022-11-22T17:58:20.271769'},
  {'id': 4,
   'site': 'vk.com/comments',
   'source_type_id': 2,
   'parser_state': None,
   'last_update': '2022-11-22T17:59:08.686448'},
  {'id': 6,
   'site': 'irecommend.ru',
   'source_type_id': 1,
   'parser_state': None,
   'last_update': '2022-11-20T20:26:07.251741'},
  {'id': 9,
   'site': 'ba

In [6]:
# POST the model description to /model; the response is read for "model_id" below.
# Raising on an HTTP error here gives a clear failure instead of a KeyError
# when the response body is read.
r = requests.post(
    URL + "/model",
    data=json.dumps({"model_name": "ethics_model_sentiment_test", "model_type": "sentiment"}),
)
r.raise_for_status()

In [7]:
# Identifier returned by the /model endpoint; attached to every result posted later.
model_id = r.json()["model_id"]

In [8]:
# Display the identifier extracted from the /model response.
model_id

1

In [12]:
# Show GPU status via the shell.
# NOTE(review): this cell's execution count (12) is higher than the next cell's (10),
# so the notebook was run out of order — re-check with Restart & Run All.
!nvidia-smi

Wed Nov 23 12:52:13 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.56       Driver Version: 460.56       CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  GeForce RTX 3080    Off  | 00000000:03:00.0 Off |                  N/A |
|  0%   46C    P8    36W / 340W |   6603MiB / 10018MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  GeForce RTX 3080    Off  | 00000000:08:00.0 Off |                  N/A |
|  0%   50C    P8    24W / 340W |     19MiB / 10015MiB |      0%      Default |
|       

In [10]:
# Run on CUDA when it is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load the tokenizer and classifier from the local checkpoint folder.
m = ModelSentiment(
    model_folder="/home/rsolomatin/ESGanalysis/pretrained_models/model-sentiment",
    device=device,
)

In [None]:
# Query parameters for /text/sentences: which sources to pull from, which model
# the request is made for, and how many sentences to fetch per request.
params = {
    "sources": [
        # Currently disabled sources (uncomment to include them):
        # "banki.ru/broker",
        # "banki.ru/mfo",
        # "banki.ru/insurance",
        # "vk.com/other",
        "vk.com/comments",
        "banki.ru",
        "sravni.ru",
        "irecommend.ru",
        "banki.ru/news",
    ],
    # Use the id returned by /model so requests and posted results always
    # refer to the same model (was hard-coded to 1).
    "model_id": model_id,
    "limit": 1000,
}
for _ in tqdm(range(100_000)):
    r = requests.get(URL + "/text/sentences", params=params)
    if r.status_code != 200:
        raise Exception(r.text)

    # Parse the body once and reuse it for both sentences and ids.
    items = r.json()["items"]
    if not items:
        # NOTE(review): assumes an empty page means there is nothing left to
        # score — confirm against the API; stops the loop instead of issuing
        # the remaining requests with empty payloads.
        break

    sentences = [elem["sentence"] for elem in items]
    ids = [elem["id"] for elem in items]

    # ModelSentiment.__call__ already disables gradient tracking internally.
    result_sentences = m(sentences)

    request_data = [
        {"model_id": model_id, "text_sentence_id": sentence_id, "text_result": result_arr.tolist()}
        for result_arr, sentence_id in zip(result_sentences, ids)
    ]
    r = requests.post(URL + "/text_result/", data=json.dumps({"items": request_data}))
    if r.status_code != 200:
        raise Exception(r.text)