In [None]:
!pip install transformers

In [None]:
import os
import io
import sys
import json
import boto3
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from transformers import (
    AutoModel, 
    BertTokenizer, 
    BertForSequenceClassification,
    AdamW
)

In [None]:
def read_json(file_path):
    """Read a JSON document from disk and return the parsed object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the file's content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the locale default,
    # which breaks non-ASCII content on platforms whose default is not UTF-8.
    with open(file_path, encoding='utf-8') as file:
        return json.load(file)

# Bucket access settings; later cells read the keys 'aws_access_key_id',
# 'aws_secret_access_key', 'endpoint_url' and 'name' from this mapping.
CREDS = read_json(file_path='../configs/access_bucket.json')
# Only the key names are printed, so the secret values stay out of the cell output.
print(CREDS.keys())

In [None]:
# S3 client built from the credentials file. The endpoint URL is taken from
# the config as well, so the client talks to whichever endpoint is named there.
session = boto3.session.Session()
s3 = session.client(
    service_name='s3',
    aws_access_key_id=CREDS['aws_access_key_id'],
    aws_secret_access_key=CREDS['aws_secret_access_key'],
    endpoint_url=CREDS['endpoint_url'] 
)

In [None]:
# List the object keys in the bucket. The 'Contents' entry is absent from the
# response when the bucket holds no objects, so default to an empty list
# instead of failing with KeyError.
# NOTE: a single list_objects call returns at most 1000 keys.
[x['Key'] for x in s3.list_objects(Bucket=CREDS['name']).get('Contents', [])]

In [None]:
# Local application config; its "model" entry is used below as the key prefix
# under which the model artefacts are looked up in the bucket.
APP_CONFIG = read_json(file_path='../configs/config.json')
print('application config loaded:', APP_CONFIG)

# Fetch <model>/config.json from the bucket and parse it directly from the
# streamed response body.
file_to_load = f'{APP_CONFIG["model"]}/config.json'
get_object_response = s3.get_object(
    Bucket=CREDS['name'], 
    Key=file_to_load
)
CONFIG = json.load(get_object_response['Body'])
print('config loaded:', CONFIG)

In [None]:
class ArticlesDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes the ``anno`` column of a DataFrame.

    Each item is a dict of tensors: ``ids``, ``mask`` and ``token_type_ids``
    (all ``torch.long``, taken from the tokenizer output) and, when target
    columns were given, ``targets`` (``torch.float``).

    Args:
        df: DataFrame with an ``anno`` text column and, if ``target_cols`` is
            non-empty, those columns as well.
        tokenizer: Object exposing ``encode_plus``; its result must provide
            ``input_ids``, ``attention_mask`` and ``token_type_ids``.
        max_len: Passed to the tokenizer as ``max_length`` (together with
            ``padding='max_length'`` and ``truncation=True``).
        target_cols: Names of the label columns. May be empty or ``None`` for
            unlabeled data, in which case items carry no ``targets`` entry.
    """

    def __init__(self, df, tokenizer, max_len, target_cols):
        self.df = df
        self.max_len = max_len
        self.text = df['anno']
        self.tokenizer = tokenizer
        self.targets = df[target_cols].values if target_cols else []

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.df)

    def __getitem__(self, index):
        """Tokenize the row at 0-based position ``index`` and return its tensors."""
        # Positional access: ``self.text[index]`` would treat ``index`` as a
        # label and fail (or pick the wrong row) when the frame does not carry
        # a default 0..n-1 index, e.g. after filtering or sampling.
        text = self.text.iloc[index]
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True
        )

        item = {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs['token_type_ids'], dtype=torch.long),
        }
        # Without target columns ``self.targets`` is an empty list; indexing it
        # would raise IndexError, so the entry is simply left out.
        if len(self.targets) > 0:
            item['targets'] = torch.tensor(self.targets[index], dtype=torch.float)
        return item

class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a single linear classification layer.

    Args:
        model_name: Identifier passed to ``AutoModel.from_pretrained``.
        target_cols: Label names; only their count is used, as the number of
            output units of the linear layer.
    """

    def __init__(self, model_name, target_cols):
        super().__init__()
        self.rubert = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's own config rather than assuming a
        # 768-wide backbone; 768 stays as the fallback so checkpoints trained
        # with the original layer shape still load unchanged.
        hidden_size = getattr(self.rubert.config, 'hidden_size', 768)
        self.fc = torch.nn.Linear(hidden_size, len(target_cols))

    def forward(self, ids, mask, token_type_ids):
        """Run the encoder and return the linear layer's raw scores.

        No activation is applied here; callers decide how to map the scores
        to probabilities.
        """
        outputs = self.rubert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )
        # Take the second tuple element by index instead of unpacking exactly
        # two values, so encoders configured to return extra outputs do not
        # break the call.
        # NOTE(review): for BERT-style models this element is the pooled
        # output; confirm for the configured backbone.
        features = outputs[1]
        return self.fc(features)

In [None]:
class ArticlesPredictor():
    """Ensemble of ``BERTClass`` checkpoints fetched from S3 and averaged at inference.

    Args:
        model_name: Backbone identifier for the tokenizer and every ``BERTClass``.
        target_cols: Label names; they define the output columns (``<name>_pred``).
        max_seq_len: Token length passed to ``ArticlesDataset`` as ``max_len``.
        device: Device the models and the input batches are moved to.
        batch_size: Batch size of the prediction ``DataLoader``.
        num_workers: Worker count of the prediction ``DataLoader``.
        model_files: Iterable of object keys, one checkpoint per ensemble
            member. Each checkpoint must contain a ``'model_state_dict'`` entry.
        s3_client: Optional client exposing ``get_object``; defaults to the
            notebook-level ``s3``.
        bucket: Optional bucket name; defaults to ``CREDS['name']``.
    """

    def __init__(self, model_name, target_cols, max_seq_len,
                 device, batch_size, num_workers, model_files,
                 s3_client=None, bucket=None):
        # Fall back to the notebook globals so existing calls keep working,
        # while allowing the dependencies to be passed in explicitly.
        if s3_client is None:
            s3_client = s3
        if bucket is None:
            bucket = CREDS['name']

        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.max_seq_len = max_seq_len
        self.target_cols = target_cols
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.device = device
        self.models = []
        for m_name in model_files:
            m = BERTClass(
                model_name=model_name,
                target_cols=target_cols
            )
            m.to(device)
            get_object_response = s3_client.get_object(
                Bucket=bucket,
                Key=m_name
            )
            # SECURITY: torch.load is pickle-based. Unless the installed
            # PyTorch restricts it to weights-only loading, a malicious file
            # can execute code, so only load checkpoints from a trusted bucket.
            # The payload is buffered because torch.load needs a seekable file.
            checkpoint = torch.load(
                io.BytesIO(get_object_response['Body'].read()),
                map_location=torch.device('cpu')
            )
            m.load_state_dict(checkpoint['model_state_dict'])
            m.eval()
            self.models.append(m)
            print('loaded:', m_name)

    def infer(self, df):
        """Predict per-target probabilities for every row of ``df``.

        Args:
            df: DataFrame accepted by ``ArticlesDataset`` (an ``anno`` column
                plus the target columns).

        Returns:
            DataFrame with one ``<target>_pred`` column per target and one row
            per input row (index 0..n-1), holding the mean over all models of
            the sigmoid of each model's output.

        Raises:
            ValueError: If no models are loaded and ``df`` is not empty.
        """
        print('PREDICT:', df.shape)
        pred_columns = [c + '_pred' for c in self.target_cols]

        # Nothing to score: return an empty, correctly labelled frame instead
        # of failing on a column-count mismatch further down.
        if len(df) == 0:
            return pd.DataFrame(columns=pred_columns, dtype=float)
        if not self.models:
            raise ValueError('no models loaded: cannot run inference')

        # The loader addresses rows by position, so hand the dataset a frame
        # with a plain 0..n-1 index (a copy; the caller's frame is untouched).
        df = df.reset_index(drop=True)
        pred_dataset = ArticlesDataset(
            df=df,
            tokenizer=self.tokenizer,
            max_len=self.max_seq_len,
            target_cols=self.target_cols
        )
        pred_loader = torch.utils.data.DataLoader(
            pred_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            shuffle=False,
            # Pinned host memory only helps host-to-GPU copies.
            pin_memory=str(self.device).startswith('cuda')
        )
        n_steps = len(pred_loader)

        # Iterate the loader once and feed each batch to every model, so the
        # text is tokenized a single time instead of once per ensemble member.
        per_model = [[] for _ in self.models]
        with torch.no_grad():
            for i, batch in enumerate(pred_loader, start=1):
                ids = batch['ids'].to(self.device, dtype=torch.long)
                mask = batch['mask'].to(self.device, dtype=torch.long)
                token_type_ids = batch['token_type_ids'].to(self.device, dtype=torch.long)
                for j, model in enumerate(self.models):
                    print(f'model {j}, prediction step {i}/{n_steps}   ', end='\r')
                    scores = torch.sigmoid(model(ids, mask, token_type_ids))
                    per_model[j].extend(scores.cpu().tolist())

        # (n_models, n_rows, n_targets) -> mean over the model axis.
        y_pred = np.mean(per_model, axis=0)
        df_pred = pd.DataFrame(y_pred)
        df_pred.columns = pred_columns
        return df_pred

In [None]:
# Label names from the model config; their count sizes the classifier head
# and they name the prediction columns.
CONFIG['target_cols']

In [None]:
# Fetch <model>/model_files.json from the bucket; it is expected to hold a
# list of checkpoint path strings.
file_to_load = f'{APP_CONFIG["model"]}/model_files.json'
get_object_response = s3.get_object(
    Bucket=CREDS['name'], 
    Key=file_to_load
)
MODEL_FILES = json.load(get_object_response['Body'])
# Keep only each entry's file name and re-root it under the model's key prefix.
MODEL_FILES = [
    f'{APP_CONFIG["model"]}/{x.split("/")[-1]}'
    for x in MODEL_FILES
]
print('model files:', MODEL_FILES)

In [None]:
%%time
# Build the ensemble on CPU: one model is instantiated and loaded from the
# bucket for every key in MODEL_FILES.
predictor = ArticlesPredictor(
    model_name=CONFIG['bbone'], 
    target_cols=CONFIG['target_cols'], 
    max_seq_len=CONFIG['max_seq_len'],
    device='cpu', 
    batch_size=1, 
    num_workers=1, 
    model_files=MODEL_FILES
)

In [None]:
# Sample abstract used as a single-row smoke-test input.
text = """Результаты исследования позволили выявить 
городские секторы, подверженные большему и меньшему риску из-за последствий изменения климата. 
Определение рисков от изменения климата в жизненно важных секторах Стамбула является 
необходимым при принятии решений для разработки дальнейших стратегий по смягчению возможных 
последствий и адаптации к новым условиям."""
# ArticlesDataset reads the target columns from the frame, so add a dummy 0
# for every target next to the text.
d = {'anno': [text]}
d.update(dict(zip(CONFIG['target_cols'], [0] * len(CONFIG['target_cols']))))
df_txt = pd.DataFrame(d)
df_txt

In [None]:
%%time
# Run the ensemble on the single-row frame; the result has one
# '<target>_pred' column per target.
df_pred = predictor.infer(df_txt)
display(df_pred)

In [None]:
# Column-oriented view of the predictions: {column name: {row index: value}}.
df_pred.to_dict()

In [None]:
# Assemble the response payload: the first row's score per label plus the
# label legend from the model config.
preds = {}
for k, v in df_pred.to_dict().items():
    # Strip only a trailing '_pred'; str.replace would also remove the marker
    # from the middle of a label name.
    lbl = k[:-len('_pred')] if k.endswith('_pred') else k
    preds[lbl] = v[0]
data = {
    'legend': CONFIG['targets_description'],
    'predictions': preds,
}
data