In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch.nn.functional as F
from torch import nn, Tensor
import torch
from enum import Enum
import ast
from torch.utils.data import Dataset
from transformers import BertTokenizerFast, BertModel, BertConfig, get_scheduler
from transformers.optimization import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader

In [2]:
class SentencesDataset(Dataset):
    """Randomly sampled sentence pairs drawn from two data frames.

    Both frames must provide an ``"INN"`` column (the key used to match rows)
    and a ``"line"`` column (the sentence text).  Odd item indices yield a
    positive pair (both sentences share an INN, label ``1``); even indices
    yield a negative pair (different INNs, label ``0``).  Pairs are drawn with
    ``np.random`` on every access, so the same index does not return the same
    pair twice unless the NumPy seed is reset.

    Args:
        metalurgi: frame the first sentence of each pair is taken from.
        prokuror: frame the second sentence of each pair is taken from.
        prokuror_col: stored on the instance; not used by this class.
        data_size: reported dataset length; defaults to the smaller row count
            of the two frames.
        close_sent_dist: stored on the instance; not used by this class.
        far_sent_dist: stored on the instance; not used by this class.
    """

    def __init__(
        self,
        metalurgi,
        prokuror,
        prokuror_col,
        data_size=None,
        close_sent_dist=1,
        far_sent_dist=10,
    ):
        self.metalurgi = metalurgi
        self.prokuror = prokuror
        self.prokuror_col = prokuror_col
        # INNs present in both frames: the only candidates for positive pairs.
        # The mask is passed as a plain array so no index alignment is needed.
        shared_mask = self.prokuror["INN"].isin(self.metalurgi["INN"]).to_numpy()
        self.exsist_inn = self.prokuror.loc[shared_mask, "INN"].unique()
        self.data_size = (
            data_size
            if data_size is not None
            else min(self.metalurgi.shape[0], self.prokuror.shape[0])
        )
        self.close_sent_dist = close_sent_dist
        self.far_sent_dist = far_sent_dist

    def __len__(self):
        return self.data_size

    def __getitem__(self, idx: int):
        """Return ``{"sentence1", "sentence2", "label"}``; only the parity of ``idx`` matters."""
        label = idx % 2

        if label:
            first_sentence, second_sentence = self.get_positive_example()
        else:
            first_sentence, second_sentence = self.get_negative_example()

        return {
            "sentence1": first_sentence,
            "sentence2": second_sentence,
            "label": label,
        }

    @staticmethod
    def _is_missing(sentence):
        """True when a sampled sentence is ``None`` or a pandas null (NaN/NA)."""
        return sentence is None or bool(pd.isnull(sentence))

    @staticmethod
    def _positions_with_inn(frame, inn):
        """Positional (``iloc``) row numbers of ``frame`` whose INN equals ``inn``."""
        return np.flatnonzero((frame["INN"] == inn).to_numpy())

    def get_positive_example(self):
        """Sample a pair of non-null sentences that share the same INN."""
        while True:
            inn = np.random.choice(self.exsist_inn)
            # Positional lookups stay scalar even when index labels repeat;
            # a label-based ``.at`` would return a Series for duplicated labels.
            first_pos = np.random.choice(self._positions_with_inn(self.metalurgi, inn))
            first_sentence = self.metalurgi["line"].iloc[first_pos]
            second_pos = np.random.choice(self._positions_with_inn(self.prokuror, inn))
            second_sentence = self.prokuror["line"].iloc[second_pos]
            # Resample when either side is empty: a null sentence cannot be tokenized.
            if not (
                self._is_missing(first_sentence) or self._is_missing(second_sentence)
            ):
                return first_sentence, second_sentence

    def get_negative_example(self):
        """Sample a pair of non-null sentences whose INNs differ."""
        first_inn = self.metalurgi["INN"]
        second_inn = self.prokuror["INN"]
        while True:
            first_id = np.random.choice(self.metalurgi.shape[0])
            second_id = np.random.choice(self.prokuror.shape[0])
            # Redraw the second row until it belongs to a different INN.
            while first_inn.iloc[first_id] == second_inn.iloc[second_id]:
                second_id = np.random.choice(self.prokuror.shape[0])
            first_sentence = self.metalurgi["line"].iloc[first_id]
            second_sentence = self.prokuror["line"].iloc[second_id]
            if not (
                self._is_missing(first_sentence) or self._is_missing(second_sentence)
            ):
                return first_sentence, second_sentence


class TransformText:
    """Collate callable: turns a list of example dicts into tokenized batches.

    Each example must carry the keys ``"sentence1"``, ``"sentence2"`` and
    ``"label"``.  The result is ``(tokenized_first, tokenized_second, labels)``
    where ``labels`` is a ``torch`` tensor.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, data):
        # One pass over the examples, then split into three parallel columns.
        rows = [(item["sentence1"], item["sentence2"], item["label"]) for item in data]
        if rows:
            first_batch, second_batch, label_list = (list(col) for col in zip(*rows))
        else:
            first_batch, second_batch, label_list = [], [], []

        label_tensor = torch.tensor(label_list)
        try:
            first_batch = self.transform(first_batch)
            second_batch = self.transform(second_batch)
        except TypeError:
            # Surface the offending batch before re-raising.
            print(first_batch, second_batch)
            raise TypeError(f"{first_batch}, {second_batch}")
        return first_batch, second_batch, label_tensor

    def transform(self, sentences):
        """Tokenize ``sentences``, padded/truncated to 512 tokens, as PyTorch tensors."""
        return self.tokenizer(
            sentences,
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )

In [3]:
class SiameseDistanceMetric(Enum):
    """Named distance functions between two batches of embeddings.

    Each attribute is a two-argument callable ``(x, y) -> distance``.

    NOTE(review): the values are lambdas, and ``Probability`` calls the
    attribute directly and reads its ``__name__``; this presumably relies on
    the enum machinery leaving function-valued attributes as plain callables
    instead of wrapping them as members - confirm before refactoring.
    """

    # p=2 pairwise distance (L2 norm of the difference).
    EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2)
    # p=1 pairwise distance (L1 norm of the difference).
    MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1)
    # One minus cosine similarity.
    COSINE_DISTANCE = lambda x, y: 1 - F.cosine_similarity(x, y)


class Probability(nn.Module):
    """Maps the distance between two embeddings to a score ``2 / (1 + exp(d * alpha^2))``.

    ``alpha`` is a one-element parameter initialised to 1 and created with
    ``requires_grad=False``, so it is not updated by gradient descent.
    """

    def __init__(self, distance_metric=SiameseDistanceMetric.EUCLIDEAN):
        super().__init__()
        self.distance_metric = distance_metric
        self.alpha = nn.Parameter(torch.ones(1), requires_grad=False)

    def get_config_dict(self):
        """Describe the configured metric (by its ``SiameseDistanceMetric`` name when found) and ``alpha``."""
        # Fall back to the callable's own name when it is not an attribute of the enum.
        fallback_name = self.distance_metric.__name__
        qualified_names = (
            "SiameseDistanceMetric.{}".format(attr_name)
            for attr_name, candidate in vars(SiameseDistanceMetric).items()
            if candidate == self.distance_metric
        )
        return {
            "distance_metric": next(qualified_names, fallback_name),
            "alpha": self.alpha,
        }

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        """Return the score for each pair of rows in ``sentence1`` / ``sentence2``."""
        scale = self.alpha**2
        pair_distance = self.distance_metric(sentence1, sentence2)
        return 2 / (1 + torch.exp(pair_distance * scale))

    def device(self):
        """Device on which ``alpha`` currently lives."""
        return self.alpha.device

In [4]:
class Loss(nn.Module):
    """Binary cross-entropy over probabilities, guarded against ``log(0)``.

    Args:
        eps: probabilities are clamped to ``[eps, 1 - eps]`` before the
            logarithms are taken.  ``None`` (default) uses the machine epsilon
            of the probability tensor's dtype.
    """

    def __init__(self, eps=None):
        super().__init__()
        self.eps = eps

    def get_config_dict(self):
        """Return the non-default settings of this loss (empty for the default)."""
        return {} if self.eps is None else {"eps": self.eps}

    def forward(self, prob: Tensor, labels: Tensor):
        """Mean negative log-likelihood of ``labels`` (0/1) under ``prob``.

        Without clamping, ``prob == 1`` or ``prob == 0`` produces ``-inf`` in
        one of the logarithms, and multiplying it by a zero label weight
        yields NaN, which poisons the whole batch loss.
        """
        eps = torch.finfo(prob.dtype).eps if self.eps is None else self.eps
        prob = torch.clamp(prob, min=eps, max=1 - eps)
        labels = labels.to(prob.dtype)
        losses = labels * torch.log(prob) + (1 - labels) * torch.log1p(-prob)
        return -losses.mean()

In [5]:
class ProbModel(nn.Module):
    """Two-encoder model that scores how well two tokenized sentences match.

    The first sentence is encoded by ``bert`` under ``torch.no_grad()``; the
    second by ``custom_bert`` with gradients enabled.  Both first-token
    embeddings pass through the same linear layer before being compared by
    ``Probability``.

    Args:
        bert: encoder used for the first sentence (no gradients are recorded).
        custom_bert: encoder used for the second sentence.
    """

    def __init__(self, bert, custom_bert):
        super().__init__()
        self.bert = bert
        self.custom_bert = custom_bert
        # 768 must match the hidden size of both encoders.
        self.dense = nn.Linear(768, 768)
        self.probability = Probability()

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        """Return the match score for each pair in the two tokenized batches."""
        a = self.get_embedding(sentence1)
        b = self.get_embedding(sentence2, True)

        return self.probability(a, b)

    def get_embedding(self, sentence, custom_model=False):
        """Encode a tokenized batch and project its first-token embedding.

        Args:
            sentence: mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
            custom_model: use ``custom_bert`` (with gradients) instead of ``bert``.
        """
        # Inputs come from the collate function on CPU; move them to wherever
        # the model lives so the forward pass also works after ``model.to("cuda")``.
        device = self.probability.device()
        input_ids = sentence["input_ids"].to(device)
        attention_mask = sentence["attention_mask"].to(device)
        if custom_model:
            hidden = self.custom_bert(input_ids, attention_mask=attention_mask)[0]
        else:
            # NOTE(review): no_grad disables gradient tracking only; self.bert
            # still follows the module's train/eval mode - confirm intended.
            with torch.no_grad():
                hidden = self.bert(input_ids, attention_mask=attention_mask)[0]
        # Position 0 along the sequence axis is taken as the sentence embedding.
        return self.dense(hidden[:, 0])

In [6]:
def freeze_layers(model, start, end):
    """Freeze the whole base model, then unfreeze encoder layers ``start`` .. ``end - 1``.

    Args:
        model: object exposing ``base_model`` with ``parameters()`` and
            ``encoder.layer`` (indexable by layer number).
        start: index of the first encoder layer left trainable.
        end: index one past the last encoder layer left trainable.
    """
    backbone = model.base_model
    for weight in backbone.parameters():
        weight.requires_grad = False

    # Lazily walk the parameters of the selected encoder layers, in order.
    trainable_weights = (
        weight
        for layer_index in range(start, end)
        for weight in backbone.encoder.layer[layer_index].parameters()
    )
    for weight in trainable_weights:
        weight.requires_grad = True

In [14]:
# Load both sentence tables; the taxpayer-code column of the first one is
# renamed to "INN" so both frames share the same key column name.
metalurg_df = pd.read_csv(
    "parsed_metalurgs_v3_only_prokuror.csv", index_col=0
).rename(columns={"Код налогоплательщика": "INN"})
prokuror_df = pd.read_csv("prokuror_results.csv", index_col=0, engine="python")

prokuror_df.shape, metalurg_df.shape

((3022028, 5), (4517712, 5))

In [15]:
# DataFrame.info() prints its report and returns None, so call it as a plain
# statement instead of building a tuple that only echoes "(None, None)".
prokuror_df.info()
metalurg_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3022028 entries, 0 to 3019345
Data columns (total 5 columns):
 #   Column    Dtype  
---  ------    -----  
 0   path      object 
 1   INN       float64
 2   column    object 
 3   line      object 
 4   sent_num  float64
dtypes: float64(2), object(3)
memory usage: 138.3+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4517712 entries, 58748 to 20555692
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   path        object 
 1   sent_num    int64  
 2   line        object 
 3   INN         int64  
 4   p1_topic 1  float64
dtypes: float64(1), int64(2), object(2)
memory usage: 206.8+ MB


(None, None)

In [16]:
# Peek at the first rows of the second-sentence table.
prokuror_df.head(n=5)

Unnamed: 0,path,INN,column,line,sent_num
0,result_1,7325142000.0,OKVEDS,Деятельность ресторанов и услуги по доставке п...,0.0
1,result_1,7325142000.0,REASON,(ФЗ 248) Наличие у контрольного (надзорного) о...,1.0
2,result_1,7325143000.0,OKVEDS,Деятельность ресторанов и услуги по доставке п...,0.0
3,result_1,7325143000.0,REASON,(ФЗ 248) Наличие у контрольного (надзорного) о...,1.0
4,result_1,5610097000.0,OKVEDS,Деятельность автомобильного грузового транспор...,0.0


In [17]:
# Peek at the last rows of the first-sentence table.
metalurg_df.tail(n=5)

Unnamed: 0,path,sent_num,line,INN,p1_topic 1
20555688,"Sber_parser/3801-4001/data/ЛУКА, ООО_105631100...",183,Профили из нержавеющей стали,6311074905,0.002909
20555689,"Sber_parser/3801-4001/data/ЛУКА, ООО_105631100...",186,Профили для ступеней входной группы,6311074905,0.002909
20555690,"Sber_parser/3801-4001/data/ЛУКА, ООО_105631100...",197,Данный носит исключительно информационный хара...,6311074905,0.002909
20555691,"Sber_parser/3801-4001/data/ЛУКА, ООО_105631100...",258,Отправьте нам ваше резюме,6311074905,0.002909
20555692,"Sber_parser/3801-4001/data/ЛУКА, ООО_105631100...",261,Нажимая на кнопку вы подтверждаете что согласн...,6311074905,0.002909


In [18]:
torch.cuda.empty_cache()

# Both encoders share the configuration and tokenizer of the base checkpoint.
model_name = "DeepPavlov/rubert-base-cased"
config = BertConfig.from_pretrained(model_name)
tokenizer = BertTokenizerFast.from_pretrained(model_name)

# Encoder for the first sentence: weights of the base checkpoint.
orig_model = BertModel.from_pretrained(model_name, config=config)
# Encoder for the second sentence: weights loaded from a local directory.
custom_model = BertModel.from_pretrained(
    "/home/rsolomatin/metalurgi/site-prokuror/Models/transformers/DeepPavlov/rubert-base-cased-sentence",
    config=config,
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


device(type='cpu')

In [19]:
# Freeze everything in custom_model except encoder layer 11.
freeze_layers(custom_model, start=11, end=12)

In [20]:
batch_size = 16
# NOTE(review): the column list is stored by SentencesDataset but not used by it.
train = SentencesDataset(
    metalurg_df, prokuror_df, ["REASON", "WARNING_INFO"], data_size=10000
)
batch_transformer = TransformText(tokenizer)

train_loader = DataLoader(train, batch_size=batch_size, collate_fn=batch_transformer)

# len(train_loader) already counts batches, so it must not be divided by
# batch_size again.  This is the number of optimizer steps in one pass over
# the loader; multiply by the number of epochs when sizing an LR schedule.
total_steps = len(train_loader)

In [21]:
torch.cuda.empty_cache()

# Assemble the two-encoder model and place it on the selected device.
model = ProbModel(orig_model, custom_model)
model = model.to(device)

# AdamW over every model parameter with a learning rate of 1e-5.
optim = torch.optim.AdamW(model.parameters(), lr=1e-5)

loss_func = Loss()
loss_func.to(device)

Loss()

In [24]:
import sys, os


def fit(
    model,
    loss_func,
    train_loader,
    optim,
    device,
    epochs=3,
    model_save_path="model",
    disable_tqdm=False,
):
    """Train ``model`` on ``train_loader`` until ``epochs`` epochs are done.

    If ``model_save_path`` contains ``*.pth`` checkpoints, the one with the
    longest (then lexicographically greatest) file name is loaded first and
    training resumes after the epoch stored in it.

    Args:
        model: module called as ``model(sentence1, sentence2)``.
        loss_func: callable ``(probability, labels) -> scalar loss tensor``.
        train_loader: iterable of ``(sentence1, sentence2, labels)`` batches
            that also supports ``len()``.
        optim: optimizer stepping the model parameters.
        device: device the labels (and a loaded checkpoint) are moved to.
        epochs: epoch number at which training stops.
        model_save_path: directory scanned for checkpoints (created if absent).
        disable_tqdm: hide the progress bar when true.

    Raises:
        ValueError: if ``train_loader`` yields no batches in an epoch.
    """
    os.makedirs(model_save_path, exist_ok=True)

    epoch = 0

    # Resume from the latest checkpoint, if any.  Ordering by (length, name)
    # makes e.g. "10.pth" rank above "9.pth".
    # NOTE(review): this function only loads checkpoints; nothing visible here
    # writes them - confirm where they are expected to come from.
    checkpoints = [
        name for name in os.listdir(model_save_path) if name.endswith(".pth")
    ]
    if checkpoints:
        latest = max(checkpoints, key=lambda name: (len(name), name))
        # map_location lets a checkpoint saved on another device be restored here.
        saved_state = torch.load(
            os.path.join(model_save_path, latest), map_location=device
        )
        epoch = saved_state["epoch"]
        model.load_state_dict(saved_state["model"])

    while epoch < epochs:
        epoch += 1
        batch_results = []

        torch.cuda.empty_cache()

        # TRAINING
        model.train()
        progress = tqdm(
            enumerate(train_loader),
            desc=f"Epoch {epoch}/{epochs} train",
            total=len(train_loader),
            disable=disable_tqdm,
        )
        for i, batch in progress:
            sentence1, sentence2, labels = batch

            probability = model(sentence1, sentence2)

            labels = labels.to(device)
            loss = loss_func(probability, labels)

            # One row per example; the scalar batch loss is repeated on each row.
            batch_results.append(
                pd.DataFrame(
                    {
                        "Type": "Train",
                        "Epoch": epoch,
                        "Batch": i,
                        "Probs": probability.detach().cpu().reshape(-1).tolist(),
                        "Labels": labels.cpu().numpy(),
                        "Losses": loss.item(),
                    }
                )
            )

            loss.backward()
            optim.step()
            optim.zero_grad()

        if not batch_results:
            raise ValueError("train_loader produced no batches")

        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        res = pd.concat(batch_results)
        sel = (res.Type == "Train") & (res.Epoch == epoch)

        print(f"=====TRAIN(Epoch {epoch})=====")
        # NOTE(review): analyse_quality is not defined in the visible part of
        # this notebook; it must exist before fit() is called.  Its three
        # return values are currently unused.
        accuracy_train, qqs_train, label_mean_train = analyse_quality(res[sel])

In [None]:
# Start (or resume) training with the default number of epochs.
fit(
    model=model,
    loss_func=loss_func,
    train_loader=train_loader,
    optim=optim,
    device=device,
)