In [1]:
import joblib
import torch
from torch import nn, Tensor
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import numpy as np
from sklearn import preprocessing
from tqdm import tqdm
from numba import jit

In [2]:
class Probability(nn.Module):
    def __init__(self):
        super(Probability, self).__init__()
        self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=False)
        # self.fc = nn.Linear(768*2,1)

    def get_config_dict(self):
        return {"alpha": self.alpha}

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        distances = F.pairwise_distance(sentence1, sentence2, p=2)
        prob = 2 / (1 + torch.exp(distances * (self.alpha**2)))
        # out = self.fc(torch.cat((sentence1, sentence2),1))
        # prob = 1 / (torch.exp(-out)+1)
        return prob

    def device(self):
        return self.alpha.device


class ProkurorProbModel(nn.Module):
    def __init__(self, metalurg_prob_model, prokuror_bert):
        super(ProkurorProbModel, self).__init__()

        self.metalurg_prob_model = metalurg_prob_model

        self.prokuror_bert = prokuror_bert

        self.dense = nn.Linear(768, 768)
        self.probability = Probability()

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        with torch.no_grad():
            self.metalurg_prob_model.eval()
            a = self.metalurg_prob_model.get_embedding(sentence1)
        b = self.get_embedding(sentence2)

        p = self.probability(a, b)

        return p

    def get_embedding(self, sentence):
        device = self.device()
        # sentence.to(device)

        anchor_ids = sentence["input_ids"].to(device)
        anchor_mask = sentence["attention_mask"].to(device)
        with torch.no_grad():
            a = self.prokuror_bert(anchor_ids, attention_mask=anchor_mask)[0][:, 0]
            a = self.dense(a)

        return a

    def to(self, device):
        self.prokuror_bert.to(device)
        self.dense.to(device)
        self.probability.to(device)

    def device(self):
        return self.metalurg_prob_model.device()


class ProbModel(nn.Module):
    def __init__(self, bert):
        super(ProbModel, self).__init__()
        self.bert = bert
        self.dense = nn.Linear(768, 768)
        self.probability = Probability()

    def forward(self, sentence1: Tensor, sentence2: Tensor):
        """
        device = self.probability.device()

        anchor_ids = sentence1["input_ids"].to(device)
        pos_ids = sentence2["input_ids"].to(device)

        anchor_mask = sentence1['attention_mask'].to(device)
        pos_mask = sentence2['attention_mask'].to(device)

        a = self.bert(anchor_ids, attention_mask=anchor_mask).pooler_output
        b = self.bert(pos_ids, attention_mask=pos_mask).pooler_output
        """

        a = self.get_embedding(sentence1)
        b = self.get_embedding(sentence2)

        p = self.probability(a, b)

        return p

    def get_embedding(self, sentence):
        device = self.device()

        anchor_ids = sentence["input_ids"].to(device)
        anchor_mask = sentence["attention_mask"].to(device)
        with torch.no_grad():
            a = self.bert(anchor_ids, attention_mask=anchor_mask)[0][:, 0]
            a = self.dense(a)
        # a = self.bert(anchor_ids, attention_mask=anchor_mask).pooler_output

        return a

    def to(self, device):
        self.bert.to(device)
        self.dense.to(device)
        self.probability.to(device)

    def device(self):
        return self.bert.device


def load_model(model_name: str = "DeepPavlov/rubert-base-cased"):
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    orig_model = AutoModel.from_pretrained(model_name)
    pmodel = ProbModel(orig_model)
    model = ProkurorProbModel(pmodel, orig_model)
    d = torch.load("/home/rsolomatin/metalurgi/prokuror_model/learned_model/35.pth")
    model.load_state_dict(d["model"])
    return tokenizer, model

In [7]:
metalurgs = pd.read_csv(
    "data/parsed_metalurgs_v3_only_prokuror.csv",
    index_col=0,
    dtype={"Код налогоплательщика": str},
)
metalurgs = metalurgs.rename(columns={"Код налогоплательщика": "INN"})
prokuror = pd.read_csv(
    "data/prokuror_results.csv", dtype={"INN": str}, compression="zip"
)
# prokuror = pd.read_pickle("data/prokuror_results_allcolumns.pkl.zip", compression="zip")
job = joblib.load("data/prokuror_test_inn.joblib")

metalurgs.shape, prokuror.shape, len(job)

((4517712, 5), (5224726, 5), 86)

In [8]:
%%time
le = preprocessing.LabelEncoder()
prokuror["cat_INN"] = le.fit_transform(prokuror["INN"])
cat_job = le.transform(job)
metalurgs["cat_INN"] = le.transform(metalurgs["INN"])

CPU times: user 4.19 s, sys: 72.6 ms, total: 4.26 s
Wall time: 4.26 s


In [9]:
prokuror["cat_INN"].nunique(), metalurgs["cat_INN"].nunique()

(590921, 426)

In [10]:
metalurgs_inn = metalurgs["cat_INN"].unique()

In [11]:
prokuror.shape

(5224726, 6)

In [12]:
prokuror = prokuror[prokuror["cat_INN"].isin(metalurgs_inn)]
# test_metalurgs = metalurgs[metalurgs["cat_INN"].isin(cat_job)]
metalurgs.shape, prokuror.shape, len(job)

((4517712, 6), (8425, 6), 86)

In [13]:
prokuror.to_csv("data/prokuror_only_metalurgs_all_columns.csv")

In [14]:
# torch.save(model, "model/save_35.pth")
model = torch.load("model/save_35.pth")
model.eval()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
tokenizer = AutoTokenizer.from_pretrained("DeepPavlov/rubert-base-cased")
model.metalurg_prob_model.dense.to(device)
model.to(device)

In [19]:
prokuror = pd.read_csv("data/prokuror_only_metalurgs.csv")

In [None]:
arr = []
num_split = test_metalurgs.shape[0] / 32
for row in tqdm(np.split(test_metalurgs, num_split)):
    sentences = row["line"].values.tolist()
    sentense_embeddings = tokenizer(
        sentences,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    embeddings = (
        model.metalurg_prob_model.get_embedding(sentense_embeddings).cpu().numpy()
    )  # .astype(np.float16)
    for sentence_id, inn, embedding in zip(row.index, row["INN"], embeddings):
        arr.append({"sentence_id": sentence_id, "inn": inn, "arr": embedding})

In [None]:
arr = np.array(arr)
with open("data/metalurgs_emb_32.npy", "wb") as f:
    np.save(f, arr)

In [None]:
arr = []
for index, row in tqdm(test_prokuror.iterrows(), total=test_prokuror.shape[0]):
    sentence = row["line"]
    sentense_embedding = tokenizer(
        sentence,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    embedding = (
        model.get_embedding(sentense_embedding).cpu().numpy()
    )  # .astype(np.float16)
    arr.append({"sentence_id": index, "inn": row["INN"], "arr": embedding})

In [None]:
arr = np.array(arr)
with open("data/prokuror_emb_32.npy", "wb") as f:
    np.save(f, arr)

In [20]:
arr = []
num_split = prokuror.shape[0] / 32
for row in tqdm(np.split(prokuror, num_split)):
    sentences = row["line"].values.tolist()
    sentense_embeddings = tokenizer(
        sentences,
        max_length=512,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    embeddings = (
        model.get_embedding(sentense_embeddings).cpu().numpy()
    )  # .astype(np.float16)
    for sentence_id, inn, embedding in zip(row.index, row["INN"], embeddings):
        arr.append({"sentence_id": sentence_id, "inn": inn, "arr": embedding})

100%|██████████| 145/145 [00:30<00:00,  4.71it/s]


In [21]:
arr = np.array(arr)
with open("data/metalurgprokuror_emb_32.npy", "wb") as f:
    np.save(f, arr)

# Compute distances

## numba cycle

In [16]:
%%time
with open("data/metalurgs_emb_32.npy", "rb") as f:
    metalurg = np.load(f, allow_pickle=True)

CPU times: user 5.38 s, sys: 3.38 s, total: 8.76 s
Wall time: 26.7 s


In [73]:
prokuror.shape

(3019791,)

In [15]:
%%time
with open("data/allprokuror_emb_32.npy", "rb") as f:
    prokuror = np.load(f, allow_pickle=True)

CPU times: user 14.6 ms, sys: 8.14 ms, total: 22.7 ms
Wall time: 21.5 ms


In [17]:
prokuror_arr = []
prokuror_id = []
is_2d = len(prokuror[0]["arr"].shape) == 2
for prokuror_elem in tqdm(prokuror):
    if is_2d:
        prokuror_arr.append(prokuror_elem["arr"][0])
    else:
        prokuror_arr.append(prokuror_elem["arr"])
    prokuror_id.append(prokuror_elem["sentence_id"])
prokuror_arr = np.array(prokuror_arr)
prokuror_id = np.array(prokuror_id)

100%|██████████| 4663/4663 [00:00<00:00, 1208479.95it/s]


In [None]:
metalurg_arr = []
for metalurg_elem in tqdm(metalurg):
    metalurg_arr.append(metalurg_elem["arr"])

metalurg_arr = np.array(metalurg_arr)

In [None]:
@jit(nopython=True, nogil=True, cache=True, fastmath=True)
def lin(y1, y2):
    return np.linalg.norm(y1 - y2)


@jit(nopython=True, nogil=True, cache=True)
def find_neighbors(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = lin(metalurg_arr, arr)
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]

In [None]:
res = []
for i, metalurg_elem in tqdm(enumerate(metalurg), total=metalurg.size):
    distances, sentences_id = find_neighbors(metalurg_arr[i], prokuror_arr, prokuror_id)
    for dist, sent_id in zip(distances, sentences_id):
        res.append(
            {
                "metalurg_id": metalurg_elem["sentence_id"],
                "metalurg_inn": metalurg_elem["inn"],
                "prokuror_sentence_id": sent_id,
                "prokuror_distance": dist,
            }
        )

In [None]:
prokuror_arr = prokuror_arr.astype(np.float16)
metalurg_arr = metalurg_arr.astype(np.float16)

In [None]:
prokuror_arr = torch.from_numpy(prokuror_arr)
metalurg_arr = torch.from_numpy(metalurg_arr)

In [None]:
prokuror_arr = prokuror_arr.type(torch.float16)
metalurg_arr = metalurg_arr.type(torch.float16)

In [18]:
prokuror_arr = torch.from_numpy(prokuror_arr).type(torch.float16)

In [20]:
torch.save(prokuror_arr, "data/prokuror_arr.pt")
torch.save(metalurg_arr, "data/metalurg_arr.pt")

## Torch cycle

In [25]:
%%time
with open("data/metalurgs_emb_32.npy", "rb") as f:
    metalurg = np.load(f, allow_pickle=True)

CPU times: user 6.22 s, sys: 2.66 s, total: 8.88 s
Wall time: 26.7 s


In [37]:
%%time
with open("data/metalurgprokuror_emb_32.npy", "rb") as f:
    prokuror = np.load(f, allow_pickle=True)

CPU times: user 25.6 ms, sys: 18.5 ms, total: 44.1 ms
Wall time: 230 ms


In [38]:
prokuror_arr = []
prokuror_id = []
is_2d = len(prokuror[0]["arr"].shape) == 2
for prokuror_elem in tqdm(prokuror):
    if is_2d:
        prokuror_arr.append(prokuror_elem["arr"][0])
    else:
        prokuror_arr.append(prokuror_elem["arr"])
    prokuror_id.append(prokuror_elem["sentence_id"])
prokuror_arr = np.array(prokuror_arr)
prokuror_id = np.array(prokuror_id)

100%|██████████| 4663/4663 [00:00<00:00, 1263602.50it/s]


In [39]:
prokuror_arr = torch.from_numpy(prokuror_arr).type(torch.float16)

In [30]:
%%time
# prokuror_arr = torch.load('data/prokuror_arr.pt')
metalurg_arr = torch.load("data/metalurg_arr.pt")

CPU times: user 38.7 ms, sys: 1.35 s, total: 1.39 s
Wall time: 10.1 s


In [31]:
@torch.jit.script
def test_size(metalurg_arr, prokuror_arr):
    ans = F.pairwise_distance(metalurg_arr, prokuror_arr, p=2.0).cpu().detach()
    return ans

In [40]:
torch.cuda.empty_cache()

In [None]:
res = []
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# sizes = [(0, 1_000_000), (1_000_000,  2_000_000), (2_000_000, 3_000_000), (3_000_000, -1)]
# sizes = [(0, 1_500_000), (1_500_000,  3_000_000), (3_000_000, -1)]
sizes = [(0, 1_500_000)]
for size_i, (start, end) in enumerate(sizes):
    dist = [[]] * metalurg_arr.shape[0]
    ind = [[]] * metalurg_arr.shape[0]
    prokuror_gpu = prokuror_arr[start:end].to(device)
    for i, metalurg_elem in tqdm(enumerate(metalurg_arr), total=metalurg_arr.shape[0]):
        tmp_dist = test_size(metalurg_elem.to(device), prokuror_gpu)
        torch.cuda.empty_cache()
        indexes = torch.argsort(tmp_dist)
        dist[i] = tmp_dist[indexes][:3].tolist()
        ind[i] = prokuror_id[indexes][:3].tolist()

 97%|█████████▋| 908987/935699 [15:00<00:26, 1009.98it/s]

In [None]:
res = []
for distance, indexs, metalurg_elem in tqdm(zip(dist, ind, metalurg), total=len(dist)):
    for d, i in zip(distance, indexs):
        res.append(
            {
                "metalurg_id": metalurg_elem["sentence_id"],
                "metalurg_inn": metalurg_elem["inn"],
                "prokuror_sentence_id": i,
                "prokuror_distance": d,
            }
        )

In [None]:
df = pd.DataFrame(res)
df.shape

In [None]:
df.to_csv("prokuror_metalurg_dist.csv.zip", compression="zip")

In [10]:
res = []
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# sizes = [(0, 1_000_000), (1_000_000,  2_000_000), (2_000_000, 3_000_000), (3_000_000, -1)]
sizes = [(0, 1_500_000), (1_500_000, 3_000_000), (3_000_000, -1)]
for i, metalurg_elem in tqdm(enumerate(metalurg), total=metalurg.size):
    ans = torch.tensor([])
    for start, end in sizes:
        size = torch.tensor([start, end], dtype=torch.int)
        tmp = prokuror_arr[start:end]
        prokuror_gpu = tmp.to(device)
        test = test_size(metalurg_arr[i].to(device), prokuror_gpu)
        ans = torch.cat((ans, test))
        del prokuror_gpu, tmp
        torch.cuda.empty_cache()
    indexes = torch.argsort(ans)
    for dist, sent_id in zip(ans[indexes][:3], prokuror_id[indexes][:3]):
        res.append(
            {
                "metalurg_id": metalurg_elem["sentence_id"],
                "metalurg_inn": metalurg_elem["inn"],
                "prokuror_sentence_id": sent_id,
                "prokuror_distance": dist,
            }
        )

  0%|          | 132/935699 [06:15<739:10:22,  2.84s/it] 


KeyboardInterrupt: 

In [None]:
df = pd.DataFrame(res)
df.head()

In [None]:
df.to_csv("data/metalurg_dist2.csv.zip", compression="zip")

## Test euclidean distances

In [None]:
from sklearn.metrics.pairwise import euclidean_distances
from numba import jit
import math
import numpy as np

In [None]:
%%time
with open("data/metalurgs_emb_32.npy", "rb") as f:
    metalurg = np.load(f, allow_pickle=True)

In [None]:
%%time
with open("data/prokuror_emb_32.npy", "rb") as f:
    prokuror = np.load(f, allow_pickle=True)

In [None]:
metalurg.shape, prokuror.shape

In [None]:
metalurg_arr = metalurg[0]["arr"]
prokuror_arr = prokuror[0]["arr"][0]

In [None]:
metalurg_arr.shape, prokuror_arr.shape

In [None]:
%timeit np.linalg.norm(metalurg_arr - prokuror_arr)

In [None]:
%timeit math.sqrt(sum([(a - b)**2 for a, b in zip(metalurg_arr, prokuror_arr)]))

In [None]:
%timeit np.sqrt(np.sum(np.power(metalurg_arr - prokuror_arr, 2)))

In [None]:
%timeit np.sqrt(np.sum((metalurg_arr - prokuror_arr)**2))

In [None]:
%timeit (np.sum((metalurg_arr - prokuror_arr)**2))

In [None]:
v1 = [metalurg_arr]
v2 = [prokuror_arr]

In [None]:
%timeit euclidean_distances(v1, v2)

In [None]:
@jit(nopython=True)
def jit_linalg(y1, y2):
    return np.linalg.norm(y1 - y2)

In [None]:
%timeit jit_linalg(metalurg_arr, prokuror_arr)

In [None]:
%timeit sum((metalurg_arr - prokuror_arr)**2)

In [None]:
%timeit ((metalurg_arr - prokuror_arr)**2).sum()

In [None]:
@jit(nopython=True)
def test_jit_sum(y1, y2):
    return ((y1 - y2) ** 2).sum()

In [None]:
test_jit_sum(metalurg_arr, prokuror_arr)

In [None]:
%timeit test_jit_sum(metalurg_arr, prokuror_arr)

In [None]:
@jit(nopython=True)
def test_jit_sum_square(y1, y2):
    return (np.power(y1 - y2, 2)).sum()

In [None]:
test_jit_sum_square(metalurg_arr, prokuror_arr)

In [None]:
%timeit test_jit_sum_square(metalurg_arr, prokuror_arr)

In [None]:
prokuror_arr = []
prokuror_id = []
for prokuror_elem in prokuror:
    prokuror_arr.append(prokuror_elem["arr"])
    prokuror_id.append(prokuror_elem["sentence_id"])

In [None]:
metalurg_arr = np.array(metalurg[0]["arr"])
prokuror_arr = np.array(prokuror_arr)
prokuror_id = np.array(prokuror_id)

In [None]:
def original(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = []
    for prokuror_elem, id in zip(prokuror_arr, prokuror_id):
        distance = np.linalg.norm(metalurg_arr - prokuror_elem[0])
        prok_dist.append({"sent": id, "dist": distance})
    sorted(prok_dist, key=lambda x: x["dist"])[:3]

In [None]:
@jit(nopython=True, nogil=True, cache=True, fastmath=True)
def linalg(y1, y2):
    return np.linalg.norm(y1 - y2)


def numba_v1(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = []
    for prokuror_elem, id in zip(prokuror_arr, prokuror_id):
        distance = linalg(metalurg_arr, prokuror_elem[0])
        prok_dist.append({"sent": id, "dist": distance})
    sorted(prok_dist, key=lambda x: x["dist"])[:3]


linalg(np.array([1.0, 2.0, 3.0]), np.array([1.0, 2.0, 3.0]))  # compile

In [None]:
@jit(nopython=True, nogil=True, cache=True)
def test_cycle(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = np.linalg.norm(metalurg_arr - arr[0])
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]


test_cycle(
    np.array([1.0, 2.0, 3.0]), np.array([[1.0], [2.0], [3.0]]), np.array([1, 2, 3, 4])
)  # compile

In [None]:
@jit(nopython=True, nogil=True, cache=True)
def numba_in_numba(metalurg_arr, prokuror_arr, prokuror_id):
    prok_dist = np.zeros(prokuror_arr.shape[0], dtype=np.float64)
    for i, arr in enumerate(prokuror_arr):
        prok_dist[i] = linalg(metalurg_arr, arr[0])
    indexes = np.argsort(prok_dist)
    return prok_dist[indexes][:3], prokuror_id[indexes][:3]


numba_in_numba(
    np.array([1.0, 2.0, 3.0]), np.array([[1.0], [2.0], [3.0]]), np.array([1, 2, 3, 4])
)  # compile

In [None]:
%timeit original(metalurg_arr, prokuror_arr, prokuror_id)
%timeit numba_v1(metalurg_arr, prokuror_arr, prokuror_id)
%timeit test_cycle(metalurg_arr, prokuror_arr, prokuror_id)
%timeit numba_in_numba(metalurg_arr, prokuror_arr, prokuror_id)

In [None]:
# 1000 prokuror_arr on cuda

%timeit F.pairwise_distance(metalurg_arr[0].to(device), prokuror_arr, p=2)

In [None]:
# 1000 prokuror_arr

%timeit find_neighbors(metalurg_a, test, test_id)

## Other

In [None]:
df = pd.read_csv("data/metalurg_dist2.csv.zip", compression="zip", index_col=0)

In [None]:
df.head()

In [None]:
df["prokuror_distance"].quantile([i / 10 for i in range(10)])