# 0. Import stuff

In [1]:
import pandas as pd
import numpy as np
import pickle
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from torch import nn
from razdel import tokenize, sentenize
from transformers import AutoModel, AutoTokenizer
import torch.nn.utils.rnn as rnn_utils
import gc

# 1. Load data

In [2]:
class SentenceBertTransformer:
    def __init__(self, model_name='setu4993/LaBSE', device='cpu', max_len=512):
        self.model_name = model_name
        self.device = device
        self.max_len = max_len
        
    def load_model(self):
        self.model = AutoModel.from_pretrained(self.model_name)
        self.model = self.model.to(self.device)
        self.model = self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        
    def transform(self, text):
        inputs = self.tokenizer(
            [text], return_tensors="pt", padding=True, max_length=self.max_len, verbose=False, truncation=True
        )
        inputs = inputs.to(self.device)
        with torch.no_grad():
            outputs = self.model(**inputs)
            
        return outputs[1][0].detach().cpu()
tf = SentenceBertTransformer(device="cuda")
tf.load_model()
tf.transform("i go home")


tensor([-9.5668e-01, -7.9142e-01, -5.0812e-01, -9.8696e-01,  6.6171e-02,
        -5.4171e-01,  4.5069e-01, -8.2792e-01, -9.8220e-01, -8.5164e-01,
         7.3792e-01, -7.0949e-01,  2.1814e-01, -6.0538e-01, -1.0774e-01,
        -9.6845e-01, -6.5996e-01, -2.1084e-02,  1.0497e-01, -3.8302e-01,
        -4.3587e-02, -9.6350e-01, -4.4819e-01, -7.4100e-01, -9.3513e-01,
        -9.3505e-02, -2.7327e-01,  3.2618e-01,  1.6681e-01, -2.8431e-01,
        -9.8772e-01, -4.3425e-01, -2.5844e-02, -4.6207e-01, -4.5117e-01,
        -4.7463e-01,  6.1623e-01, -7.2307e-01, -7.8088e-01,  3.3429e-01,
        -5.6118e-01, -6.7522e-01, -6.9028e-01, -9.4665e-01, -2.4793e-01,
         4.7554e-02,  2.7928e-01, -8.6677e-01, -6.4995e-01, -9.7788e-01,
        -9.7562e-01, -9.5297e-01, -6.8018e-01, -8.8297e-01, -2.2271e-02,
        -4.9821e-01, -4.4296e-01, -3.7419e-01, -6.6041e-01, -1.8406e-02,
        -7.3359e-01, -4.5413e-01,  5.1767e-01,  3.5751e-01, -9.8970e-02,
        -8.5916e-01, -3.0190e-01, -3.1376e-01, -7.9

In [3]:
df = pd.read_parquet('avito_cv2vac_with_ranks_clear.pq')
vac_to_id = dict(zip(df['vac_des'].unique(), range(df['vac_des'].nunique())))
df['vac_id'] = df['vac_des'].apply(lambda x: vac_to_id[x])
df

Unnamed: 0_level_0,User_id,Item_id,EventDate,last_resume_id,event_name,Region,City,session_hash,microcat_name,Platform_id,res_title,res_des,vac_title,vac_des,label,rank,max_rank,vac_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
0,1.030430e+12,1.353529e+12,2023-03-13 11:21:22,1.427381e+12,Пользовательские события / Объявления / Просмо...,Москва,Москва,8.753473e+18,Полный день,3.0,Подработка,"Бригадир раздачи газет на ж/ д станциях, интер...",Диспетчер оператор (менеджер),! ! ! ВНИМАНИЕ ! ! ! ОЧЕНЬ ВАЖНО ! ! ! требуют...,1,1,1,0
1,8.000233e+11,1.237305e+12,2023-03-13 03:02:08,1.401474e+12,Чат / События полученные через AMQP / Отправка...,Иркутская область,Братск,8.963607e+18,Сменный график,3.0,Не имеет значения,Срочно нужна работа !!!,Сборщики в магазин,! В гипермаркет требуются сборщики заказов.\n\...,1,1,2,1
2,1.073960e+12,1.237305e+12,2023-03-13 13:18:16,1.314115e+12,Чат / События полученные через AMQP / Отправка...,Иркутская область,Братск,2.662302e+18,Сменный график,4.0,Начинающий специалист,"Ответственная ,умею общаться с людьми",Сборщики в магазин,! В гипермаркет требуются сборщики заказов.\n\...,1,2,2,1
3,1.128231e+12,1.247251e+12,2023-03-13 10:39:39,1.429179e+12,Чат / События полученные через AMQP / Отправка...,Воронежская область,Воронеж,4.597737e+18,Сменный график,2.0,Продавец консультант,Продавец консультант,Сборщики заказов,! В гипермаркет требуются сборщики заказов.\n\...,1,1,1,2
4,6.528780e+11,1.121617e+12,2023-03-13 01:15:04,1.425790e+12,Чат / События полученные через AMQP / Отправка...,Московская область,Химки,6.790487e+17,Вахтовый метод,3.0,Медсестра на дом капельницы уколы,Я Настя из Таджикистана год рождения 1990 стаж...,"Работа вахта на складе 15/15, с проживанием)","! ВАХТА в МОСКОВСКОЙ ОБЛАСТИ !\n\nВОЗМОЖНО, БЕ...",1,1,21,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
465633,8.245448e+11,1.427353e+12,2023-03-13 13:45:18,1.423216e+12,Чат / События полученные через AMQP / Отправка...,Москва,Москва,6.613096e+17,Сменный график,3.0,Официантка работа,Ответственная исполнительная пунктуальность,Помощник на обработку документов на 4 часа,Требуется помощник на обработку входящей докум...,0,0,0,81055
465634,1.181303e+12,1.373503e+12,2023-03-13 08:20:30,1.431757e+12,Пользовательские события / Объявления / Просмо...,Псковская область,Великие Луки,3.857741e+18,Сменный график,3.0,"Любая работа, подработка",Быстро обучаюсь,Статрший администратор службы приема и размещения,"Описание работодателя:\nОтель 4 звезды, г. А...",0,0,0,42846
465635,1.127932e+12,1.428407e+12,2023-03-13 19:20:25,1.429851e+12,Чат / События полученные через AMQP / Отправка...,Татарстан,Набережные Челны,1.586361e+18,Вахтовый метод,4.0,Репетитор по английскому языку,стрессоустойчивый и коммуникабельный профессионал,Вахта проживание + питание/Упаковщик/20 смен,Вахта 15 смен с питанием и проживанием - ЗВОНИ...,0,0,0,92719
465636,9.636738e+11,1.431304e+12,2023-03-13 13:14:56,1.341265e+12,Пользовательские события / Объявления / Просмо...,Башкортостан,Стерлитамак,5.963610e+18,Полный день,3.0,Тракторист на мтз 82,ищу работу на своем тракторе мтз 82!,Менеджер по работе с клиентами,Требуется сотрудница в сервисный центр/магазин...,0,0,0,82627


# 2. Constants

In [4]:
LEN_BERT = 768
BATCH_SIZE = 128
DEVICE = "cuda"

# 3. Make SiameseDataset and collate_fn

In [5]:
class SiameseDataset(Dataset):
    def __init__(self, df, res_title, res_des, vac_title, vac_des, label_column, rank_column, max_rank_column):
        """
         Create dataset for Siamese Net training.

         Parameters
         ----------
         df : pd.DataFrame
             the dataframe we create dataset from
         vac_embed_column: str
             name of the column of the vacancy embeddings
         res_embed_column: str
             name of the column of the resume embeddings
         res_des_column: str
             name of the column of the resume text description
         label_column: str
             name of the column of the vacancy embeddings
         rank_column: str
             name of the column of the resume rank 
         max_rank_column: str
             name of the column of the max resumes rank for this vacancy 
        need new desription

             Returns
         -------
         None
         """
        self.df = df[[res_title, res_des, vac_title, vac_des,label_column, rank_column, max_rank_column]]

        self.res_title = res_title
        self.res_des = res_des
        self.vac_title = vac_title
        self.vac_des = vac_des
        self.label_column = label_column
        self.rank_column = rank_column
        self.max_rank_column = max_rank_column

        # предполагаю, что каждое резюме кидается ровно на 1 вакансию, составляя одну пару
        self.nunique_pairs = df[res_des].nunique()



    def __len__(self):
        """
         Return total amount of unique pairs: (vac_embed, res_embed).

         Parameters
         ----------
         None

         Returns
         -------
         int
             total amount of unique pairs: (vac_embed, res_embed) 
         """
        return self.nunique_pairs
  
    def __getitem__(self, idx):
        '''
         Return training object: (vac_embed, res_embed, label, rank, max_rank);
         Return rank and max_rank to penalty most appropriate samples more.

         Parameters
         ----------
         idx: int
             index of the samples we want to get.

         Returns
         -------
         tuple[torch.tensor]
             training object like a tuple: (vac_embed, res_embed, label, rank, max_rank)

        '''
        demandimg_row = self.df.iloc[idx, :]
        
        return torch.cat([get_embed(demandimg_row[self.res_title]), get_embed(demandimg_row[self.res_des])], dim=0), \
                torch.cat([get_embed(demandimg_row[self.vac_title]), get_embed(demandimg_row[self.vac_des])],dim=0), \
           torch.tensor(demandimg_row[self.label_column]), torch.tensor(demandimg_row[self.rank_column]), torch.tensor(demandimg_row[self.max_rank_column])

In [7]:
def get_embed(data, tf):
    answer = []
    for item in list(sentenize(data)):
            answer.append(tf.transform(item.text).reshape(1,LEN_BERT))
    return torch.cat(answer, dim=0)

In [8]:
text = '''
... - "Так в чем же дело?" - "Не ра-ду-ют".
... И т. д. и т. п. В общем, вся газета.
... Я пошел домой
... '''
list(sentenize(text))
get_embed(text), list(sentenize("я иду домой"))

(tensor([[-0.0507,  0.0391, -0.5538,  ...,  0.5727,  0.5194, -0.6324],
         [-0.1854, -0.0636, -0.7324,  ..., -0.7390,  0.4568, -0.7054],
         [ 0.2169, -0.8906, -0.7857,  ..., -0.1507, -0.4306, -0.1799],
         [-0.8015, -0.8706, -0.7585,  ...,  0.3070, -0.1467,  0.2445],
         [-0.9528, -0.8152, -0.8993,  ..., -0.3896, -0.9953, -0.6194]]),
 [Substring(0, 11, 'я иду домой')])

In [11]:
def collate_fn(data):
    """     
     Make dict samples from tuples (it is easier to use);

     Parameters
     ----------
       data: is a list of tuples with (vac_embed, res_embed, label, rank, max_rank)
      
    """
    res, vac,  label, rank, max_rank = zip(*data)

    dict_data = {'res': res, 
                 'vac': vac,
                 'label': label,
                 'rank': rank,
                 'max_rank': max_rank}

    return dict_data

# 4. Create dataset amd dataloader instance 

In [12]:
dataset = SiameseDataset(df, 'res_title', 'res_des', "vac_title", "vac_des", 'label', 'rank', 'max_rank')

In [13]:
dataloader = DataLoader(dataset, batch_size=128, shuffle=True, collate_fn=collate_fn)

In [14]:
with open("val_id.pickle", 'rb') as f:
    val_id = pickle.load(f)

with open("test_id.pickle", 'rb') as f:
    test_id = pickle.load(f)

with open("train_id.pickle", 'rb') as f:
    train_id = pickle.load(f)

In [15]:
train_dataset = SiameseDataset(df[df['vac_id'].isin(train_id)],'res_title', 'res_des', "vac_title", "vac_des", 'label', 'rank', 'max_rank')

val_dataset = SiameseDataset(df[df['vac_id'].isin(val_id)],'res_title', 'res_des', "vac_title", "vac_des", 'label', 'rank', 'max_rank')

test_dataset = SiameseDataset(df[df['vac_id'].isin(test_id)],'res_title', 'res_des', "vac_title", "vac_des", 'label', 'rank', 'max_rank')

# 5. Get a batch and overview it

In [13]:
batch = next(iter(dataloader))

In [14]:
# батч -- это словарь

type(batch)

dict

In [15]:
# его ключи 

batch.keys()

dict_keys(['res', 'vac', 'label', 'rank', 'max_rank'])

In [16]:
# внутри каждого ключа кортеж длиной batch_size

type(batch['res'])

tuple

In [17]:
len(batch['res'][0]), batch['res'][0]

(6,
 tensor([[-0.5827, -0.6846, -0.5353,  ...,  0.4666,  0.1050, -0.2236],
         [-0.6029, -0.2126,  0.2192,  ..., -0.0095,  0.1594, -0.5768],
         [-0.2919, -0.1481, -0.8866,  ..., -0.0223,  0.1018, -0.6278],
         [-0.4945, -0.1658, -0.6466,  ...,  0.8908, -0.1328, -0.0491],
         [-0.3670,  0.1026, -0.1941,  ...,  0.1574, -0.2153, -0.2915],
         [-0.5902, -0.5470, -0.1846,  ..., -0.0166,  0.2765,  0.0419]]))

In [18]:
# каждый элемент уже то, что заявлено в ключах

type(batch['vac'])

tuple

In [19]:
# размерность эмбеддинга
for item in batch['vac']:
    print(item.shape)

torch.Size([11, 768])
torch.Size([19, 768])
torch.Size([23, 768])
torch.Size([11, 768])
torch.Size([19, 768])
torch.Size([19, 768])
torch.Size([17, 768])
torch.Size([4, 768])
torch.Size([17, 768])
torch.Size([8, 768])
torch.Size([7, 768])
torch.Size([10, 768])
torch.Size([6, 768])
torch.Size([6, 768])
torch.Size([13, 768])
torch.Size([12, 768])
torch.Size([15, 768])
torch.Size([16, 768])
torch.Size([15, 768])
torch.Size([6, 768])
torch.Size([3, 768])
torch.Size([13, 768])
torch.Size([19, 768])
torch.Size([21, 768])
torch.Size([7, 768])
torch.Size([14, 768])
torch.Size([13, 768])
torch.Size([7, 768])
torch.Size([20, 768])
torch.Size([19, 768])
torch.Size([5, 768])
torch.Size([4, 768])
torch.Size([7, 768])
torch.Size([5, 768])
torch.Size([21, 768])
torch.Size([2, 768])
torch.Size([5, 768])
torch.Size([8, 768])
torch.Size([16, 768])
torch.Size([30, 768])
torch.Size([6, 768])
torch.Size([9, 768])
torch.Size([21, 768])
torch.Size([15, 768])
torch.Size([2, 768])
torch.Size([7, 768])
torch.Si

# 6. Architecture

In [16]:
class SiameseCVNet(nn.Module):
    def __init__(self,  rnn_hidden_dim, 
               hidden_layers, fc1_output=512, fc2_output=1):
    
        super(SiameseCVNet, self).__init__()

        self.rnn_hidden_dim = rnn_hidden_dim
        self.hidden_layers = hidden_layers
    
        self.rnn = nn.LSTM(input_size=LEN_BERT,
                           hidden_size=rnn_hidden_dim,
                           num_layers=hidden_layers,
                           batch_first=True)
        # считаем после конкатенации в forward_one
        self.fc1_input_one = 2 * (LEN_BERT+(self.hidden_layers + 1) * self.rnn_hidden_dim)

        # но мы конкатенируем 2 сэмпла!
        self.fc1_input = 2 * self.fc1_input_one

        self.fc1_output = fc1_output
        self.fc2_output = fc2_output

        fc1 = nn.Linear(self.fc1_input, self.fc1_output)
        relu = nn.ReLU()
        fc2 = nn.Linear(self.fc1_output, self.fc2_output)
        sigmoid = nn.Sigmoid()

        self.nn_head = nn.Sequential(
            fc1,
            relu,
            fc2,
            sigmoid

          ) 

    def forward(self, vac_embeds, res_embeds):

        catted_output_vac = self.forward_one(vac_embeds)
        catted_output_res = self.forward_one(res_embeds)

        # конкатенируем и пускаем через dense
#         print(catted_output_vac.shape, catted_output_res.shape)
        catted_output = torch.cat((catted_output_vac, catted_output_res), dim=-1)
#         print(catted_output.shape, self.fc1_input, self.fc1_output)
        sigm_output = self.nn_head(catted_output)

        return sigm_output

    def forward_one(self, batch):
        '''
        image there is just a tensor of embeddings
        shit with dims for sure
        '''

#         print('sample:', batch.shape)
        rnn_output, (hidden_states, cell_states) = self.rnn(batch)

        # print('rnn output:', rnn_output.shape)

        embed_max_pool = batch.max(dim=1)[0]
        embed_avg_pool = batch.sum(dim=1) / len(batch)

        rnn_max_pool = rnn_output.max(dim=1)[0]
        rnn_avg_pool = rnn_output.sum(dim=1) / len(rnn_output)  

        # print('embed pool:', embed_max_pool.shape, embed_avg_pool.shape)
        # print('rnn output pool:', rnn_max_pool.shape, rnn_max_pool.shape) 
        # print('hidden and state: ', hidden_states.shape, cell_states.shape)

        # тут 0 ось -- кол-во слоев в rnn-блоке
        hidden_states = torch.cat([hidden_states[i, :, :] for i in range(hidden_states.shape[0])], dim=-1)
        cell_states = torch.cat([cell_states[i, :, :] for i in range(cell_states.shape[0])], dim=-1)

        # print('hidden and state: ', hidden_states.shape, cell_states.shape)

        catted_output = torch.cat((embed_max_pool, embed_avg_pool, rnn_max_pool, 
                                  rnn_avg_pool, hidden_states, cell_states), dim=-1)

        return catted_output 

# 7. Model

In [17]:
model_parameters = {
    "rnn_hidden_dim": 512,
    "hidden_layers": 1,
    "fc1_output": 256,
    "fc2_output": 1,
}

experement_parameters = {
    "optimizer_lr": 1e-4,
    "optimizer_weight_decay": 1e-4,
    "scheduler_patience": 5,
    "scheduler_factor": 0.8,
    "dataset_train": train_dataset,
    "dataset_test": val_dataset,
    "dataset_val": test_dataset,
    "batch_size_train": 64, 
    "batch_size_val": 64, 
    "batch_size_test": 64,
    "num_epochs": 5,
    "train_shuffle": True,
    "collate_fn": collate_fn,
    "verbose_every_n_batches": 300,
    "eval_on_train": False,
}

# model = SiameseCVNet(   
#     rnn_hidden_dim=model_parameters["rnn_hidden_dim"],
#     hidden_layers=model_parameters["hidden_layers"],
#     fc1_output=model_parameters["fc1_output"],
#     fc2_output=model_parameters["fc2_output"]
# )

# optimizer = torch.optim.AdamW(
#     params=model.parameters(),
#     lr=experement_parameters["optimizer_lr"],
#     weight_decay=experement_parameters["optimizer_weight_decay"]
# )

# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
#     optimizer=optimizer,
#     patience=experement_parameters["scheduler_patience"],
#     factor=experement_parameters["scheduler_factor"]
# )
loss = torch.nn.BCELoss()

In [18]:

model = SiameseCVNet(   
    rnn_hidden_dim=model_parameters["rnn_hidden_dim"],
    hidden_layers=model_parameters["hidden_layers"],
    fc1_output=model_parameters["fc1_output"],
    fc2_output=model_parameters["fc2_output"]
)
model_state_dict = torch.load("model.pt")
model.load_state_dict(model_state_dict)
optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=experement_parameters["optimizer_lr"],
    weight_decay=experement_parameters["optimizer_weight_decay"]
)
optimizer.load_state_dict(torch.load("optimizer.pt"))
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer=optimizer,
    patience=experement_parameters["scheduler_patience"],
    factor=experement_parameters["scheduler_factor"]
)

In [19]:

def convert_tuple_to_tensor(tensors):
    ns = list([tensor.shape[0] for tensor in tensors])
    tensor_list = [torch.randn(n, LEN_BERT) for n in ns]

    # определяем максимальную размерность
    max_size = max(ns)

    # добавляем нулевые строки к тензорам, чтобы они имели одинаковую размерность
    for i, elem in enumerate(tensors):
        if tensor_list[i].shape[0] < max_size:
            tensor_list[i] = torch.cat((elem, torch.zeros(max_size-tensor_list[i].shape[0], LEN_BERT)), dim=0)

    # объединяем тензоры по первой размерности
    return  torch.cat([tensor.unsqueeze(0) for tensor in tensor_list], dim=0)

In [20]:
def train_epoch(model, optimizer, dataset, batch_size, shuffle, collate_fn, 
                verbose_every_n_batches=300):
    
    torch_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
    loss = torch.nn.BCELoss()

    total_train_loss = 0
    num_batches = 1
    
    model = model.to(DEVICE)
    model.train()
    for batch in torch_dataloader:   
        vac, res, true = batch["vac"], batch["res"], batch["label"]
        vac = convert_tuple_to_tensor(vac)
        res = convert_tuple_to_tensor(res)
        pred = torch.flatten(model(vac.to(DEVICE), res.to(DEVICE)).float())
        true = torch.stack(list([elem for elem in true]), dim=0).to(DEVICE).float()
        batch_loss = loss(pred, true)
        
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        
        total_train_loss += batch_loss * batch_size
        
        if num_batches == 1:
            print(f"Train loss after first batch: {total_train_loss}", end='\r')    
        
        if num_batches % verbose_every_n_batches == 0:
            print(f"Mean train loss on the last {verbose_every_n_batches} batches: {total_train_loss / verbose_every_n_batches};", end="\r")
            
        num_batches += 1
            
    print(f"Mean train loss after epoch: {total_train_loss / num_batches}")
    print(f"Total train loss after epoch: {total_train_loss}")
      


def eval_model(model, dataset, batch_size, collate_fn, 
               verbose_every_n_batches=300):
    torch_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
    preds, targets = [], []
    total_valid_loss = 0
    num_batches = 0

    model.eval()
    with torch.no_grad():
        for batch in torch_dataloader:
            vac, res, true = batch["vac"], batch["res"], batch["label"]
            vac = convert_tuple_to_tensor(vac)
            res = convert_tuple_to_tensor(res)
            pred = torch.flatten(model(vac.to(DEVICE), res.to(DEVICE)).float())
            true = torch.stack(list([elem for elem in true]), dim=0).to(DEVICE).float()
            preds.extend(pred.detach().cpu().numpy())
            targets.extend(true)
    return targets, preds

In [21]:

def clean_cuda_cache():
    start_available, reserved = torch.cuda.mem_get_info()
    torch.cuda.empty_cache()
    gc.collect()
    fin_available, reserved = torch.cuda.mem_get_info()
    print(f"cleaned {(fin_available - start_available) / 2**10} gb")
    print(f"available {fin_available / 2**10} gb")

In [22]:
def train_val_loop(
    model, optimizer, scheduler, 
    dataset_train, dataset_val, dataset_test,
    batch_size_train, batch_size_test, batch_size_val,
    num_epochs, train_shuffle, collate_fn, 
    verbose_every_n_batches, eval_on_train, 
    early_stopping_patience=7
):
    with open("logs_lstm.txt", 'w') as f:
        pass
    model = model.to(DEVICE)
    for n_epoch in tqdm(range(1, num_epochs + 1), desc="nums"):
        train_epoch(\
                    model=model, \
                    optimizer=optimizer, \
                    dataset=dataset_train, \
                    batch_size=batch_size_train, \
                    shuffle=train_shuffle, \
                    collate_fn=collate_fn, \
                    verbose_every_n_batches=verbose_every_n_batches
                  )
                    
        # clean cache before validation    
        clean_cuda_cache()     
        targets_val, preds_val = eval_model(
            model=model,
            dataset=dataset_val,
            batch_size=batch_size_val,
            collate_fn=collate_fn
        )
        # for logging while validating
        loss = torch.nn.BCELoss()
        # count common loss
        val_loss = loss(torch.tensor(preds_val).float(),torch.tensor(targets_val).float())
        
        with open("logs_lstm.txt", 'a') as f:
            f.write(f'epoch {n_epoch}. valid loss: {val_loss}\n')

        # torch.save(model.state_dict(), os.path.join(EXP_CHECKPOINTS_PATH, f"epoch_{n_epoch}_{datetime.now().strftime('%Y-%m-%d')}_{datetime.now().strftime('%H:%M:%S')}_testCC_{round(test_roc_auc_CC, 3)}_testPIL_{round(test_roc_auc_PIL, 3)}.pt"))             
        targets_test, preds_test = eval_model(
            model=model,
            dataset=dataset_test,
            batch_size=batch_size_test,
            collate_fn=collate_fn
        )
        
        # count common loss
        test_loss = loss(torch.tensor(preds_test).float(), torch.tensor(targets_test).float())
        scheduler.step(val_loss)
        if eval_on_train:
                targets_train, preds_train = eval_model(
                model=model,
                dataset=dataset_train,
                batch_size=batch_size_train,
                collate_fn=collate_fn
            )
            
                # count common loss
                train_loss = loss(torch.tensor(targets_train), torch.tensor(preds_train))
                



In [33]:

train_val_loop(
  model=model, optimizer=optimizer, scheduler=scheduler, 
  dataset_train=experement_parameters["dataset_train"], 
  dataset_test=experement_parameters["dataset_test"], 
  dataset_val=experement_parameters["dataset_val"],
  batch_size_train=experement_parameters["batch_size_train"],
  batch_size_val=experement_parameters["batch_size_val"], 
  batch_size_test=experement_parameters["batch_size_test"],
  num_epochs=experement_parameters["num_epochs"], 
  train_shuffle=experement_parameters["train_shuffle"], 
  collate_fn=experement_parameters["collate_fn"], 
  verbose_every_n_batches=experement_parameters["verbose_every_n_batches"],
  eval_on_train=False,
)


nums:   0%|                                                                                      | 0/5 [00:00<?, ?it/s]

Mean train loss after epoch: 0.2445689737796783440601158142;
Total train loss after epoch: 233.07424926757812
cleaned 270336.0 gb
available 3202048.0 gb



nums:  20%|██████████████▍                                                         | 1/5 [1:52:26<7:29:44, 6746.15s/it]

Mean train loss after epoch: 1.2201545587231521e-06505007e-06;;;
Total train loss after epoch: 0.001162807340733707
cleaned 339968.0 gb
available 3212288.0 gb



nums:  40%|████████████████████████████▊                                           | 2/5 [3:45:24<5:38:14, 6764.98s/it]

Mean train loss after epoch: 1.204613226946094e-07498051053e-07;
Total train loss after epoch: 0.0001147996517829597
cleaned 329728.0 gb
available 3212288.0 gb



nums:  60%|███████████████████████████████████████████▏                            | 3/5 [5:38:17<3:45:37, 6768.59s/it]

Mean train loss after epoch: 5.30376880192307e-081359567556e-07;
Total train loss after epoch: 5.0544920668471605e-05
cleaned 327680.0 gb
available 3212288.0 gb



nums:  80%|█████████████████████████████████████████████████████████▌              | 4/5 [7:31:17<1:52:53, 6773.19s/it]

Mean train loss after epoch: 1.8451861194535013e-070998164e-07;
Total train loss after epoch: 0.0001758462458383292
cleaned 327680.0 gb
available 3212288.0 gb


nums: 100%|██████████████████████████████████████████████████████████████████████████| 5/5 [9:24:18<00:00, 6771.75s/it]


In [50]:
import torch
torch.save(optimizer.state_dict(), 'optimizer.pt')
torch.save(model.state_dict(), 'model.pt')

In [51]:
clean_cuda_cache()   

cleaned 274432.0 gb
available 3214336.0 gb


In [24]:
def model_inference_one_vac(model, dataset, vac_text):
    '''
      Инференс на 1 вакансии. У вас функция будет отличаться, очевидно
    '''
    location = dataset[dataset['vac_des'] == vac_text]['City'].iloc[0]
    cluster = dataset[(dataset["City"] == location) | \
                    (dataset["microcat_name"] =="Удаленная работа") | \
                    (dataset['microcat_name'] == 'Вахтовый метод')]
    vac_title =  cluster[cluster['vac_des'] == vac_text]['vac_title'].iloc[0]
    similarities = []
    for res_text in cluster['res_des']:
        if res_text:
            vacc = torch.cat([get_embed(cluster[cluster['res_des'] == res_text]['res_title'].iloc[0]),\
                              get_embed(res_text)], dim=0).to(DEVICE)
            ress = torch.cat([get_embed(cluster[cluster['vac_des'] == vac_text]['vac_title'].iloc[0]), \
                              get_embed(vac_text)], dim=0).to(DEVICE)
            padd = convert_tuple_to_tensor1((vacc, ress))
            similarity = model(padd[0].unsqueeze(0).to(DEVICE), padd[1].unsqueeze(0).to(DEVICE))
            similarities.append(similarity.item())
        else:
            similarities.append(-1)
    return similarities, np.arange(cluster.shape[0])

def model_inference_dataset(model, dataset):
    '''
      Инференс на всех вакансиях в dataset
    '''
    preds = np.zeros((dataset.shape[0]))
    for vac in dataset['vac_des'].unique():
        model_pred, pred_ind = model_inference_one_vac(model, dataset, vac)
        preds[pred_ind] = model_pred

    return preds
def convert_tuple_to_tensor1(tensors):
    ns = list([tensor.shape[0] for tensor in tensors])
    tensor_list = [torch.randn(n, LEN_BERT).to(DEVICE) for n in ns]

    # определяем максимальную размерность
    max_size = max(ns)

    # добавляем нулевые строки к тензорам, чтобы они имели одинаковую размерность
    for i, elem in enumerate(tensors):
        tensor_list[i].to(DEVICE)
        if tensor_list[i].shape[0] < max_size:
            tensor_list[i] = torch.cat((elem, torch.zeros(max_size-tensor_list[i].shape[0], LEN_BERT).to(DEVICE)), dim=0)

    # объединяем тензоры по первой размерности
    return  torch.cat([tensor.unsqueeze(0).to(DEVICE) for tensor in tensor_list], dim=0).to(DEVICE)

In [27]:
model.to(DEVICE)

SiameseCVNet(
  (rnn): LSTM(768, 512, batch_first=True)
  (nn_head): Sequential(
    (0): Linear(in_features=7168, out_features=256, bias=True)
    (1): ReLU()
    (2): Linear(in_features=256, out_features=1, bias=True)
    (3): Sigmoid()
  )
)

In [25]:
from tqdm import tqdm
from sklearn.metrics import ndcg_score

def count_ndcg_for_all_resumes(model, df, k=None):
    '''
  ранжируем к вакансии вообще все резюме из кластера 
  это походит на боевые условия, но имхо, ждать тут высокого ndcg не стоит из-за разреженнности векторов
    '''
    if not k:
        k = df.shape[0]

    df = df.drop_duplicates(['vac_des', 'res_des'])
    df.index = range(df.shape[0])
    if df.shape[0] > 0:
        ndcg_vals = []
        for vac in tqdm(df['vac_des'].unique()):
            possible_res = df[df['vac_des'] == vac][['res_des', 'rank']]
            if possible_res.shape[0] < 3:
                continue

            true_rank_vector = np.zeros((df.shape[0]))
            true_rank_vector[possible_res.index] = possible_res['rank']

            model_preds = model_inference_dataset(model, df)

            if len(true_rank_vector) < k:
                ndcg_vals.append(ndcg_score([true_rank_vector], [model_preds]))
            else:
                ndcg_vals.append(ndcg_score([true_rank_vector[:k]], [model_preds[:k]]))

    return ndcg_vals
  
def count_ndcg_for_appropriate_resumes(model, df, k=None):
    '''
  ранжируем только подходящие к вакансии резюме
  то есть делаем предикт только для них и смотрим, как алгоритм умеет сортировать "хорошие" сэмпл
    '''
    if not k:
        k = float('inf')

    df = df.drop_duplicates(['vac_des', 'res_des'])
    df.index = range(df.shape[0])
    if df.shape[0] > 0:
        ndcg_vals = []
        for vac in tqdm(df['vac_des'].unique()):
            possible_res = df[df['vac_des'] == vac][['City', 'microcat_name', 'vac_des','vac_title','res_title', 'res_des', 'rank']]
            if possible_res.shape[0] < 3:
                continue

            true_rank_vector = possible_res['rank'].values
            model_probs = model_inference_dataset(model, possible_res)

            if len(true_rank_vector) < k:
                ndcg_vals.append(ndcg_score([true_rank_vector], [model_probs]))
            else:
                ndcg_vals.append(ndcg_score([true_rank_vector[:k]], [model_probs[:k]]))

    return ndcg_vals

In [None]:

mod = count_ndcg_for_appropriate_resumes(model, df.iloc[val_id], k=10)

In [76]:
model.fc1_input_one

2048

In [78]:
2 * (0+768+(1 + 1) * 512)*2

7168

In [93]:
for name, param in model.named_parameters():
    print(name, param.shape)

rnn.weight_ih_l0 torch.Size([2048, 768])
rnn.weight_hh_l0 torch.Size([2048, 512])
rnn.bias_ih_l0 torch.Size([2048])
rnn.bias_hh_l0 torch.Size([2048])
nn_head.0.weight torch.Size([256, 7168])
nn_head.0.bias torch.Size([256])
nn_head.2.weight torch.Size([128, 256])
nn_head.2.bias torch.Size([128])


In [25]:
sum(mod)/len(mod)

0.8291703082115134

In [28]:
test = count_ndcg_for_appropriate_resumes(model, df.iloc[test_id], k=10)
sum(test)/len(test)

100%|████████████████████████████████████████████████████████████████████████████| 11139/11139 [03:13<00:00, 57.58it/s]


0.8666537535464872