<h3>Векторизуем словарь Meddra</h3>

In [1]:
from vectorization import ConceptVectorizer

In [2]:
CV = ConceptVectorizer('cointegrated/rubert-tiny2', '../../Data/External/pt_rus.asc')

loading model...


Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
CV.fit_transform(mode='cls_token')

getting concept embeddings in cls_token mode...


In [4]:
CV.thesaurus_embeddings

tensor([[ 0.0047,  0.6057,  0.8048,  ...,  0.4303,  1.0715, -0.4888],
        [ 0.5371, -0.3121,  0.1231,  ...,  0.5167,  0.4018, -0.3266],
        [ 0.2913, -0.5215,  0.4901,  ...,  0.9543,  0.4253, -0.1851],
        ...,
        [ 0.0108,  0.0136,  0.9450,  ...,  0.1267,  1.0630, -0.8467],
        [ 0.1349, -0.0723,  0.2402,  ...,  0.2063,  0.3276, -0.7158],
        [ 0.2104, -0.1122,  0.4180,  ...,  0.0780,  0.4703, -0.8100]])

In [5]:
print(len(CV.thesaurus_embeddings))

23954


In [6]:
CV.vectorization_mode

'cls_token'

<h3>Создадим датасет RDR</h3>

In [7]:
import jsonlines
import numpy as np
from sklearn.model_selection import train_test_split

In [8]:
ds = []
with jsonlines.open('../../Data/Raw/medNorm_14012022.jsonlines') as reader:
    for obj in reader:
        ds.append(obj)

In [9]:
X_train, X_test = train_test_split(ds, test_size=0.33, random_state=42)

In [10]:
#выцепим фразы с нормализацией по Meddra без их контекста

train_phrases = []
train_concepts = []

test_phrases = []
test_concepts = []

for review in X_train:
    for ent in review['objects']['MedEntity']:
        if 'MedDRA' in ent.keys() and ent['MedDRA']!='':
            #try:
            train_concepts.append(CV.meddra_term_to_meddra_code[ent['MedDRA'].split('|')[0]])
            #except:
            #markup_errors+=1
            #continue
            train_phrases.append(ent['text'])
            
            
for review in X_test:
    for ent in review['objects']['MedEntity']:
        if 'MedDRA' in ent.keys() and ent['MedDRA']!='':
            #try:
            test_concepts.append(CV.meddra_term_to_meddra_code[ent['MedDRA'].split('|')[0]])
            #except:
            #    markup_errors+=1
            #    continue
            test_phrases.append(ent['text'])
            
print('Всего фраз в трейне: %s'%len(train_phrases))
print('Всего фраз в тесте: %s'%len(test_phrases))

print('Уникальных фраз в трейне: %s'%len(set(train_phrases)))
print('Уникальных фраз в тесте: %s'%len(set(test_phrases)))

#Посмотрим на статистику разбиения
print('%s концептов не входящих либо в трейн, либо в тест'%len(set.union(set(train_concepts), set(test_concepts)) - set.intersection(set(test_concepts), set(train_concepts))))
print('%s концептов, которые есть в тесте, но нет в трейне'%len(set(test_concepts) - set(train_concepts)))
print('%s концептов, которые есть в трейне, но нет в тесте'%len(set(train_concepts) - set(test_concepts)))

Всего фраз в трейне: 3476
Всего фраз в тесте: 1750
Уникальных фраз в трейне: 1204
Уникальных фраз в тесте: 750
162 концептов не входящих либо в трейн, либо в тест
53 концептов, которые есть в тесте, но нет в трейне
109 концептов, которые есть в трейне, но нет в тесте


In [11]:
from dataset import MedNormDataset

RDR_train = MedNormDataset(train_phrases, train_concepts, CV, use_cuda=True)
RDR_test = MedNormDataset(test_phrases, test_concepts, CV, use_cuda=True)

<h2>Сама модель</h2>

In [12]:
import torch.nn as nn
import torch.nn.functional as F
import torch
#Динамический импорт класса нужной PreTrained модели по автоконфигурации
from transformers import AutoConfig

#Все, что нужно указать
model_path = 'cointegrated/rubert-tiny2'

cfg = AutoConfig.from_pretrained(model_path)
ConfigModelClass = cfg.__class__
PreTrainedModelClassName = ConfigModelClass.__name__.replace('Config', 'PreTrainedModel')
ModelClassName = cfg.__class__.__name__.replace('Config', 'Model')
exec("from transformers import %s as PreTrainedModelClass"%PreTrainedModelClassName)
exec("from transformers import %s as ModelClass"%ModelClassName)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
#device = 'cpu'

def get_cls_token_emb(model_output):
    embeddings = model_output.last_hidden_state[:, 0, :]
    #embeddings = torch.nn.functional.normalize(embeddings, dim=1)
    return embeddings

normalized_embs = CV.thesaurus_embeddings.norm(dim=1)[:, None]
normalized_embs = CV.thesaurus_embeddings / torch.clamp(normalized_embs, min=1e-8)
normalized_embs = normalized_embs.transpose(0, 1)
normalized_embs = normalized_embs.to(device)

class Net(PreTrainedModelClass):
    def __init__(self, config: ConfigModelClass):
        super(Net, self).__init__(config)
        self.bert = ModelClass(config)

    def forward(self, x):
        emb = self.bert(**x)
        #with torch.no_grad():
        x = get_cls_token_emb(emb)
        #имеем две матрицы x - (batch_size, emb_size) и thesaurus_embeddings - (thesaurus_size, emb_size)
        #надо посчитать косинусную близость близость между каждым вектором x и каждым вложением из тезауруса
        #решение: https://stackoverflow.com/questions/50411191/how-to-compute-the-cosine-similarity-in-pytorch-for-all-rows-in-a-matrix-with-re
        x_n = x.norm(dim=1)[:, None] 
        x_n = x / torch.clamp(x_n, min=1e-8)
        #b_norm = CV.thesaurus_embeddings / torch.clamp(b_, min=1e-9)
        cos_sim = torch.mm(x_n, normalized_embs)
        x = F.softmax(cos_sim, dim=1)
        return x


net = Net.from_pretrained('cointegrated/rubert-tiny2', config=cfg)
net.to(device)
print('Net loaded')

#net2 = Net.from_pretrained('cointegrated/rubert-tiny2', config=cfg)
#net2.to(device)
#print('Net loaded')
#normalized_embs.to(device)

#for param in net.parameters():
#    print(param)

Some weights of the model checkpoint at cointegrated/rubert-tiny2 were not used when initializing Net: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing Net from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Net from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Net loaded


<h2>Обучение модели</h2>

In [13]:
import torch.optim as optim
import torch
import numpy as np

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(net.parameters())
#optimizer2 = optim.AdamW(net2.parameters())

In [14]:
scaler = torch.cuda.amp.GradScaler()

Для большей детерменированности

In [15]:
import os 

os.environ["CUBLAS_WORKSPACE_CONFIG"]=":16:8"
torch.use_deterministic_algorithms(mode=False)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False

In [16]:
from tqdm import trange
from tqdm import tqdm


batch_size=2
trainloader = torch.utils.data.DataLoader(RDR_train, batch_size=batch_size,
                                          shuffle=False, num_workers=0)

net.train()
for epoch in range(1, 2):
    with tqdm(trainloader, unit="batch") as tepoch:
        for data in tepoch:

            tepoch.set_description(f"Epoch {epoch}")

            inputs = data['tokenized_phrases']
            labels = data['one_hot_labels']

            optimizer.zero_grad()
            if device=='cuda':
                with torch.cuda.amp.autocast():
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = net(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            #чтобы посчитать accuracy нужно конвертнуть в one-hot
            
            #max_idx = torch.argmax(outputs, 1, keepdim=True)
            #one_hot = torch.FloatTensor(outputs.shape).to(device)
            #one_hot.zero_()
            #one_hot.scatter_(1, max_idx, 1)
            #correct = torch.all(torch.eq(labels, one_hot),  dim=1).sum().item()

            #correct = (one_hot == labels)
            #accuracy = correct / batch_size
            #tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)
            
print('Finished Training')

Epoch 1: 100%|██████████| 1738/1738 [00:42<00:00, 40.98batch/s]

Finished Training





<h2>Тест модели с cls_token</h2>

In [17]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:07<00:00, 242.88batch/s]


0.4257142857142857

<h2>Инференс</h2>

In [23]:
from random import randint

i = randint(1, len(RDR_test))

net.eval()

phrase = {k: tensor.unsqueeze(0) for k, tensor in RDR_test[i]['tokenized_phrases'].items()}
concept = RDR_test[i]['label_codes']

with torch.no_grad():
    model_answer = CV.meddra_codes[net(phrase).argmax()]
    


print('phrase: %s'%RDR_test[i]['phrases'])
print('model: %s'%CV.meddra_code_to_meddra_term[model_answer])



print('real: %s'%RDR_test[i]['label_terms'])

phrase: температурой 37,8
model: Ринорея
real: Пирексия


<h2>Сохранение и загрузка модели</h2>

In [24]:
torch.save(net, './cadec_SoTa_on_RDR_rubert_cls_tok.pt')
torch.save(optimizer.state_dict(), './cadec_SoTa_on_RDR_rubert_cls_tok_opt.pt')