<h3>Векторизуем словарь Meddra</h3>

In [1]:
from vectorization import ConceptVectorizer

In [2]:
#модели нам не нужны, ведь вложения получены в ячейке fit_transform() и сохранены, поэтому use_model = False
#чтобы получить вложения, надо вызвать CV с use_model = True и вызвать fit_transform в расчете вложений словаря
CV = ConceptVectorizer('DeepPavlov/rubert-base-cased', '../../Data/External/pt_rus.asc', \
                             use_concept_less=False, use_model=False)

Расчет вложений словаря

In [4]:
#используется только для рассчетов вложений словаря
#когда они готовы, проще их загрузить ячейкой с загрузкой
CV.fit_transform(mode='mean_pooling')

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


getting concept embeddings in mean_pooling mode...
Compute embeddings...
Embedding aggregation...


Сохранение рассчитанных вложений

In [None]:
import torch

torch.save(CV.thesaurus_embeddings, 'rubert_thesaurus_embeddings.pt')

Загрузка готовых вложений (если они есть). Загружать вложения после fit_transform() не надо

In [3]:
import torch

CV.thesaurus_embeddings = torch.load('rubert_thesaurus_embeddings.pt')
CV.normalization_mode = 'mean_pooling'

<h3>Создадим датасет RDR</h3>

In [4]:
import jsonlines
import numpy as np
from sklearn.model_selection import train_test_split

In [5]:
ds = []
with jsonlines.open('../../Data/Raw/medNorm_16022022.jsonlines') as reader:
    for obj in reader:
        ds.append(obj)

In [6]:
X_train, X_test = train_test_split(ds, test_size=0.33, random_state=42)

In [7]:
#выцепим фразы с нормализацией по Meddra без их контекста
train_phrases = []
train_sentences = []
train_concepts = []

test_phrases = []
test_sentences = []
test_concepts = []

test_phrases_without_conceptless = []
test_concepts_without_conceptless = []

log_markup_errors = []

#в трейне не будем собирать conceptless термины

USE_CONCEPT_LESS = False

for review in X_train:
    for ent in review['objects']['MedEntity']:
        if 'MedDRA' in ent.keys():
            if ent['MedDRA']=='':
                if USE_CONCEPT_LESS:
                    ent['MedDRA'] = 'CONCEPT_LESS'
                else:
                    continue
            try:
                train_concepts.append(CV.meddra_term_to_meddra_code[ent['MedDRA'].split('|')[0]])
                train_phrases.append(ent['text'])
            except KeyError:
                log_markup_errors.append({'review_id': review['meta']['fileName'], 'entity_id': ent['xmiID']})
                
for review in X_test:
    for ent in review['objects']['MedEntity']:
        if 'MedDRA' in ent.keys():
            if ent['MedDRA']=='':
                if USE_CONCEPT_LESS:
                    ent['MedDRA'] = 'CONCEPT_LESS'
                else:
                    continue
            try:
                test_concepts_without_conceptless.append(CV.meddra_term_to_meddra_code[ent['MedDRA'].split('|')[0]])
                test_phrases_without_conceptless.append(ent['text'])
            except KeyError:
                #log_markup_errors.append({'review_id': review['meta']['fileName'], 'entity_id': ent['xmiID']})
                pass
            
print('Всего фраз в трейне: %s'%len(train_phrases))
print('Всего фраз в тесте: %s'%len(test_phrases))

print('Уникальных фраз в трейне: %s'%len(set(train_phrases)))
print('Уникальных фраз в тесте: %s'%len(set(test_phrases)))

#Посмотрим на статистику разбиения
print('%s концептов не входящих либо в трейн, либо в тест'%len(set.union(set(train_concepts), set(test_concepts)) - set.intersection(set(test_concepts), set(train_concepts))))
print('%s концептов, которые есть в тесте, но нет в трейне'%len(set(test_concepts) - set(train_concepts)))
print('%s концептов, которые есть в трейне, но нет в тесте'%len(set(train_concepts) - set(test_concepts)))

Всего фраз в трейне: 8104
Всего фраз в тесте: 0
Уникальных фраз в трейне: 4365
Уникальных фраз в тесте: 0
784 концептов не входящих либо в трейн, либо в тест
0 концептов, которые есть в тесте, но нет в трейне
784 концептов, которые есть в трейне, но нет в тесте


In [None]:
from dataset import MedNormDataset

RDR_train = MedNormDataset(train_phrases, train_concepts, CV, use_cuda=True)
RDR_test = MedNormDataset(test_phrases, test_concepts, CV, use_cuda=True)

<h2>Импорт модели</h2>

In [7]:
from models import CADEC_SoTa


model_path = 'DeepPavlov/rubert-base-cased'
net = CADEC_SoTa(model_path, CV.thesaurus_embeddings) #, score_threshold=6.1977e-05
device = 'cuda' if torch.cuda.is_available() else 'cpu'
net.to(device)
print('Net loaded')

Some weights of the model checkpoint at DeepPavlov/rubert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Net loaded


Демонстрация работы с concept_less

In [8]:
from transformers import AutoTokenizer

net.eval()
phrase = 'сонливость'
model_path = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_path)
encoded_input = tokenizer([phrase], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    outputs_dict = net(encoded_input.to('cuda'))
    outputs_dict.label_concepless_tensors(score_treshold=6.1977e-05)
    pred_meddra_code = CV.meddra_codes[outputs_dict['output'].argmax()]
    
print('phrase: %s'%phrase)
print('model: %s'%CV.meddra_code_to_meddra_term[pred_meddra_code])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


phrase: сонливость
model: Аномалия специальных видов чувствительности врожденная


<h2>Обучение модели c логированием  wandb</h2>

In [11]:
USE_WANDB= False

In [12]:
import torch.optim as optim
import torch.nn as nn
import torch
import numpy as np
if USE_WANDB:
    import wandb
    wandb.login()
    
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(net.parameters(), lr=0.0001)

In [22]:
optimizer.defaults['lr']

0.0001

In [23]:
scaler = torch.cuda.amp.GradScaler()

Для большей детерменированности

In [24]:
import os 

os.environ["CUBLAS_WORKSPACE_CONFIG"]=":16:8"
torch.use_deterministic_algorithms(mode=False)
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False

In [25]:
from tqdm import trange
from tqdm import tqdm
from sklearn.metrics import classification_report, f1_score

batch_size=16
epochs = 1
if USE_WANDB:
    wandb.init(
      project="MedNormalization", 
      config={
      "learning_rate": optimizer.defaults['lr'],
      "batch_size": batch_size,
      "architecture": "CADEC_SoTa",
      "dataset": "RDR",
      "epochs": epochs,
      })
    
trainloader = torch.utils.data.DataLoader(RDR_train, batch_size=batch_size,
                                          shuffle=False, num_workers=0)
testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1, shuffle=False)

net.train()
initial_loss = None
for epoch in range(1, epochs):
    net.train()
    with tqdm(trainloader, unit="batch") as tepoch:
        for data in tepoch:

            tepoch.set_description(f"Epoch {epoch}")

            inputs = data['tokenized_phrases']
            labels = data['one_hot_labels']

            optimizer.zero_grad()
            if device=='cuda':
                with torch.cuda.amp.autocast():
                    outputs = net(inputs)['output']
                    loss = criterion(outputs, labels)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            else:
                outputs = net(inputs)['output']
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
            if initial_loss is None:
                initial_loss = loss.item()
            tepoch.set_postfix(loss_decrease = str(initial_loss/loss.item()))
    net.eval()
    model_answers=[]
    real_answers=[]
    with tqdm(testloader, unit="batch") as eval_process:
        for data in eval_process:

            #tepoch.set_description(f"Progress")

            inputs = data['tokenized_phrases']
            #labels = data['one_hot_labels']

            with torch.no_grad():
                outputs_dict = net(inputs)
                #outputs_dict.label_concepless_tensors(score_treshold = 6.1977e-05)
                pred_meddra_code = CV_test_without_conceptless.meddra_codes[outputs_dict['output'].argmax()]


            model_answers.append(pred_meddra_code)
            real_answers.append(data['label_codes'])

    print(f1_score(real_answers, model_answers, average='micro'))
            
print('Finished Training')

NameError: name 'RDR_train' is not defined

<h2>Тест модели с mean_pooling c CONCEPT_LESS</h2>

In [14]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        #labels = data['one_hot_labels']

        with torch.no_grad():
            outputs_dict = net(inputs)
            outputs_dict.label_concepless_tensors(score_treshold = 6.1977e-05)
            pred_meddra_code = CV_test.meddra_codes[outputs_dict['output'].argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 3999/3999 [00:41<00:00, 96.32batch/s] 


0.695173793448362

In [65]:
outputs_dict

{'output': tensor([[3.9106e-05, 4.1192e-05, 4.0345e-05,  ..., 4.0723e-05, 3.8689e-05,
         3.7171e-05]], device='cuda:0'), 'max_scores': tensor([8.9186e-05], device='cuda:0')}

Инференс

In [49]:
phrase = 'Хуй соси губой тряси ff'
model_path = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_path)
encoded_input = tokenizer([phrase], padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    outputs_dict = net(inputs)
    outputs_dict.label_concepless_tensors(score_treshold = 6.1977e-05)
    pred_meddra_code = CV.meddra_codes[outputs_dict['output'].argmax()]
    
print('phrase: %s'%phrase)
print('model: %s'%CV.meddra_code_to_meddra_term[pred_meddra_code])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


phrase: Хуй соси губой тряси ff
model: Гнев


<h2>Тест модели с mean_pooling</h2>

In [125]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        #labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1749/1749 [00:20<00:00, 83.56batch/s] 


0.8147512864493998

<h2>Тест модели с mean_pooling</h2>

In [15]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:17<00:00, 98.77batch/s] 


0.7994285714285714

In [19]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:21<00:00, 81.52batch/s]


0.72

In [32]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm


net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'][0])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:20<00:00, 87.42batch/s]


0.792

<h2> Тест необученной модели (на всякий случай) </h2>

In [25]:
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm

#net.train()
net.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)

with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:05<00:00, 294.15batch/s]


0.17142857142857143

<h2>Инференс</h2>

In [28]:
from random import randint

#i = randint(1, len(RDR_test))

net.eval()

#phrase = {k: tensor.unsqueeze(0) for k, tensor in RDR_test[i]['tokenized_phrases'].items()}
concept = RDR_test[i]['label_codes']

with torch.no_grad():
    model_answer = CV.meddra_codes[net(phrase).argmax()]
    


print('phrase: %s'%RDR_test[i]['phrases'])
print('model: %s'%CV.meddra_code_to_meddra_term[model_answer])



print('real: %s'%RDR_test[i]['label_terms'])

phrase: температурой 37,8
model: Пирексия
real: Пирексия


Где ошиблась модель

In [51]:
#net = torch.load('./cadec_SoTa_on_RDR_rubert_base_2_epoch.pt')['param_groups']

net.eval()

model_wrong_answers = []
gold_truth_answers = []
phrases = []

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)

with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:

        #tepoch.set_description(f"Progress")

        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[net(inputs).argmax()]
        
        if str(pred_meddra_code)!=str(data['label_codes'][0]):
            model_wrong_answers.append(pred_meddra_code)
            gold_truth_answers.append(data['label_codes'][0])
            phrases.append(data['phrases'][0])
        #model_wrong_answers
        #model_answers.append(pred_meddra_code)
        #real_answers.append(data['label_codes'])
        
import pandas as pd

df = pd.DataFrame(columns=['phrase', 'system output', 'gold markup'])

for phrase, m_a, g_a in zip(phrases, model_wrong_answers, gold_truth_answers):
    new_row = pd.DataFrame({'phrase': [phrase], 'system output': [m_a], 'gold markup': [g_a]})
    df = pd.concat([df, new_row], join='inner', ignore_index=True)
    #gold_truth_answers.append(g_a)
    
df

100%|██████████| 1750/1750 [00:19<00:00, 88.99batch/s]


Unnamed: 0,phrase,system output,gold markup
0,раздрожалась,10025482,10022998
1,чувство тревоги,10033670,10002855
2,угнетенном состоянии,10025482,10040007
3,Начала плохо спать,10022437,10062519
4,ухудшилось эмоциональное состояние,10061284,10014551
...,...,...,...
359,проблема с кожей на лице,10015150,10000496
360,шаткая нервная система,10029216,10003549
361,острой респираторной вирусной инфекции,10074831,10062352
362,с гнойно-слизистым секретом,10023848,10039083


In [59]:
df.to_csv('rubert_base_RDR_wrong.csv', index=False)

In [74]:
CV.meddra_code_to_meddra_term['CONCEPT_LESS'] = 'CONCEPT_LESS'

In [89]:
import pandas as pd

df = pd.read_csv('elastic_wrong_predictions.csv')
df['system concept'] = df.apply(lambda x: CV.meddra_code_to_meddra_term[str(x['system output'])], axis=1)#], join='outer', ignore_index=True)
df['gold concept'] = df.apply(lambda x: CV.meddra_code_to_meddra_term[str(x['gold markup'])], axis=1)
df

Unnamed: 0,phrase,system output,gold markup,system concept,gold concept
0,профилактики,10036898,10036898,Профилактика,Профилактика
1,для профилактики,10036898,10036898,Профилактика,Профилактика
2,профилактики,10036898,10036898,Профилактика,Профилактика
3,профилактики,10036898,10036898,Профилактика,Профилактика
4,стрессы,10042209,10042209,Стресс,Стресс
...,...,...,...,...,...
1745,тревожности,CONCEPT_LESS,10033670,CONCEPT_LESS,Паническая реакция
1746,ослабленного иммунитета,CONCEPT_LESS,10021425,CONCEPT_LESS,Нарушение со стороны иммунной системы
1747,чихал,CONCEPT_LESS,10041232,CONCEPT_LESS,Чихание
1748,кашлял,CONCEPT_LESS,10011224,CONCEPT_LESS,Кашель


In [95]:
df_wrong = df[df['system output'].apply(lambda x: int(x) if x!='CONCEPT_LESS' else 'CONCEPT_LESS')!=df['gold markup']]

In [94]:
df_wrong[df_wrong['system output']!='CONCEPT_LESS']

Unnamed: 0,phrase,system output,gold markup,system concept,gold concept


In [96]:
df_wrong.to_csv('elastic_RDR_wrong.csv', index=False)

In [80]:
df['system output']

0       True
1       True
2       True
3       True
4       True
        ... 
1745    True
1746    True
1747    True
1748    True
1749    True
Length: 1750, dtype: bool

In [82]:
df['system output']

0           10036898
1           10036898
2           10036898
3           10036898
4           10042209
            ...     
1745    CONCEPT_LESS
1746    CONCEPT_LESS
1747    CONCEPT_LESS
1748    CONCEPT_LESS
1749        10005911
Name: system output, Length: 1750, dtype: object

In [81]:
df['gold markup']

0       10036898
1       10036898
2       10036898
3       10036898
4       10042209
          ...   
1745    10033670
1746    10021425
1747    10041232
1748    10011224
1749    10005911
Name: gold markup, Length: 1750, dtype: int64

In [71]:
df.to_csv('rubert_base_wrong_preds_RDR.csv', index=False)

In [44]:
df.drop('Unnamed: 0', axis=1)

Unnamed: 0,system output,gold markup
0,10025482,10022998
1,10033670,10002855
2,10025482,10040007
3,10022437,10062519
4,10061284,10014551
...,...,...
359,10015150,10000496
360,10029216,10003549
361,10074831,10062352
362,10023848,10039083


In [33]:
df = pd.DataFrame(columns=['system output', 'gold markup'])

for m_a, g_a in zip(model_wrong_answers, gold_truth_answers):
    new_row = pd.DataFrame({'system output': [m_a], 'gold markup': [g_a]})
    df = pd.concat([df, new_row], join='inner', ignore_index=True)
    #gold_truth_answers.append(g_a)
    
df

Unnamed: 0,system output,gold markup
0,10036898,10036898
1,10036898,10036898
2,10036898,10036898
3,10036898,10036898
4,10042209,10042209
...,...,...
1745,10033670,10033670
1746,10021425,10021425
1747,10041232,10041232
1748,10011224,10011224


In [19]:
net.keys()

dict_keys(['state', 'param_groups'])

<h2>Сохранение и загрузка модели</h2>

In [27]:
torch.save(net, './cadec_SoTa_on_RDR_rubert_base_2_epoch.pt')
torch.save(optimizer.state_dict(), './cadec_SoTa_on_RDR_rubert_base_2_epoch_opt.pt')

In [30]:
the_model = torch.load('./cadec_SoTa_on_RDR_rubert_right_exp.pt')

Покажем, что это ТА ЖЕ модель

In [32]:
the_model.eval()

model_answers=[]
real_answers=[]

testloader = torch.utils.data.DataLoader(RDR_test, batch_size=1,
                                          shuffle=False)


with tqdm(testloader, unit="batch") as eval_process:
    for data in eval_process:


        inputs = data['tokenized_phrases']
        labels = data['one_hot_labels']

        with torch.no_grad():
            pred_meddra_code = CV.meddra_codes[the_model(inputs).argmax()]


        model_answers.append(pred_meddra_code)
        real_answers.append(data['label_codes'])

f1_score(real_answers, model_answers, average='micro')

100%|██████████| 1750/1750 [00:06<00:00, 259.93batch/s]


0.5891428571428572