In [60]:
# add autoreload
%load_ext autoreload
%autoreload 2

import os
import sys
import regex
import clinlp
import medcat as mc
import gensim as gs
import spacy
import numpy as np


import sklearn as sk
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, multilabel_confusion_matrix

import pandas as pd

import seaborn as sns

from typing import List, Tuple
from collections import defaultdict


import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import FastText, Word2Vec, KeyedVectors
from sentence_transformers import SentenceTransformer
import gc
from numba import jit

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import echo_models, echo_utils, deabber
plt.style.use('ggplot')

from sklearn.svm import SVC
from xgboost import XGBClassifier

import pprint


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load data

In [3]:
os.chdir('T://lab_research/RES-Folder-UPOD/Echo_label/E_ResearchData/2_ResearchData')

In [4]:
os.listdir("./echo_doc_labels")

['aortic_regurgitation.jsonl',
 'aortic_stenosis.jsonl',
 'diastolic_dysfunction.jsonl',
 'lv_dil.jsonl',
 'lv_syst_func.jsonl',
 'merged_labels.jsonl',
 'mitral_regurgitation.jsonl',
 'old',
 'pe.jsonl',
 'rv_dil.jsonl',
 'rv_syst_func.jsonl',
 'tricuspid_regurgitation.jsonl',
 'wma.jsonl']

In [5]:
STOPWORDS = ['de', 'het', 'een', 'is', 'bij', 'van', 'met', 'en', 'in', 'voor']

In [6]:
Classes = ['lv_dil', 'pe', 'rv_dil', 'aortic_regurgitation', 
           'lv_syst_func', 'rv_syst_func', 'aortic_stenosis', 
           'diastolic_dysfunction', 'mitral_regurgitation',
           'tricuspid_regurgitation', 'wma']

Class = 'aortic_regurgitation'
lemmatize = True
lowercase = False
deabbreviate = False
filter_reports = True
reduce_labels = True
num_topics = 5
num_words_in_vocab = 5_000
weight_matrix = 'count' # count, tfidf, etm

model_TDIDF = True
model_ETM = False
model_SALT = False
SALT_METHOD = 'kmean'
model_embeddings_CUI = False
model_embeddings_W2V = False
model_embeddings_SBERT = False
EMBEDDING_AGGREGATOR = 'mean'
model_CLF = 'svm'
model_normaliser = 'standardize'


FLAG_TERMS = ['uitslag zie medische status', 'zie status', 'zie verslag status', 'slecht echovenster', 'echo overwegen', 'ge echo',
              'geen echovenster', 'geen beoordeelbaar echo', 'tee 190', 'hdf 36mnd', 'geen beoordeelbare echo', 'verslag op ic']
SAVE_TERMS = ['goed', 'geen', 'normaal', 'normale']

reduce_labels = True
REDUCED_LABELMAP = {
    'Present': 'Present',
    'No label': 'No label',
    'Normal': 'Normal',
    'Moderate': 'Present',
    'Severe': 'Present',
    'Mild': 'Present'
}


In [7]:
if deabbreviate:
    ABBREVIATIONS = benedict.benedict("../assets/abbreviations.yml")
    
if lemmatize:
    nlp = spacy.load("nl_core_news_lg", disable = ['parser','ner'])  

In [8]:
# make dictionary with labeled_documents
labeled_documents = pd.read_json(f"./echo_doc_labels/{Class}.jsonl", lines=True)
label_col = 'label' if Class!='merged_labels' else 'labels'

train_ids = pd.read_csv('./train_echoid.csv', sep=',').input_hash.unique()
test_ids = pd.read_csv('./test_echoid.csv', sep=',').input_hash.unique()

labeled_documents['_hash'] = labeled_documents.text.str.strip().apply(lambda x: hash(x))
labeled_documents = labeled_documents.drop_duplicates(subset=['_hash']).reset_index(drop=True)

if reduce_labels:
    labeled_documents['label'] = labeled_documents['label'].map(REDUCED_LABELMAP)

In [9]:
WordLists = labeled_documents.text.str.split(" ").tolist()
WordCount = defaultdict(int)
for d in WordLists:
    for t in d:
        WordCount[t.lower()] += 1

In [16]:
sorted(WordCount.items(), key=lambda x: x[1], reverse=True)[:10]

[('geen', 4886),
 ('goede', 4465),
 ('en', 3754),
 ('normale', 3703),
 ('met', 3170),
 ('lv', 2548),
 ('functie.', 2441),
 ('globaal', 2131),
 ('systolische', 1905),
 ('niet', 1832)]

In [17]:
# Expand with label columns
if Class == 'merged_labels':
    target_df = pd.DataFrame.from_records(labeled_documents[label_col])
    Target_maps = {
        _Class: {Label:i for i,Label in enumerate(target_df[Class].unique())}
        for _Class in target_df.columns
    }
else:
    Target_maps = {
        Class: {Label: i for i,Label in enumerate(labeled_documents['label'].unique())} 
    }
    
if Class == 'merged_labels':
    DF = labeled_documents[['text', '_input_hash']].join(target_df[Class])
else:
    DF = labeled_documents[['text', '_input_hash', 'label']]

DF.columns = ['sentence', '_input_hash', 'labels']

label2id = Target_maps[Class]
id2label = {v:k for k,v in label2id.items()}
num_labels = len(label2id)

DF= DF.assign(label=DF['labels'].map(label2id))

# Clean text

In [20]:
DF = DF.assign(sentence=DF.sentence.str.replace(r'[\r\n]', '', regex=True)) 

if lemmatize:
    print("Lemmatizing...")
    docs = nlp.pipe(DF.sentence.values)
    new_texts = [" ".join([token.lemma_ for token in doc]) for doc in docs] 
    DF = DF.assign(sentence = new_texts)

if lowercase:
    print("Lowercasing...")
    DF = DF.assign(sentence = DF.sentence.str.lower())
    
if filter_reports:
    print("Filtering...")
    DF = DF.assign(sentence = echo_utils.report_filter(DF.sentence, 
                                            flag_terms=FLAG_TERMS, 
                                            save_terms=SAVE_TERMS)[0])
    DF = DF.loc[DF.sentence.notna()]

if deabbreviate:
    print("Deabbreviate...")
    DeAbber = deabber.deabber(model_type='sbert', abbreviations=ABBREVIATIONS['nl']['echocardiogram'], min_sim=0.5, top_k=10)
    DF = DF.assign(sentence=DeAbber.deabb(DF.sentence.values, TokenRadius=3))

# Make folds

In [53]:
# Stratified cross-validation

def fold_indices(targets: pd.Series=None, stratified: bool=True, seed: int=42, numfolds: int=10)->Tuple[List,List]:
    if stratified:
        splitter = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = targets
    else:
        splitter = KFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = None

    train_indcs, test_indcs = [], []
    for train_index, test_index in splitter.split(X=targets, y=_Targets):
        train_indcs.append(train_index)
        test_indcs.append(test_index)

    return zip(train_indcs, test_indcs)

def make_folds(targets: pd.Series=None, 
               train_test: tuple=None, 
               n_folds: int=10, 
               stratified: bool=True,
               splitting: str='CV',
               label_col: str='labels',
               text_col: str='sentence'):

    TTDict = defaultdict(dict)
    if splitting == 'CV':
        for k,(train_index, test_index) in enumerate(fold_indices(targets=targets[label_col], 
                                                                  stratified=stratified,
                                                                  numfolds=n_folds)):
            TTDict[k]['Xtrain'] = targets.iloc[train_index][text_col]
            TTDict[k]['Xtest'] = targets.iloc[test_index][text_col]
            
            TTDict[k]['ytrain'] = targets.iloc[train_index][label_col]
            TTDict[k]['ytest'] = targets.iloc[test_index][label_col]
    else:
        train_ids, test_ids = train_test
        TTDict[0]['Xtrain'] = targets.loc[targets._input_hash.isin(train_ids)][text_col]
        TTDict[0]['Xtest'] = targets.loc[targets._input_hash.isin(test_ids)][text_col]
        
        TTDict[0]['ytrain'] = targets.loc[targets._input_hash.isin(train_ids)][label_col]
        TTDict[0]['ytest'] = targets.loc[targets._input_hash.isin(test_ids)][label_col]
    
    return TTDict

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(probs, labels, threshold=0.5):
    # next, use threshold to turn them into integer predictions
    y_pred = np.argmax(probs, axis=1)
    # finally, compute metrics
    
    y_true = labels
    #y_true = tf.keras.backend.eval(y_true)
    #y_pred = tf.keras.backend.eval(y_pred)
    
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    prec_macro = precision_score(y_true=y_true, y_pred=y_pred, average='macro')
    prec_weighted = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
    prec_micro = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_macro = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
    recall_weighted = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
    recall_micro = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    try:
        roc_auc_weighted = roc_auc_score(y_true, probs, average = 'weighted')
        roc_auc_macro = roc_auc_score(y_true, probs, average = 'macro')
        roc_auc_micro = roc_auc_score(y_true, probs, average = 'micro')
    except ValueError:
        roc_auc_weighted = None
        roc_auc_macro = None
        roc_auc_micro = None

    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1_macro': f1_macro,
               'f1_weighted': f1_weighted,
               'prec_macro': prec_macro,
               'prec_weighted': prec_weighted,
               'recall_macro': recall_macro,
               'recall_weighted': recall_weighted,
               'roc_auc_macro': roc_auc_macro,
               'roc_auc_weighted': roc_auc_weighted,
               'accuracy': accuracy}
    return metrics

In [24]:
TrainTestDict = make_folds(DF, 
                           (train_ids, test_ids), 
                           n_folds=10, 
                           stratified=True, 
                           splitting='from_file',
                           label_col='label')

# Get Topic models using LDA

In [25]:
from scipy import sparse

In [26]:
def ETM(tfidf_matrix, lda_theta, lda_beta):
    # beta_{k,j} : probability of word j, given topic k
    # theta_{i,k}: probability of topic k, given document i
    # tfidf_matrix
    
    # Get Gamma_i for all documents
    num_topics = lda_theta.shape[1]
    AverageSentenceLength = np.mean(np.sum(tfidf_matrix>0, axis=1))
    gammas=AverageSentenceLength/np.array(np.sum(tfidf_matrix>0, axis=1))[:,0]
    # replace inf-gammas with 0
    gammas[np.where(np.isinf(gammas))]=0
    
    # prep omega
    indices = tfidf_matrix.indices
    indptr = tfidf_matrix.indptr
    z = []
    
    for d,row in enumerate(tfidf_matrix):
        idcs = row.indices
        for w in idcs:
            omega = 0
            for k in range(num_topics):
                omega = omega + gammas[d]*lda_theta[d,k]*lda_beta[k, w]
            z.append(omega)
    omega_sparse = sparse.csr_matrix((z, indices, indptr), shape=tfidf_matrix.shape)
    return tfidf_matrix.multiply(omega_sparse)
    
def SALT(tfidf_matrix, cluster_object=None):
    # https://github.com/bagheria/saltclass/blob/master/saltclass/saltclass.py
    pass

In [38]:
# do for each TrainTestDict[k]['Xtrain]

xtrain = TrainTestDict[0]['Xtrain']
xtest = TrainTestDict[0]['Xtest']

LDA = LatentDirichletAllocation(n_components=num_topics)
TVEC =  CountVectorizer(
    max_df=0.95, min_df=2, max_features=num_words_in_vocab, stop_words=STOPWORDS
)
TFVEC = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=num_words_in_vocab, stop_words=STOPWORDS
)
TVEC.fit(xtrain)
train_tf = TVEC.transform(xtrain)
test_tf = TVEC.transform(xtest)
LDA.fit(train_tf)
lda_theta = LDA.transform(train_tf)
lda_theta_test = LDA.transform(test_tf)

lda_beta = LDA.components_ / LDA.components_.sum(axis=1)[:, np.newaxis]

TFVEC.fit(xtrain)
train_tfidf = TFVEC.transform(xtrain)
test_tfidf = TFVEC.transform(xtest)

train_weight_matrix = ETM(train_tfidf, lda_theta, lda_beta)
test_weight_matrix = ETM(test_tfidf, lda_theta_test, lda_beta)
gc.collect()



237

In [61]:
clf = SVC(probability=True)
clf.fit(train_weight_matrix, TrainTestDict[0]['ytrain'])

preds = clf.predict_proba(test_weight_matrix)
ytest = TrainTestDict[0]['ytest']

pprint.pp(multi_label_metrics(preds, ytest, threshold=0.5))

{'f1_macro': 0.7461156477320502,
 'f1_weighted': 0.7950138809283359,
 'prec_macro': 0.781427910359399,
 'prec_weighted': 0.7999731899968268,
 'recall_macro': 0.7236181839488484,
 'recall_weighted': 0.8019981834695731,
 'roc_auc_macro': None,
 'roc_auc_weighted': None,
 'accuracy': 0.8019981834695731}


In [64]:
clf = XGBClassifier(probability=True)
clf.fit(train_weight_matrix, TrainTestDict[0]['ytrain'])

preds = clf.predict_proba(test_weight_matrix)
ytest = TrainTestDict[0]['ytest']

pprint.pp(multi_label_metrics(preds, ytest, threshold=0.5))

{'f1_macro': 0.8872860396372437,
 'f1_weighted': 0.9182822657103805,
 'prec_macro': 0.8930873183677671,
 'prec_weighted': 0.9190109349459678,
 'recall_macro': 0.8830569393634798,
 'recall_weighted': 0.9191643960036331,
 'roc_auc_macro': None,
 'roc_auc_weighted': None,
 'accuracy': 0.9191643960036331}


In [65]:
clf = XGBClassifier(probability=True)
clf.fit(train_tfidf, TrainTestDict[0]['ytrain'])

preds = clf.predict_proba(test_tfidf)
ytest = TrainTestDict[0]['ytest']

pprint.pp(multi_label_metrics(preds, ytest, threshold=0.5))

{'f1_macro': 0.8875059593849306,
 'f1_weighted': 0.9200610193347314,
 'prec_macro': 0.8946266306253579,
 'prec_weighted': 0.9201617897118376,
 'recall_macro': 0.8813470294181535,
 'recall_weighted': 0.9209809264305178,
 'roc_auc_macro': None,
 'roc_auc_weighted': None,
 'accuracy': 0.9209809264305178}


## Assign Cui-embeddings to UMLS-entities with RotatE

In [None]:
# {'cui': emb}
Emb_path = r'T:\\laupodteam\AIOS\Bram\data\Supporting_Data\External_embeddings'
rotate_path = os.path.join(Emb_path, 'GraphEmbeddings', 'RotatE_entity2id.wv')
cui2vec_rotate = KeyedVectors.load_word2vec_format(rotate_path, binary=False)

In [None]:
cui2vec_rotate['C0411888']

# 

## Add Word2vec from pre-trained embeddings

In [None]:
# {'token': embedding}
# load clinical NLP embeddings, hear we are limited to fourgrams
StaticEmbedding = KeyedVectors.load('path_to_embeddings.wv')

## Add SBERT-embeddings

In [None]:
# jegormeister/robbert-v2-dutch-base-mqa-finetuned, 
# textgain/allnli-GroNLP-bert-base-dutch-cased,
# NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers
SentenceModel = SentenceTransformer('FremyCompany/BioLORD-2023-M')


# Aggregate per document using LDA, UMLS, Word2Vec, SBERT

## 