In [1]:
# add autoreload
%load_ext autoreload
%autoreload 2

import os
import sys
import regex
import clinlp
#import medcat as mc
import gensim as gs
import spacy
import numpy as np
import dotenv

import sklearn as sk
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, roc_auc_score, multilabel_confusion_matrix

import pandas as pd

import seaborn as sns

from typing import List, Tuple
from collections import defaultdict


import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import FastText, Word2Vec, KeyedVectors

import gc
#from numba import jit

sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import echo_models, echo_utils, deabber
plt.style.use('ggplot')

from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from xgboost import XGBClassifier

import pprint
import benedict

dotenv.load_dotenv("../.env")
EmbeddingPath = os.environ['WORD_EMBEDDINGS']




# Load data

In [2]:
STOPWORDS = ['de', 'het', 'een', 'is', 'bij', 'van', 'met', 'en', 'in', 'voor']

In [346]:
Classes = ['lv_dil', 'pe', 'rv_dil', 'aortic_regurgitation', 
           'lv_syst_func', 'rv_syst_func', 'aortic_stenosis', 
           'diastolic_dysfunction', 'mitral_regurgitation',
           'tricuspid_regurgitation', 'wma']

Class = 'aortic_regurgitation'
lemmatize = True
lowercase = False
deabbreviate = False
filter_reports = True
remove_interpunction = True
reduce_labels = True
num_topics = 20 # 10, 20
num_words_in_vocab = 5_000

model_TDIDF = True
model_ETM = True
model_LDA = True
model_embeddings= False
EMBEDDING_AGGREGATOR = 'mean'

FLAG_TERMS = ['uitslag zie medische status', 'zie status', 'zie verslag status', 'slecht echovenster', 'echo overwegen', 'ge echo',
              'geen echovenster', 'geen beoordeelbaar echo', 'tee 190', 'hdf 36mnd', 'geen beoordeelbare echo', 'verslag op ic']
SAVE_TERMS = ['goed', 'geen', 'normaal', 'normale']

REDUCED_LABELMAP = {
    'Present': 'Present',
    'No label': 'No label',
    'Normal': 'Normal',
    'Moderate': 'Present',
    'Severe': 'Present',
    'Mild': 'Present'
}


In [347]:
if deabbreviate:
    ABBREVIATIONS = benedict.benedict("../assets/abbreviations.yml")
    
if lemmatize:
    nlp = spacy.load("nl_core_news_lg", disable = ['parser','ner'])  

In [348]:
os.chdir('T://lab_research/RES-Folder-UPOD/Echo_label/E_ResearchData/2_ResearchData')

os.listdir("./echo_doc_labels")

['aortic_regurgitation.jsonl',
 'aortic_stenosis.jsonl',
 'diastolic_dysfunction.jsonl',
 'lv_dil.jsonl',
 'lv_syst_func.jsonl',
 'merged_labels.jsonl',
 'mitral_regurgitation.jsonl',
 'old',
 'pe.jsonl',
 'rv_dil.jsonl',
 'rv_syst_func.jsonl',
 'tricuspid_regurgitation.jsonl',
 'wma.jsonl']

In [349]:
# make dictionary with labeled_documents
labeled_documents = pd.read_json(f"./echo_doc_labels/{Class}.jsonl", lines=True)
label_col = 'label' if Class!='merged_labels' else 'labels'

train_ids = pd.read_csv('./train_echoid.csv', sep=',').input_hash.unique()
test_ids = pd.read_csv('./test_echoid.csv', sep=',').input_hash.unique()

labeled_documents['_hash'] = labeled_documents.text.str.strip().apply(lambda x: hash(x))
labeled_documents = labeled_documents.drop_duplicates(subset=['_hash']).reset_index(drop=True)

if reduce_labels:
    labeled_documents['label'] = labeled_documents['label'].map(REDUCED_LABELMAP)

In [350]:
WordLists = labeled_documents.text.str.split(" ").tolist()
WordCount = defaultdict(int)
for d in WordLists:
    for t in d:
        WordCount[t.lower()] += 1

In [351]:
sorted(WordCount.items(), key=lambda x: x[1], reverse=True)[:10]

[('geen', 4886),
 ('goede', 4465),
 ('en', 3754),
 ('normale', 3703),
 ('met', 3170),
 ('lv', 2548),
 ('functie.', 2441),
 ('globaal', 2131),
 ('systolische', 1905),
 ('niet', 1832)]

In [352]:
# Expand with label columns
if Class == 'merged_labels':
    target_df = pd.DataFrame.from_records(labeled_documents[label_col])
    Target_maps = {
        _Class: {Label:i for i,Label in enumerate(target_df[Class].unique())}
        for _Class in target_df.columns
    }
else:
    Target_maps = {
        Class: {Label: i for i,Label in enumerate(labeled_documents['label'].unique())} 
    }
    
if Class == 'merged_labels':
    DF = labeled_documents[['text', '_input_hash']].join(target_df[Class])
else:
    DF = labeled_documents[['text', '_input_hash', 'label']]

DF.columns = ['sentence', '_input_hash', 'labels']

label2id = Target_maps[Class]
id2label = {v:k for k,v in label2id.items()}
num_labels = len(label2id)

DF= DF.assign(label=DF['labels'].map(label2id))

# Clean text

In [353]:
DF = DF.assign(sentence=DF.sentence.str.replace(r'[\r\n]', '', regex=True)) 

if lemmatize:
    print("Lemmatizing...")
    docs = nlp.pipe(DF.sentence.values)
    new_texts = [" ".join([token.lemma_ for token in doc]) for doc in docs] 
    DF = DF.assign(sentence = new_texts)

if lowercase:
    print("Lowercasing...")
    DF = DF.assign(sentence = DF.sentence.str.lower())

if remove_interpunction:
    print("Removing interpunction")
    DF = DF.assign(sentence = DF.sentence.str.replace(r'([A-Z])[\.]([A-Z])', '\\1\\2', regex=True))
    
if filter_reports:
    print("Filtering...")
    DF = DF.assign(sentence = echo_utils.report_filter(DF.sentence, 
                                            flag_terms=FLAG_TERMS, 
                                            save_terms=SAVE_TERMS)[0])
    DF = DF.loc[DF.sentence.notna()]

if deabbreviate:
    print("Deabbreviate...")
    DeAbber = deabber.deabber(model_type='sbert', abbreviations=ABBREVIATIONS['nl']['echocardiogram'], min_sim=0.5, top_k=10)
    DF = DF.assign(sentence=DeAbber.deabb(DF.sentence.values, TokenRadius=3))

Lemmatizing...
Removing interpunction
Filtering...


# Make folds

In [354]:
# Stratified cross-validation

def fold_indices(targets: pd.Series=None, stratified: bool=True, seed: int=42, numfolds: int=10)->Tuple[List,List]:
    if stratified:
        splitter = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = targets
    else:
        splitter = KFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = None

    train_indcs, test_indcs = [], []
    for train_index, test_index in splitter.split(X=targets, y=_Targets):
        train_indcs.append(train_index)
        test_indcs.append(test_index)

    return zip(train_indcs, test_indcs)

def make_folds(targets: pd.Series=None, 
               train_test: tuple=None, 
               n_folds: int=10, 
               stratified: bool=True,
               splitting: str='CV',
               label_col: str='labels',
               text_col: str='sentence'):

    TTDict = defaultdict(dict)
    if splitting == 'CV':
        for k,(train_index, test_index) in enumerate(fold_indices(targets=targets[label_col], 
                                                                  stratified=stratified,
                                                                  numfolds=n_folds)):
            TTDict[k]['Xtrain'] = targets.iloc[train_index][text_col]
            TTDict[k]['Xtest'] = targets.iloc[test_index][text_col]
            
            TTDict[k]['ytrain'] = targets.iloc[train_index][label_col]
            TTDict[k]['ytest'] = targets.iloc[test_index][label_col]
    else:
        train_ids, test_ids = train_test
        TTDict[0]['Xtrain'] = targets.loc[targets._input_hash.isin(train_ids)][text_col]
        TTDict[0]['Xtest'] = targets.loc[targets._input_hash.isin(test_ids)][text_col]
        
        TTDict[0]['ytrain'] = targets.loc[targets._input_hash.isin(train_ids)][label_col]
        TTDict[0]['ytest'] = targets.loc[targets._input_hash.isin(test_ids)][label_col]
    
    return TTDict

# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(probs, labels, threshold=0.5):
    # next, use threshold to turn them into integer predictions
    y_pred = np.argmax(probs, axis=1)
    # finally, compute metrics
    
    y_true = labels
    #y_true = tf.keras.backend.eval(y_true)
    #y_pred = tf.keras.backend.eval(y_pred)
    
    f1_macro = f1_score(y_true=y_true, y_pred=y_pred, average='macro')
    f1_weighted = f1_score(y_true=y_true, y_pred=y_pred, average='weighted')
    f1_micro = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    prec_macro = precision_score(y_true=y_true, y_pred=y_pred, average='macro')
    prec_weighted = precision_score(y_true=y_true, y_pred=y_pred, average='weighted')
    prec_micro = precision_score(y_true=y_true, y_pred=y_pred, average='micro')
    recall_macro = recall_score(y_true=y_true, y_pred=y_pred, average='macro')
    recall_weighted = recall_score(y_true=y_true, y_pred=y_pred, average='weighted')
    recall_micro = recall_score(y_true=y_true, y_pred=y_pred, average='micro')
    try:
        roc_auc_weighted = roc_auc_score(y_true, probs, average = 'weighted')
        roc_auc_macro = roc_auc_score(y_true, probs, average = 'macro')
        roc_auc_micro = roc_auc_score(y_true, probs, average = 'micro')
    except ValueError:
        roc_auc_weighted = None
        roc_auc_macro = None
        roc_auc_micro = None

    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1_macro': f1_macro,
               'f1_weighted': f1_weighted,
               'prec_macro': prec_macro,
               'prec_weighted': prec_weighted,
               'recall_macro': recall_macro,
               'recall_weighted': recall_weighted,
               'roc_auc_macro': roc_auc_macro,
               'roc_auc_weighted': roc_auc_weighted,
               'accuracy': accuracy}
    return metrics

In [355]:
TrainTestDict = make_folds(DF, 
                           (train_ids, test_ids), 
                           n_folds=10, 
                           stratified=True, 
                           splitting='from_file',
                           label_col='label')

# Get Topic models using LDA

In [356]:
from scipy import sparse

In [357]:
if model_embeddings:
    Text2Vecs = echo_utils.TextToVectors(source='cardio_wv',
                                     embedding_path=EmbeddingPath)

In [358]:
# do for each TrainTestDict[k]['Xtrain]
DataVersions = dict()

xtrain = TrainTestDict[0]['Xtrain']
xtest = TrainTestDict[0]['Xtest']


TFVEC = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=num_words_in_vocab, stop_words=STOPWORDS, lowercase=lowercase
)

print("TFIDF extraction")
TFVEC.fit(xtrain)
tokenid2word =  {v:k for k,v in TFVEC.vocabulary_.items()}
xmatrix_train = TFVEC.transform(xtrain)
xmatrix_test = TFVEC.transform(xtest)


DataVersions['tfidf'] = {
    'train': xmatrix_train,
    'test': xmatrix_test
}

if model_ETM | model_LDA:
    print("Latent DA")
    LDA = LatentDirichletAllocation(n_components=num_topics)
    LDA.fit(xmatrix_train)

    lda_theta = LDA.transform(xmatrix_train)
    lda_theta_test = LDA.transform(xmatrix_test)
    lda_beta = LDA.components_ / LDA.components_.sum(axis=1)[:, np.newaxis]

    if model_ETM:
        print("ETM")
        xmatrix_train = echo_utils.ETM(xmatrix_train, lda_theta, lda_beta)
        xmatrix_test = echo_utils.ETM(xmatrix_test, lda_theta_test, lda_beta)

        DataVersions['tfidf_ETM'] = {
            'train': xmatrix_train,
            'test': xmatrix_test
        }
        str_ETM = "_ETM"

    if model_LDA:
        print("Add LDA betas")
        train_weight_matrix_lil = xmatrix_train.tolil()
        num_cols = train_weight_matrix_lil.shape[1]
        train_weight_matrix_lil.resize((train_weight_matrix_lil.shape[0], 
                                        num_cols+num_topics))
        
        for i in range(train_weight_matrix_lil.shape[0]):
            for j in range(num_topics):
                train_weight_matrix_lil[i, num_cols+j] = lda_theta[i][j]
        xmatrix_train = train_weight_matrix_lil.tocsr()

        test_weight_matrix_lil = xmatrix_test.tolil()
        num_cols = test_weight_matrix_lil.shape[1]
        test_weight_matrix_lil.resize((test_weight_matrix_lil.shape[0], 
                                        num_cols+num_topics))
        
        for i in range(test_weight_matrix_lil.shape[0]):
            for j in range(num_topics):
                test_weight_matrix_lil[i, num_cols+j] = lda_theta_test[i][j]
        xmatrix_test = test_weight_matrix_lil.tocsr()

        DataVersions[f'tfidf{str_ETM}_LDA'] = {
            'train': xmatrix_train,
            'test': xmatrix_test
        }
        DataVersions['LDA'] = {
            'train': lda_theta,
            'test': lda_theta_test
        }        
        
        prior_str = f"tfidf{str_ETM}_LDA"
        
    if model_embeddings:
        xmatrix_train_emb = Text2Vecs.vector_aggregation_LIL(xtrain)
        xmatrix_test_emb = Text2Vecs.vector_aggregation_LIL(xtest)                 

        test_weight_matrix_lil = xmatrix_test.tolil()
        num_cols = test_weight_matrix_lil.shape[1]
        test_weight_matrix_lil.resize((test_weight_matrix_lil.shape[0], 
                                        num_cols+xmatrix_test_emb.shape[1]))

        for i in range(xmatrix_test.shape[0]):
            for j in range(xmatrix_test_emb.shape[1]):
                test_weight_matrix_lil[i, num_cols+j] = xmatrix_test_emb[i,j]
        xmatrix_test = test_weight_matrix_lil.tocsr()

        train_weight_matrix_lil = xmatrix_train.tolil()
        num_cols = train_weight_matrix_lil.shape[1]
        train_weight_matrix_lil.resize((train_weight_matrix_lil.shape[0], 
                                        num_cols+xmatrix_train_emb.shape[1]))

        for i in range(xmatrix_train.shape[0]):
            for j in range(xmatrix_train_emb.shape[1]):
                train_weight_matrix_lil[i, num_cols+j] = xmatrix_train_emb[i,j]
        xmatrix_train = train_weight_matrix_lil.tocsr() 
       
        DataVersions[f"{prior_str}_EMBS"] = {
            'train': xmatrix_train,
            'test': xmatrix_test
        }
        
        DataVersions["EMBS"] = {
            'train': xmatrix_train_emb,
            'test': xmatrix_test_emb
        }
        
       
gc.collect()


TFIDF extraction
Latent DA
ETM
Add LDA betas


566

In [359]:
for k,v in DataVersions.items():
    print(k)
    
    clf = XGBClassifier(seed=42, n_estimators=150, max_depth=5, learning_rate=1e-1)
    clf.fit(v['train'], TrainTestDict[0]['ytrain'])

    preds = clf.predict_proba(v['test'])
    ytest = TrainTestDict[0]['ytest']

    DataVersions[k].update({"results": multi_label_metrics(preds,
                                                           ytest, 
                                                           threshold=0.5)})

tfidf


tfidf_ETM
tfidf_ETM_LDA
LDA


In [360]:
for k,v in DataVersions.items():
    print(f"Results for {k}")
    res_string = f"F1 {round(v['results']['f1_weighted'], 3)} ({round(v['results']['f1_macro'], 3)})"
    res_string += f", REC {round(v['results']['recall_weighted'], 3)} ({round(v['results']['recall_macro'], 3)})"
    res_string += f", PREC {round(v['results']['prec_weighted'], 3)} ({round(v['results']['prec_macro'], 3)})"
    #pprint.pp(f"F1 {round(v['results']['f1_weighted'], 3)} ({round(v['results']['f1_macro'],3)})")
    #pprint.pp(f"REC {round(v['results']['recall_weighted'],3)} ({round(v['results']['recall_macro'], 3)})")
    #pprint.pp(f"PREC {round(v['results']['prec_weighted'],3)} ({round(v['results']['prec_macro'], 3)})")
    print(res_string)
    print(30*"+")

Results for tfidf
F1 0.917 (0.882), REC 0.918 (0.875), PREC 0.918 (0.891)
++++++++++++++++++++++++++++++
Results for tfidf_ETM
F1 0.918 (0.885), REC 0.919 (0.883), PREC 0.92 (0.89)
++++++++++++++++++++++++++++++
Results for tfidf_ETM_LDA
F1 0.921 (0.892), REC 0.922 (0.886), PREC 0.922 (0.899)
++++++++++++++++++++++++++++++
Results for LDA
F1 0.604 (0.461), REC 0.637 (0.462), PREC 0.61 (0.552)
++++++++++++++++++++++++++++++


## 