In [166]:
import os

import numpy as np

import regex
import clinlp
import medcat as mc
import gensim as gs
import spacy

import sklearn as sk
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.decomposition import LatentDirichletAllocation, NMF, PCA
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import pandas as pd

import seaborn as sns

from typing import List, Tuple
from collections import defaultdict


import matplotlib.pyplot as plt
import seaborn as sns

from gensim.models import FastText, Word2Vec, KeyedVectors
from sentence_transformers import SentenceTransformer
import gc
from numba import jit

# Load data

In [2]:
os.chdir('T://lab_research/RES-Folder-UPOD/Echo_label/E_ResearchData/2_ResearchData')

In [3]:
os.listdir("./echo_doc_labels")

['aortic_regurgitation.jsonl',
 'aortic_stenosis.jsonl',
 'diastolic_dysfunction.jsonl',
 'lv_dil.jsonl',
 'lv_syst_func.jsonl',
 'merged_labels.jsonl',
 'mitral_regurgitation.jsonl',
 'pe.jsonl',
 'rv_dil.jsonl',
 'rv_syst_func.jsonl',
 'tricuspid_regurgitation.jsonl',
 'wma.jsonl']

In [57]:
STOPWORDS = ['de', 'het', 'een', 'is', 'bij', 'van', 'met', 'en', 'in', 'voor']

In [40]:
Class = 'aortic_regurgitation'
lemmatize = True
lowercase = False
num_topics = 5
num_words_in_vocab = 5_000
weight_matrix = 'count' # count, tfidf, etm

In [32]:
plt.style.use('ggplot')
if lemmatize:
    nlp = spacy.load("nl_core_news_lg", disable = ['parser','ner'])    

In [12]:
labeled_documents = pd.read_json(f"./echo_doc_labels/{Class}.jsonl", lines=True)
label_col = 'label' if Class!='merged_labels' else 'labels'

train_ids = pd.read_csv('./train_echoid.csv', sep=',').input_hash.unique()
test_ids = pd.read_csv('./test_echoid.csv', sep=',').input_hash.unique()

labeled_documents['_hash'] = labeled_documents.text.str.strip().apply(lambda x: hash(x))
labeled_documents = labeled_documents.drop_duplicates(subset=['_hash']).reset_index(drop=True)


In [None]:
WordLists = labeled_documents.text.str.split(" ").tolist()
WordCount = defaultdict(int)
for d in WordLists:
    for t in d:
        WordCount[t.lower()] += 1

In [15]:
# Expand with label columns
if Class == 'merged_labels':
    target_df = pd.DataFrame.from_records(labeled_documents[label_col])
    Target_maps = {
        _Class: {Label:i for i,Label in enumerate(target_df[Class].unique())}
        for _Class in target_df.columns
    }
else:
    Target_maps = {
        Class: {Label: i for i,Label in enumerate(labeled_documents['label'].unique())} 
    }
    
if Class == 'merged_labels':
    DF = labeled_documents[['text', '_input_hash']].join(target_df[Class])
else:
    DF = labeled_documents[['text', '_input_hash', 'label']]

DF.columns = ['sentence', '_input_hash', 'labels']

label2id = Target_maps[Class]
id2label = {v:k for k,v in label2id.items()}
num_labels = len(label2id)

DF= DF.assign(label=DF['labels'].map(label2id))

# Clean text

In [None]:
DF = DF.assign(sentence=DF.sentence.str.replace(r'[\r\n]', '', regex=True)) 

if lemmatize:
    docs = nlp.pipe(DF.sentence.values)
    new_texts = [" ".join([token.lemma_ for token in doc]) for doc in docs] 
    DF = DF.assign(sentence = new_texts)

if lowercase:
    DF = DF.assign(sentence = DF.sentence.str.lower())

# Make folds

In [22]:
# Stratified cross-validation

def fold_indices(targets: pd.Series=None, stratified: bool=True, seed: int=42, numfolds: int=10)->Tuple[List,List]:
    if stratified:
        splitter = StratifiedKFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = targets
    else:
        splitter = KFold(n_splits=numfolds, shuffle=True, random_state=seed)
        _Targets = None

    train_indcs, test_indcs = [], []
    for train_index, test_index in splitter.split(X=targets, y=_Targets):
        train_indcs.append(train_index)
        test_indcs.append(test_index)

    return zip(train_indcs, test_indcs)

def make_folds(targets: pd.Series=None, 
               train_test: tuple=None, 
               n_folds: int=10, 
               stratified: bool=True,
               splitting: str='CV',
               label_col: str='labels',
               text_col: str='sentence'):

    TTDict = defaultdict(dict)
    if splitting == 'CV':
        for k,(train_index, test_index) in enumerate(fold_indices(targets=targets[label_col], 
                                                                  stratified=stratified,
                                                                  numfolds=n_folds)):
            TTDict[k]['Xtrain'] = targets.iloc[train_index][text_col]
            TTDict[k]['Xtest'] = targets.iloc[test_index][text_col]
            
            TTDict[k]['ytrain'] = targets.iloc[train_index][label_col]
            TTDict[k]['ytest'] = targets.iloc[test_index][label_col]
    else:
        train_ids, test_ids = train_test
        TTDict[0]['Xtrain'] = targets.loc[targets._input_hash.isin(train_ids)][text_col]
        TTDict[0]['Xtest'] = targets.loc[targets._input_hash.isin(test_ids)][text_col]
        
        TTDict[0]['ytrain'] = targets.loc[targets._input_hash.isin(train_ids)][label_col]
        TTDict[0]['ytest'] = targets.loc[targets._input_hash.isin(test_ids)][label_col]
    
    return TTDict
    

In [29]:
TrainTestDict = make_folds(DF, 
                           (train_ids, test_ids), 
                           n_folds=10, 
                           stratified=True, 
                           splitting='from_file',
                           label_col='label')

In [245]:
t = TrainTestDict[0]['Xtrain'].to_frame().assign(num_tokens=TrainTestDict[0]['Xtrain'].apply(lambda x: len(x.split())))

In [None]:
# REMOVE from train/test?

#3D echo
#Zie verslag status
#Geen echovenster
#tee 190
#HDF 36mnd

# Get Topic models using LDA

In [164]:
from scipy import sparse

In [223]:
def ETM(tfidf_matrix, lda_theta, lda_beta):
    # beta_{k,j} : probability of word j, given topic k
    # theta_{i,k}: probability of topic k, given document i
    # tfidf_matrix
    
    # Get Gamma_i for all documents
    num_topics = lda_theta.shape[1]
    AverageSentenceLength = np.mean(np.sum(tfidf_matrix>0, axis=1))
    gammas=AverageSentenceLength/np.array(np.sum(tfidf_matrix>0, axis=1))[:,0]
    # replace inf-gammas with 0
    gammas[np.where(np.isinf(gammas))]=0
    
    # prep omega
    indices = tfidf_matrix.indices
    indptr = tfidf_matrix.indptr
    z = []
    
    for d,row in enumerate(tfidf_matrix):
        idcs = row.indices
        for w in idcs:
            omega = 0
            for k in range(num_topics):
                omega = omega + gammas[d]*lda_theta[d,k]*lda_beta[k, w]
            z.append(omega)
    omega_sparse = sparse.csr_matrix((z, indices, indptr), shape=tfidf_matrix.shape)
    return tfidf_matrix.multiply(omega_sparse)
    
def SALT(tfidf_matrix, cluster_object=None):
    # https://github.com/bagheria/saltclass/blob/master/saltclass/saltclass.py
    pass

In [231]:
# do for each TrainTestDict[k]['Xtrain]

test = TrainTestDict[0]['Xtrain']

LDA = LatentDirichletAllocation(n_components=num_topics)
TVEC =  CountVectorizer(
    max_df=0.95, min_df=2, max_features=num_words_in_vocab, stop_words=STOPWORDS
)
TFVEC = TfidfVectorizer(
    max_df=0.95, min_df=2, max_features=num_words_in_vocab, stop_words=STOPWORDS
)
test_tf = TVEC.fit_transform(test)
LDA.fit(test_tf)

lda_theta = LDA.transform(test_tf)
lda_beta = LDA.components_ / LDA.components_.sum(axis=1)[:, np.newaxis]

test_tfidf = TFVEC.fit_transform(test)

weight_matrix, gammas = ETM(test_tfidf, lda_theta, lda_beta)
gc.collect()


  gammas=AverageSentenceLength/np.array(np.sum(tfidf_matrix>0, axis=1))[:,0]


0

# Extract UMLS

In [None]:
# text -> UMLS_extractor -> [CUI0,.., CUIX] 

## Assign Cui-embeddings to UMLS-entities with RotatE

In [34]:
# {'cui': emb}
Emb_path = r'T:\\laupodteam\AIOS\Bram\data\Supporting_Data\External_embeddings'
rotate_path = os.path.join(Emb_path, 'GraphEmbeddings', 'RotatE_entity2id.wv')
cui2vec_rotate = KeyedVectors.load_word2vec_format(rotate_path, binary=False)

In [36]:
cui2vec_rotate['C0411888']

array([-4.75310981e-02,  5.64379245e-03, -2.65905098e-03, -1.43344132e-02,
       -1.73746161e-02,  1.22708757e-03,  3.99836563e-02,  1.31232571e-02,
       -3.89336539e-03,  1.49966194e-03, -1.92586123e-03, -3.62286181e-03,
        1.40360612e-02, -5.31081716e-03, -6.14983141e-02,  5.53463399e-02,
       -2.38368288e-02,  4.05533530e-04, -2.74435547e-03,  5.02221147e-03,
        2.22005788e-02, -9.90363955e-03, -2.56597484e-03,  2.86204950e-03,
        8.67487937e-02, -1.87239784e-03,  2.63481122e-03, -1.13010034e-02,
       -2.44271550e-02,  2.22457759e-02, -1.99064147e-02, -3.01551763e-02,
       -1.90576899e-03, -2.58410175e-04, -1.71828680e-02, -5.40143922e-02,
       -1.80036314e-02, -2.25445590e-04, -1.76690836e-02, -1.08256778e-02,
       -1.68524799e-04,  2.40636729e-02, -2.31742300e-02,  1.06420089e-02,
        9.03841108e-04, -2.76733632e-03, -7.23636299e-02,  2.03280061e-01,
       -7.49604311e-04, -1.40114333e-02,  2.32459754e-02, -9.57354307e-02,
       -2.98987143e-02, -

# 

# Add Word2vec from pre-trained embeddings

In [None]:
# {'token': embedding}
# load clinical NLP embeddings, hear we are limited to fourgrams
StaticEmbedding = KeyedVectors.load('path_to_embeddings.wv')

In [None]:
# jegormeister/robbert-v2-dutch-base-mqa-finetuned, 
# textgain/allnli-GroNLP-bert-base-dutch-cased,
# NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers
SentenceModel = SentenceTransformer('FremyCompany/BioLORD-2023-M')


# Aggregate per document using PoS, LDA, UMLS