In [10]:
# add autoreload
%load_ext autoreload
%autoreload 2
import spacy
from clinlp import Term
import json
import benedict
import pandas as pd
import os
import gensim
from tqdm import tqdm

from collections import defaultdict




In [42]:
def distill_regex(regex_list: list=None)-> str:
    """
    Collapse a list of token patterns into one space-separated regex string.

    Each element is expected to be a mapping with a ``'TEXT'`` entry that is
    itself a mapping. Its ``'REGEX'`` value (or, failing that, its ``'LOWER'``
    value, or an empty string) is wrapped in parentheses.

    :param regex_list: list of token patterns, or None.
    :return: the parenthesised parts joined by single spaces, or None when
        ``regex_list`` is None, when any token carries a ``'NOT_IN'``
        constraint, or when a token does not have the expected shape.
    """
    if regex_list is None:
        return None

    combo_list = []
    try:
        for regex_ in regex_list:
            _regex = regex_['TEXT']
            # SKIP THIS FOR NOW: patterns with a NOT_IN constraint are dropped entirely
            if 'NOT_IN' in _regex.keys():
                return None
            combo_list.append(f"({_regex.get('REGEX', _regex.get('LOWER', ''))})")
    except (KeyError, TypeError, AttributeError):
        # Only malformed patterns (missing 'TEXT', non-mapping token or value)
        # are treated as "cannot distill"; anything else propagates so that
        # interrupts and genuine bugs are not silently swallowed.
        return None
    return " ".join(combo_list)

def collect_regex(regexes: list=None)-> dict:
    """
    Group distilled regex strings by their label.

    :param regexes: iterable of strings, each the textual form of a dict with
        a ``'label'`` and a ``'pattern'`` entry. None is treated as empty and
        blank / whitespace-only strings are skipped.
    :return: ``defaultdict(list)`` mapping each label to the strings that
        ``distill_regex`` produced for its patterns; patterns for which
        ``distill_regex`` returns None are left out.
    """
    res = defaultdict(list)
    if regexes is None:
        return res

    for regex_str in regexes:
        if not regex_str.strip():
            # readlines() commonly yields blank lines; eval('') raises SyntaxError
            continue
        # SECURITY NOTE(review): eval executes whatever the input contains.
        # If every line is a plain dict literal, ast.literal_eval would be the
        # safe drop-in -- confirm against the pattern files before switching.
        regex_dict = eval(regex_str)
        distilled = distill_regex(regex_dict['pattern'])
        if distilled is not None:
            res[regex_dict['label']].append(distilled)
    return res

# The goal of this notebook

The goal is to establish the basic scripts that collect relevant **phrases** for classification
and relevant **context vectors**.


In [2]:
# load the dictionary with terms of interest
TermsOfInterest = benedict.benedict().from_yaml('../assets/token_list_new.yml')
# TODO: `RegexOfInterest_dict = ` was left without a right-hand side, which is a
# SyntaxError that prevents this whole cell from running. Restore the assignment
# once the intended value is decided.

In [2]:
# Read the pattern file with an explicit codec instead of the platform default:
# the regexes shown in the collect_regex output contain 'Ã«' where 'ë' is
# expected, which is what UTF-8 text looks like after being decoded as a
# single-byte code page.
# NOTE(review): assumes the file is stored as UTF-8 -- confirm.
with open('../assets/regex/mi_v8.txt', 'r', encoding='utf-8') as fr:
    regex_lines = fr.readlines()

In [43]:
# display the distilled regex strings, grouped by label
collect_regex(regex_lines)

defaultdict(list,
            {'mitral_valve_native_regurgitation_severe': ['(([Kk]ritische?|[Ee]rnstige?|[Bb]elangrijke)) (^[Mm](itr|itralis)?(klep|\\.|-)?[Ii](nsuff|nsuffici[eÃ«]ntie)?(\\W)?$)',
              '(([Kk]ritische?|[Ee]rnstige?|[Bb]elangrijke)) (^[Mm](itralis)?(klep|[Vv])?(\\W)?$) ([Ii]nsuff(ici[Ã«e]ntie)?)',
              '(^[Mm](itr|itralis)?(klep|\\.|-)?[Ii](nsuff|nsuffici[eÃ«]ntie)?(\\W)?$) ((kritische?|ernstige?|belangrijke))',
              '(^[Mm](itr|itralis)?(klep|\\.|-)?[Ii](nsuff|nsuffici[eÃ«]ntie)?(\\W)?$) (^[Gg]r(aad)?(\\.)?$) (^([Ii][Ii][Ii]-|3-)?([Ii][Vv]|4)(?=\\/)?)',
              '(^[Mm](itralis)?(klep|[Vv])?(\\W)?$) ([Ii]nsuff(ici[Ã«e]ntie)?) (^[Gg]r(aad)?(\\.)?$) (^([Ii][Ii][Ii]-|3-)?([Ii][Vv]|4)(?=\\/)?)',
              '(^[Gg]r(aad)?(\\.)?$) (^([Ii][Ii][Ii]-|3-)?([Ii][Vv]|4)(?=\\/)?) (^[Mm](itr|itralis)?(klep|\\.|-)?[Ii](nsuff|nsuffici[eÃ«]ntie)?(\\W)?$)',
              '(^[Gg]r(aad)?(\\.)?$) (^([Ii][Ii][Ii]-|3-)?([Ii][Vv]|4)(?=\\/)?) (^[Mm](itralis)?

In [None]:
# TODO: `UniqueTargetTerms = ` was left without a right-hand side (a SyntaxError);
# fill in the intended expression before running this cell.

# We would like to expand this!

Given that ```clinlp``` can already handle syntactically similar terms, we only want to expand the list with "semantically" similar terms.

In [3]:
# load an echo report corpus and add a TEXT column that joins the decoded
# brief text and the decoded conclusions with a single space
echoReports = pd.read_parquet('../../../../data/Digin/echo.parquet').assign(
    TEXT=lambda reports: (
        reports.Brief_txt.str.decode('latin-1')
        + " "
        + reports.Conclusions_ECHO.str.decode('latin-1')
    )
)

# labelled texts are stored as JSON lines under the echo research folder
echo_path = 'T://lab_research/RES-Folder-UPOD/Echo_label/E_ResearchData/2_ResearchData'
labeled_texts = pd.read_json(os.path.join(echo_path, 'outdb_140423.jsonl'), lines=True)

In [4]:
# load clinical NLP embeddings, here we are limited to fourgrams
from gensim.models import FastText, Word2Vec, KeyedVectors
# keyed vectors read from a local .wv file; used below for nearest-neighbour lookups
StaticEmbedding = KeyedVectors.load('../../../../language_modeling/Embeddings/CARDIO/without_tokenizer/fasttext/cardio_cbow.wv')

In [5]:
# an alternative is the use of a sentence transformer model
from sentence_transformers import SentenceTransformer
# other candidate models: jegormeister/robbert-v2-dutch-base-mqa-finetuned, textgain/allnli-GroNLP-bert-base-dutch-cased
sent_model = SentenceTransformer('NetherlandsForensicInstitute/robbert-2022-dutch-sentence-transformers')

# or a clinical BERT model such as MedRoBERTa.nl
from scipy.spatial.distance import cosine as cosine_similarity
def phrase_similarity(phrase1, phrase2, model):
    """
    Return the cosine similarity of two phrases under ``model``.

    Both phrases are embedded with ``model.encode``; the result is one minus
    the cosine distance between the two embeddings.
    """
    embedding_a, embedding_b = model.encode(phrase1), model.encode(phrase2)
    # despite its alias, the imported scipy function returns a distance
    distance = cosine_similarity(embedding_a, embedding_b)
    return 1 - distance

In [6]:
# Expand each query term with its nearest neighbours in the static embedding
def find_new_terms(query_list, min_sim=0.95, topn=50, CHECKLIST=frozenset()):
    """
    Expand a list of query terms with similar terms from ``StaticEmbedding``.

    For every query the ``topn`` most similar tokens are requested. A token is
    kept when its similarity is strictly greater than ``min_sim`` and neither
    the raw token nor its normalised form is in ``CHECKLIST``. Normalisation
    replaces ``"BREAK"`` and ``"_"`` with a space, strips leading/trailing
    ``.``, ``,`` and ``:`` characters and lower-cases the result.

    :param query_list: iterable of query terms; each query is always part of
        the result.
    :param min_sim: similarity threshold (exclusive).
    :param topn: number of neighbours requested per query.
    :param CHECKLIST: container of terms that must not be added as neighbours.
    :return: list of unique terms, sorted so that repeated runs give the same
        order.
    """
    new_terms = set()
    for query in query_list:
        new_terms.add(query)
        for token, score in StaticEmbedding.most_similar(query, topn=topn):
            if score > min_sim and token not in CHECKLIST:
                # one strip call handles mixed trailing punctuation such as ".,"
                term = token.replace("BREAK", " ").replace("_", " ").strip(".,:").lower()
                # the normalised form is what gets stored, so test that as well;
                # tokens that normalise to nothing are dropped
                if term and term not in CHECKLIST:
                    new_terms.add(term)
    return sorted(new_terms, key=str)

In [7]:
from collections.abc import MutableMapping

def flatten(dictionary, parent_key='', separator='|'):
    """
    Flatten a nested mapping into a single-level dict.

    Keys of nested mappings are concatenated with ``separator`` (prefixed by
    ``parent_key`` when it is non-empty). Values that are not a
    ``MutableMapping`` are kept as they are.

    :param dictionary: the (possibly nested) mapping to flatten.
    :param parent_key: prefix for every generated key.
    :param separator: string placed between the key parts.
    :return: dict from joined key path to leaf value.
    """
    def _leaves(mapping, prefix):
        # depth-first walk that yields (joined path, leaf value) pairs
        for name, child in mapping.items():
            path = name if not prefix else prefix + separator + name
            if isinstance(child, MutableMapping):
                yield from _leaves(child, path)
            else:
                yield path, child

    return dict(_leaves(dictionary, parent_key))

# The notebook never defines `ToI`; the terms-of-interest dictionary is loaded
# as `TermsOfInterest`, so flatten that instead of relying on leftover kernel state.
# NOTE(review): assumes `ToI` was an earlier name for the same object -- confirm.
ToI_flat = flatten(TermsOfInterest)
# turn each '|'-joined key back into a tuple of its parts
ToI_flat = {tuple(path.split("|")): value for path, value in ToI_flat.items()}

# every term from every list-valued leaf; used as the exclusion list during expansion
L = [terms for terms in ToI_flat.values() if isinstance(terms, list)]
ALL_TERMS = {term for terms in L for term in terms}

In [8]:
# Expand every list-valued leaf with similar terms; remember the key paths whose
# value is not a list so they can be inspected afterwards.
ToI_flat_expanded = defaultdict(list)
empty_keys = []
for key_path, terms in tqdm(ToI_flat.items()):
    if not isinstance(terms, list):
        empty_keys.append(key_path)
        continue
    ToI_flat_expanded[key_path] = find_new_terms(terms, min_sim=0.95, topn=50, CHECKLIST=ALL_TERMS)
        

100%|██████████| 172/172 [01:24<00:00,  2.04it/s]


In [9]:
# expand back to nested dictionary
ToI_expanded = defaultdict(dict)

# for each depth in the key paths, collect the set of key parts seen at that depth
key_level_dict = defaultdict(set)
for key_path in ToI_flat_expanded:
    for depth, part in enumerate(key_path):
        key_level_dict[depth].add(part)


In [10]:
def tuple_keys_to_nested_dict(d):
    """
    Build a nested dictionary from a dictionary keyed by tuples.

    Every tuple is read as a path: all parts but the last select (or create)
    nested dictionaries, and the last part receives the value.

    :param d: dict, the original dictionary with tuple keys.
    :return: dict, the resulting nested dictionary.
    """
    tree = {}
    for path, payload in d.items():
        leaf = path[-1]
        branch = tree
        # walk down the path, creating intermediate dictionaries on demand
        for part in path[:-1]:
            branch = branch.setdefault(part, {})
        branch[leaf] = payload
    return tree

# rebuild the nested dictionary from the expanded, tuple-keyed mapping
ToI_expanded = tuple_keys_to_nested_dict(ToI_flat_expanded)

In [33]:
# expand with de-abbreviated variants
import deabber


# We need to find the negatives!

What do we mean? **Easy**: take

```normale echo```; a negative would then be ```geen normale echo```.

So, how do we find this in bulk?

# Time to construct the ```clinlp``` object!


In [11]:
# Make concepts dictionary with the terms of interest.
# The dictionary is keyed by concept name and each Term receives the phrase to
# match as its first argument; the original loop had these two swapped (it keyed
# by term and passed the dictionary key as the phrase).
clinlp_concepts = defaultdict(list)
for concept, terms in ToI_expanded.items():
    # NOTE(review): ToI_expanded is rebuilt from tuple key paths and may be nested;
    # when `terms` is a dict this loop yields its sub-keys rather than phrases.
    # TODO: descend to the leaf lists if the term file has more than one level.
    for term in terms:
        clinlp_concepts[concept].append(Term(term,
                                             proximity=1,
                                             fuzzy=3,
                                             fuzzy_min_len=7))
# pseudo=True means it is a negative example

# Time to make our classifier using Sklearn base classes!