# Medical Text

Medical Text Dataset [https://www.kaggle.com/datasets/chaitanyakck/medical-text/data]

In [1]:
import pandas as pd
import nltk
from spacy import displacy
import spacy
from transformers import pipeline
import pickle
import json
import os

In [2]:
df = pd.read_csv('data/train.dat', sep="\t", header=None)

In [3]:
df.rename(columns={0:'condition', 1:'abstract'}, inplace=True)
df.head()

Unnamed: 0,condition,abstract
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,5,Subclavian artery to innominate vein fistula a...
4,4,Effect of local inhibition of gamma-aminobutyr...


## Lexical Analysis
Lexical analysis consists of the operations of:
- **Sentence Splitting**: The technique aims to identify the beginning and end of a textual fragment (sentence or clause) with informative content, even if simple. To achieve this, it uses orthographic features of words (e.g., uppercase initial letters) and delimiters (e.g., punctuation).
- **Tokenization**: The goal of tokenization is to pinpoint the starting and ending positions of each token, whether it’s a word, a number, or a combination of symbols. As with sentence splitting, the process relies on orthographic features (e.g., initial capital letters) and delimiters (e.g., punctuation).
- **Lemmatization**: Post-tokenization techniques address the morphological analysis of word-tokens. Lemmatization identifies the base form (lemma) of inflected words, preserving their meaning and grammatical category. For example, the token _liked_ maps to the lemma *like*. This process minimizes lexical variation by consolidating different forms of the same word into a unified representation.
- **Stemming**: Like lemmatization, processes inflected forms but reduces them to their root, which may not correspond to a dictionary word. Unlike lemmatization, it focuses on inflections that create new words and may change the grammatical class, such as *probable* (adjective) stemming to *probably* (adverb).
- **POS Tagging**: Part-of-speech (POS) tagging assigns a grammatical category to each token, such as noun, verb, or adjective

It focuses on the main components of a text (words), and aims to recognize them in relation to the context in which they are used, such as sentences or clauses.

#### Funzioni utili
From now on there will be repetitive steps in order to check files existence prior to create each one: the following functions aims to simplify the process.

In [4]:
def check_existence(obj):
    """
    checks if we already stored the file
    Arg:
    obj = str name of the object
    """
    path = 'objects/'+obj+'.pkl'
    if os.path.exists(path):
        with open(path, 'rb') as file:
            file = pickle.load(file)
            return file
    else:
        return False

In [5]:
def save_step(name, obj):
    """
    Saves object to pickle and json files
    Args:
    name = 'obj_name'
    obj = object to save
    """

    with open('objects/'+name+'.pkl', 'wb') as file:
        pickle.dump(obj, file)

    with open('objects/'+name+'.json', 'w') as file:
        json.dump(obj, file)

In [6]:
lexical_df = df.copy() # a df to compute lexical analysis on

### Sentence Splitting, Word Tokenization, Lemmatization, Stemming and POS Tagging


In [7]:
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")



In [8]:
if check_existence('sentences'): # if the first obj exists for sure we created the others(?)
    sentences = check_existence('sentences')
    words = check_existence('words')
    lemmatization = check_existence('lemmatization')
    stemming = check_existence('stemming')
    pos_tag = check_existence('pos_tag')

else:
    sentences = []
    words = []
    lemmatization = []

    stemming = []
    porterStemmer = nltk.PorterStemmer()

    pos_tag = []

    for record in df["abstract"]:
        doc = nlp(record)
        split_record = []
        tokenized_record = []
        lemmatized_record = []
        stemmed_record = []
        pos_tag_record = []

        for sentence in doc.sents:
            split_record.append([sentence.text]) # Sentence Splitting
            tokenized_sent = []
            lemmatized_sent = []
            stemmed_sent = []
            pos_tag_sent = []

            for token in sentence:
                word = token.text # Tokenization
                tokenized_sent.append(word)

                lemmatized_sent.append(token.lemma_) # Lemmatization

                stemmed_sent.append(porterStemmer.stem(word)) # Stemming

                pos_tag_sent.append((word,token.tag_)) # POS Tagging

                # print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)
            tokenized_record.append(tokenized_sent)
            lemmatized_record.append(lemmatized_sent)
            stemmed_record.append(stemmed_sent)
            pos_tag_record.append(pos_tag_sent)

        sentences.append(split_record)
        words.append(tokenized_record)
        lemmatization.append(lemmatized_record)
        stemming.append(stemmed_record)
        pos_tag.append(pos_tag_record)

    # Save each step to a file
    save_step('sentences',sentences)
    save_step('words',words)
    save_step('lemmatization',lemmatization)
    save_step('stemming',stemming)
    save_step('pos_tag',pos_tag)

In [9]:
lexical_df["sentences"] = sentences
lexical_df['words'] = words
lexical_df['lemmatization'] = lemmatization
lexical_df['stemming'] = stemming
lexical_df['pos_tag'] = pos_tag

In [10]:
lexical_df.head()

Unnamed: 0,condition,abstract,sentences,words,lemmatization,stemming,pos_tag
0,4,Catheterization laboratory events and hospital...,[[Catheterization laboratory events and hospit...,"[[Catheterization, laboratory, events, and, ho...","[[catheterization, laboratory, event, and, hos...","[[catheter, laboratori, event, and, hospit, ou...","[[(Catheterization, NN), (laboratory, NN), (ev..."
1,5,Renal abscess in children. Three cases of rena...,"[[Renal abscess in children.], [Three cases of...","[[Renal, abscess, in, children, .], [Three, ca...","[[Renal, abscess, in, child, .], [three, case,...","[[renal, abscess, in, children, .], [three, ca...","[[(Renal, NNP), (abscess, NN), (in, IN), (chil..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,[[Hyperplastic polyps seen at sigmoidoscopy ar...,"[[Hyperplastic, polyps, seen, at, sigmoidoscop...","[[hyperplastic, polyp, see, at, sigmoidoscopy,...","[[hyperplast, polyp, seen, at, sigmoidoscopi, ...","[[(Hyperplastic, JJ), (polyps, NNS), (seen, VB..."
3,5,Subclavian artery to innominate vein fistula a...,[[Subclavian artery to innominate vein fistula...,"[[Subclavian, artery, to, innominate, vein, fi...","[[subclavian, artery, to, innominate, vein, fi...","[[subclavian, arteri, to, innomin, vein, fistu...","[[(Subclavian, JJ), (artery, NN), (to, TO), (i..."
4,4,Effect of local inhibition of gamma-aminobutyr...,[[Effect of local inhibition of gamma-aminobut...,"[[Effect, of, local, inhibition, of, gamma, -,...","[[effect, of, local, inhibition, of, gamma, -,...","[[effect, of, local, inhibit, of, gamma, -, am...","[[(Effect, NN), (of, IN), (local, JJ), (inhibi..."


In [11]:
sentences[0][0]

['Catheterization laboratory events and hospital outcome with direct angioplasty for acute myocardial infarction To assess the safety of direct infarct angioplasty without antecedent thrombolytic therapy, catheterization laboratory and hospital events were assessed in consecutively treated patients with infarctions involving the left anterior descending (n = 100 patients), right (n = 100), and circumflex (n = 50) coronary arteries.']

### Stop-words Removal
Stop-words are common words that do not carry specific meaning, such as articles, prepositions, and conjunctions.
It is usually performed after lexical analysis to avoid inaccuracies in subsequent syntactic or semantic analyses.

In [12]:
# nltk.download('stopwords')

In [13]:
if check_existence('stopwords_removal'):
    stopwords_removal = check_existence('stopwords_removal')

else:
    stopwords = nltk.corpus.stopwords.words('english')

    stopwords_removal = []

    for record in lexical_df['sentences']:
        filtered_record = []
        for sentence in record:
            filtered_sentence = [word for word in sentence if word.lower() not in stopwords]
            filtered_record.append(filtered_sentence)

        stopwords_removal.append(filtered_record)

    save_step('stopwords_removal',stopwords_removal)

In [14]:
lexical_df["stopwords_removal"] = stopwords_removal

In [15]:
lexical_df.head()

Unnamed: 0,condition,abstract,sentences,words,lemmatization,stemming,pos_tag,stopwords_removal
0,4,Catheterization laboratory events and hospital...,[[Catheterization laboratory events and hospit...,"[[Catheterization, laboratory, events, and, ho...","[[catheterization, laboratory, event, and, hos...","[[catheter, laboratori, event, and, hospit, ou...","[[(Catheterization, NN), (laboratory, NN), (ev...",[[Catheterization laboratory events and hospit...
1,5,Renal abscess in children. Three cases of rena...,"[[Renal abscess in children.], [Three cases of...","[[Renal, abscess, in, children, .], [Three, ca...","[[Renal, abscess, in, child, .], [three, case,...","[[renal, abscess, in, children, .], [three, ca...","[[(Renal, NNP), (abscess, NN), (in, IN), (chil...","[[Renal abscess in children.], [Three cases of..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,[[Hyperplastic polyps seen at sigmoidoscopy ar...,"[[Hyperplastic, polyps, seen, at, sigmoidoscop...","[[hyperplastic, polyp, see, at, sigmoidoscopy,...","[[hyperplast, polyp, seen, at, sigmoidoscopi, ...","[[(Hyperplastic, JJ), (polyps, NNS), (seen, VB...",[[Hyperplastic polyps seen at sigmoidoscopy ar...
3,5,Subclavian artery to innominate vein fistula a...,[[Subclavian artery to innominate vein fistula...,"[[Subclavian, artery, to, innominate, vein, fi...","[[subclavian, artery, to, innominate, vein, fi...","[[subclavian, arteri, to, innomin, vein, fistu...","[[(Subclavian, JJ), (artery, NN), (to, TO), (i...",[[Subclavian artery to innominate vein fistula...
4,4,Effect of local inhibition of gamma-aminobutyr...,[[Effect of local inhibition of gamma-aminobut...,"[[Effect, of, local, inhibition, of, gamma, -,...","[[effect, of, local, inhibition, of, gamma, -,...","[[effect, of, local, inhibit, of, gamma, -, am...","[[(Effect, NN), (of, IN), (local, JJ), (inhibi...",[[Effect of local inhibition of gamma-aminobut...


# Syntax Analysis
Syntax analysis consists of:
- Shallow Parsing
- Deep Parsing

### Shallow Parsing
Syntactic parsing extends chunking by generating a parse tree. This tree organizes POS-tagging results as leaf nodes and syntactic structures (often chunks) as intermediate nodes, connected hierarchically without representing specific relationships.


In [16]:
syntax_df = df.copy() # a df to compute syntax analysis on

In [17]:
if check_existence('chunking'):
    chunking = check_existence('chunking')

else:
    # Define the grammar and the chunk parser
    grammar = "NP: {<NNP><NNP>}"
    cp = nltk.RegexpParser(grammar) # chunk parser

    # Apply chunking to each record
    chunking = []
    for record in lexical_df['pos_tag']:
        chunked_record = [cp.parse(sentence) for sentence in record]

        chunking.append(chunked_record)

    save_step('chunking',chunking)

In [18]:
# Save the chunking results into the dataframe
syntax_df["shallow_parsing"] = chunking

In [19]:
# Display the dataframe
syntax_df.head()

Unnamed: 0,condition,abstract,shallow_parsing
0,4,Catheterization laboratory events and hospital...,"[[(Catheterization, NN), (laboratory, NN), (ev..."
1,5,Renal abscess in children. Three cases of rena...,"[[(Renal, NNP), (abscess, NN), (in, IN), (chil..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[[(Hyperplastic, JJ), (polyps, NNS), (seen, VB..."
3,5,Subclavian artery to innominate vein fistula a...,"[[(Subclavian, JJ), (artery, NN), (to, TO), (i..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[[(Effect, NN), (of, IN), (local, JJ), (inhibi..."


In [20]:
# Display the chunking result for the first record
print(syntax_df['shallow_parsing'][0][4])
#syntax_df['chunking'][0][4]

(S
  There/EX
  was/VBD
  one/CD
  in/IN
  -/HYPH
  laboratory/NN
  death/NN
  (/-LRB-
  shock/NN
  patient/NN
  with/IN
  infarction/NN
  of/IN
  the/DT
  left/JJ
  anterior/JJ
  descending/VBG
  coronary/JJ
  artery/NN
  )/-RRB-
  ./.)


In [21]:
# To draw the parse tree
#syntax_df['chunking'][1][1].draw()

### Deep Parsing
Differently from _Shallow parsing_, _Deep parsing_ aims to infer dependency relationships between nodes.
The result is a dependency graph which relates words that are syntactically linked.

In [22]:
if check_existence('deep_parsing'):
    deep_parsing = check_existence('deep_parsing')

else:
    deep_parsing = []
    for sentences in syntax_df["abstract"]:
        sentence_dep = []
        doc = nlp(sentences)
        for token in doc:
            sentence_dep.append((str(token.text), str(token.dep_), str(token.head.text), str([child for child in token.children])))
            # creates a tuple containing the token, dependency nature, head and all dependents of the token
        deep_parsing.append(sentence_dep)

    save_step('deep_parsing',deep_parsing)

In [23]:
syntax_df["deep_parsing"] = deep_parsing

In [24]:
syntax_df.head()

Unnamed: 0,condition,abstract,shallow_parsing,deep_parsing
0,4,Catheterization laboratory events and hospital...,"[[(Catheterization, NN), (laboratory, NN), (ev...","[(Catheterization, compound, events, []), (lab..."
1,5,Renal abscess in children. Three cases of rena...,"[[(Renal, NNP), (abscess, NN), (in, IN), (chil...","[(Renal, nsubj, abscess, []), (abscess, ROOT, ..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[[(Hyperplastic, JJ), (polyps, NNS), (seen, VB...","[(Hyperplastic, amod, polyps, []), (polyps, ns..."
3,5,Subclavian artery to innominate vein fistula a...,"[[(Subclavian, JJ), (artery, NN), (to, TO), (i...","[(Subclavian, amod, artery, []), (artery, nsub..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[[(Effect, NN), (of, IN), (local, JJ), (inhibi...","[(Effect, ROOT, Effect, [of, :, study, .]), (o..."


In [25]:
# Seleziona la prima riga del dataframe
sentence = syntax_df["abstract"].iloc[0]

# Analizza la frase
doc = nlp(sentence)

# Visualizza il grafico di dipendenze
displacy.render(doc, style="dep", jupyter=True)

## Semantic Analysis

Semantic analysis aims to extract the meaning of a text, focusing on the relationships between entities and the context in which they appear.

### Entity Extraction

Entity extraction identifies named entities in a text, such as people, organizations, or locations.

In [26]:
semantic_df = df.copy() # a df to compute semantic analysis on

This code extracts named entities from the "abstract" column of a DataFrame called syntax_df using a NLP model.
For each record, it collects the entities and their labels, storing them in a list of tuples.

In [27]:
pipe = pipeline("token-classification", model="Clinical-AI-Apollo/Medical-NER", aggregation_strategy='simple')

In [28]:
# if the entities are already extracted, load them from the pickle file
if check_existence('entities'):
    entities = check_existence('entities')

else:
    entities = []
    for record in semantic_df["abstract"]:
        result = pipe(record)

        record_entities = [(entity['word'], entity['entity_group']) for entity in result]

        entities.append(record_entities)

    save_step('entities',entities)

In [29]:
semantic_df["entities"] = entities

To get a sense of the entities extracted from the text, we can display the unique entities found in the dataset.

In [30]:
# Flatten the list of entities and extract only the labels
all_labels = pd.Series([label for record in semantic_df["entities"] for _, label in record])

# Perform value counts on the labels
all_labels.value_counts()

DETAILED_DESCRIPTION      100224
DIAGNOSTIC_PROCEDURE       74383
SIGN_SYMPTOM               68825
LAB_VALUE                  68092
DISEASE_DISORDER           61689
BIOLOGICAL_STRUCTURE       55139
THERAPEUTIC_PROCEDURE      27167
MEDICATION                 19053
DATE                       15674
DURATION                    5689
DOSAGE                      5519
AGE                         3286
CLINICAL_EVENT              2792
DISTANCE                    2380
SEVERITY                    2013
NONBIOLOGICAL_LOCATION      1819
HISTORY                     1763
SEX                         1451
FAMILY_HISTORY               426
AREA                         139
ADMINISTRATION               129
COREFERENCE                  122
OTHER_ENTITY                  15
PERSONAL_BACKGROUND           13
VOLUME                         9
TIME                           4
Name: count, dtype: int64

In [31]:
semantic_df.head()

Unnamed: 0,condition,abstract,entities
0,4,Catheterization laboratory events and hospital...,"[(Catheterization laboratory events, DIAGNOSTI..."
1,5,Renal abscess in children. Three cases of rena...,"[(Renal, BIOLOGICAL_STRUCTURE), (abscess, DISE..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[(Hyperplastic polyps, SIGN_SYMPTOM), (sigmoid..."
3,5,Subclavian artery to innominate vein fistula a...,"[(Subclavian artery to innominate vein, BIOLOG..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[(dorsomedial hypothalamus, BIOLOGICAL_STRUCTU..."


In [32]:
def display_entities(pipe, document):
    """
    Display recognized entities using displacy.render().

    Parameters:
        pipe: Hugging Face pipeline for entity recognition.
        document (str): Text to process.

    Returns:
        - Visualization of entities using displacy.
        - List of entities and their labels.
    """
    # Run the pipeline on the document
    result = pipe(document)

    # Create a blank SpaCy model to handle the document
    nlp = spacy.blank("en")
    doc = nlp.make_doc(document)

    # Add entities manually
    ents = []
    for entity in result:
        start_char = entity['start']
        end_char = entity['end']
        label = entity['entity_group']
        # Create a span manually
        span = doc.char_span(start_char, end_char, label=label, alignment_mode="expand")
        if span is not None:
            ents.append(span)

    # Ensure no overlapping spans
    ents = spacy.util.filter_spans(ents)

    # Assign the entities to the document
    doc.ents = ents

    # Display with displacy
    displacy.render(doc, style='ent', jupyter=True)

In [33]:
display_entities(pipe, semantic_df["abstract"].iloc[2])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [34]:
semantic_df.head()

Unnamed: 0,condition,abstract,entities
0,4,Catheterization laboratory events and hospital...,"[(Catheterization laboratory events, DIAGNOSTI..."
1,5,Renal abscess in children. Three cases of rena...,"[(Renal, BIOLOGICAL_STRUCTURE), (abscess, DISE..."
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...,"[(Hyperplastic polyps, SIGN_SYMPTOM), (sigmoid..."
3,5,Subclavian artery to innominate vein fistula a...,"[(Subclavian artery to innominate vein, BIOLOG..."
4,4,Effect of local inhibition of gamma-aminobutyr...,"[(dorsomedial hypothalamus, BIOLOGICAL_STRUCTU..."


In [35]:
# Filter and display only the tuples where the label is BIOLOGICAL_STRUCTURE
semantic_df['entities'].apply(lambda x: [(text, label) for text, label in x if label == 'MEDICATION'])

0                                                       []
1                              [(Antibiotics, MEDICATION)]
2                                                       []
3                                                       []
4        [(nipecotic acid, MEDICATION), (KCl, MEDICATIO...
                               ...                        
14433                                                   []
14434                                                   []
14435                [(platelet concentrates, MEDICATION)]
14436                                [(renal, MEDICATION)]
14437       [(heparin, MEDICATION), (heparin, MEDICATION)]
Name: entities, Length: 14438, dtype: object

### Relation Extraction

Relation extraction identifies the connections between entities in a text, such as the subject, object, and verb of a sentence.

This process is more complex than entity extraction, as it requires understanding the syntactic structure of the text to infer relationships between entities.

1. **Iterate through sentences**:
   Process each sentence (`doc`) and its entities, storing entities in a set for faster lookup (`entities_set`).

2. **Extract relations from tokens**:
   Loop through tokens with dependencies like "ROOT" or "VERB" to identify:
   - **Subjects**: Found in children with dependencies like "nsubj" or "agent."
   - **Objects**: Found in children with dependencies like "dobj" or "pobj."

3. **Record relations**:
   - Direct relations: `(subject, verb, object).`
   - Prepositional relations: Handle `prep` and `pobj` to form `(subj, verb_prep, obj)` or similar.

4. **Handle conjunctions**:
   Add relations involving conjunctive tokens (`conj`).



### TF_IDF

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

The following vectorizer takes a sequence of byte strings, converts it into lowercase, extracts unigrams, and calculates a TF-IDF score. It contains stop words from english vocabulary, the n-grams that occur in more than 60% of documents or in less than 10% of documents will be ignored.

In [37]:
vectorizer = TfidfVectorizer(input='content', use_idf=True, lowercase=True,
analyzer='word', ngram_range=(1, 1), stop_words='english', vocabulary=None, min_df=0.10, max_df=0.60)

In [38]:
tfidf_matrix = vectorizer.fit_transform(df['abstract'].values)
print(f"Matrix dimension: {tfidf_matrix.shape}")

Matrix dimension: (14438, 62)


In [39]:
feature_names = vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), index=df.index, columns=feature_names)

In [40]:
tfidf_df.head()

Unnamed: 0,10,11,12,15,20,acute,age,analysis,associated,blood,...,therapy,time,treated,treatment,tumor,use,used,using,year,years
0,0.151755,0.0,0.0,0.0,0.0,0.176468,0.164997,0.0,0.0,0.0,...,0.166472,0.0,0.170159,0.0,0.0,0.0,0.0,0.0,0.0,0.462777
1,0.204264,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.199219,0.0,...,0.224074,0.0,0.0,0.0,0.248062,0.0,0.0,0.0,0.0,0.207635
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.278452,0.0,0.0,0.0,0.0
4,0.136076,0.0,0.0,0.159619,0.0,0.0,0.0,0.0,0.132715,0.151375,...,0.0,0.0,0.0,0.124461,0.0,0.0,0.150682,0.152792,0.0,0.0


In [41]:
tfidf_df.loc['00_Document Frequency'] = (tfidf_df > 0).sum()

In [42]:
tfidf_df.tail()

Unnamed: 0,10,11,12,15,20,acute,age,analysis,associated,blood,...,therapy,time,treated,treatment,tumor,use,used,using,year,years
14434,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.665659,0.0,0.0
14435,0.252997,0.450696,0.139863,0.0,0.0,0.147099,0.0,0.0,0.0,0.14072,...,0.138766,0.0,0.0,0.0,0.0,0.14881,0.0,0.0,0.0,0.0
14436,0.182328,0.0,0.604771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.200009,0.410164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14437,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00_Document Frequency,2639.0,1590.0,1984.0,1654.0,1709.0,1700.0,2085.0,1601.0,2821.0,1948.0,...,2031.0,1884.0,1902.0,3323.0,1479.0,1639.0,1975.0,1894.0,1915.0,2524.0


In [43]:
mapping = {
    'condition': {1: "neoplasm",
                  2: "digestive system disease",
                  3: "nervous system disease",
                  4: "cardiovascular disease",
                  5: "general pathological conditions"}
}

semantic_df.replace(mapping, inplace = True)

In [44]:
semantic_df.head()

Unnamed: 0,condition,abstract,entities
0,cardiovascular disease,Catheterization laboratory events and hospital...,"[(Catheterization laboratory events, DIAGNOSTI..."
1,general pathological conditions,Renal abscess in children. Three cases of rena...,"[(Renal, BIOLOGICAL_STRUCTURE), (abscess, DISE..."
2,digestive system disease,Hyperplastic polyps seen at sigmoidoscopy are ...,"[(Hyperplastic polyps, SIGN_SYMPTOM), (sigmoid..."
3,general pathological conditions,Subclavian artery to innominate vein fistula a...,"[(Subclavian artery to innominate vein, BIOLOG..."
4,cardiovascular disease,Effect of local inhibition of gamma-aminobutyr...,"[(dorsomedial hypothalamus, BIOLOGICAL_STRUCTU..."


## DATABASE

This is an image of the database schema we are going to create.

<div style="text-align: center;">
    <img src="images/er_schema.png" alt="Database Schema" width="700"/>
</div>

### Data Definition Language

In [175]:
# Drop rows where condition is 5
semantic_df = semantic_df[semantic_df['condition'] != 'general pathological conditions']

In [202]:
CREATE_MEDICATIONS = """
DROP TABLE IF EXISTS MEDICATIONS;

CREATE TABLE MEDICATIONS(
id SERIAL PRIMARY KEY,
name VARCHAR NOT NULL,
dosage VARCHAR
);
"""

# Save to file
with open('database/ddl/create_medications.sql', 'w') as file:
    file.write(CREATE_MEDICATIONS)

In [203]:
CREATE_DISEASES = """
DROP TABLE IF EXISTS DISEASES;

CREATE TABLE DISEASES(
id SERIAL PRIMARY KEY,
name VARCHAR NOT NULL,
condition VARCHAR NOT NULL
);
"""

# Save to file
with open('database/ddl/create_diseases.sql', 'w') as file:
    file.write(CREATE_DISEASES)

In [204]:
CREATE_SYMPTOMS = """
DROP TABLE IF EXISTS SYMPTOMS;

CREATE TABLE SYMPTOMS(
id SERIAL PRIMARY KEY,
name VARCHAR NOT NULL
);
"""

# Save to file
with open('database/ddl/create_symptoms.sql', 'w') as file:
    file.write(CREATE_SYMPTOMS)

In [205]:
CREATE_DIAGNOSTIC_TESTS = """
DROP TABLE IF EXISTS DIAGNOSTIC_TESTS;

CREATE TABLE DIAGNOSTIC_TESTS(
id SERIAL PRIMARY KEY,
name VARCHAR NOT NULL
);
"""

# Save to file
with open('database/ddl/create_diagnostic_tests.sql', 'w') as file:
    file.write(CREATE_DIAGNOSTIC_TESTS)

In [206]:
CREATE_BIOLOGICAL_STRUCTURES = """
DROP TABLE IF EXISTS BIOLOGICAL_STRUCTURES;

CREATE TABLE BIOLOGICAL_STRUCTURES(
id SERIAL PRIMARY KEY,
name VARCHAR NOT NULL
);
"""

# Save to file
with open('database/ddl/create_biological_structures.sql', 'w') as file:
    file.write(CREATE_BIOLOGICAL_STRUCTURES)

In [207]:
CREATE_TREATMENTS = """
DROP TABLE IF EXISTS TREATMENTS;

CREATE TABLE TREATMENTS(
id SERIAL PRIMARY KEY,
diseaseID INT NOT NULL,
treatmentID INT NOT NULL,,
FOREIGN KEY (diseaseID) REFERENCES DISEASES(id),
FOREIGN KEY (treatmentID) REFERENCES TREATMENTS(id)
);
"""

# Save to file
with open('database/ddl/create_treatments.sql', 'w') as file:
    file.write(CREATE_TREATMENTS)

In [208]:
CREATE_MANIFESTATIONS = """
DROP TABLE IF EXISTS MANIFESTATIONS;

CREATE TABLE MANIFESTATIONS(
id SERIAL PRIMARY KEY,
diseaseID INT NOT NULL,
symptomID INT NOT NULL,
severity VARCHAR,
FOREIGN KEY (diseaseID) REFERENCES DISEASES(id),
FOREIGN KEY (symptomID) REFERENCES SYMPTOMS(id)
);
"""

# Save to file
with open('database/ddl/create_manifestations.sql', 'w') as file:
    file.write(CREATE_MANIFESTATIONS)

In [209]:
CREATE_DIAGNOSIS = """
DROP TABLE IF EXISTS DIAGNOSIS;

CREATE TABLE DIAGNOSIS(
id SERIAL PRIMARY KEY,
diseaseID INT NOT NULL,
diagnosticTestID INT NOT NULL,
FOREIGN KEY (diseaseID) REFERENCES DISEASES(id),
FOREIGN KEY (diagnosticTestID) REFERENCES DIAGNOSTIC_TESTS(id)
)"""

# Save to file
with open('database/ddl/create_diagnosis.sql', 'w') as file:
    file.write(CREATE_DIAGNOSIS)

In [211]:
CREATE_INVOLVEMENTS = """
DROP TABLE IF EXISTS INVOLVEMENTS;

CREATE TABLE INVOLVEMENTS(
id SERIAL PRIMARY KEY,
diseaseID INT NOT NULL,
biologicalStrID INT NOT NULL,
FOREIGN KEY (diseaseID) REFERENCES DISEASES(id),
FOREIGN KEY (biologicalStrID) REFERENCES BIOLOGICAL_STRUCTURES(id)
"""

# Save to file
with open('database/ddl/create_involvements.sql', 'w') as file:
    file.write(CREATE_INVOLVEMENTS)

In [212]:
# List of DDL files to concatenate
ddl_files = [
    'database/ddl/create_medications.sql',
    'database/ddl/create_diseases.sql',
    'database/ddl/create_symptoms.sql',
    'database/ddl/create_diagnostic_tests.sql',
    'database/ddl/create_biological_structures.sql',
    'database/ddl/create_treatments.sql',
    'database/ddl/create_manifestations.sql',
    'database/ddl/create_diagnosis.sql',
    'database/ddl/create_involvements.sql'
]

# Output file
output_file = 'database/ddl/create_all_tables.sql'

# Concatenate the contents of all DDL files into the output file
with open(output_file, 'w') as outfile:
    for ddl_file in ddl_files:
        with open(ddl_file, 'r') as infile:
            outfile.write(infile.read())

### Data Manipulation Language

In [186]:
# Symptom entities
symptoms = []

for entities in semantic_df['entities']:
    for text, label in entities:
        if label == 'SIGN_SYMPTOM':
            symptoms.append(text)

symptoms

['infarction',
 'infarction',
 'infarction',
 'infarction',
 'hypotension',
 'infarction',
 'infarction',
 'infarction',
 'death',
 'infarction',
 'Hyperplastic polyps',
 'adenomas',
 'Asym',
 'Polyps',
 'polyps',
 'rectosigm',
 'polyps',
 'rectosigmoid adenoma',
 'hyperplastic polyps',
 'other findings',
 'adenoma',
 'tosigm',
 'polyps',
 'adenomas',
 'Proximal',
 'adenomas',
 'adenomas',
 'tachycardia',
 'o-excitatory',
 'tachycardia',
 'tachycardia',
 'infection',
 'infection',
 'infection',
 'infection',
 'infection',
 'infection',
 'infections',
 'infections',
 'infection',
 'infection',
 'antibodies',
 'HBsAg',
 'HBeAg',
 'hepatitis B virus DNA',
 'anti-HBe',
 'anti',
 'HBs',
 'Anti-',
 'responses',
 'ecchymoses',
 'thrombocytopenia',
 'adenocarcinoma',
 'immuno',
 'febrile',
 'nervous system',
 'infection',
 'ingeal irritation',
 'intracranial pressure',
 'signs',
 'CRP',
 'signs',
 'CRP',
 'meningeal signs',
 'CRP',
 'CRP',
 'primary',
 'metastatic lesions',
 'died',
 'tumor',


In [187]:
# Normalize to lowercase before creating the set
symptoms = list(set(s.lower() for s in symptoms))

In [188]:
# Insert the symptoms into the SYMPTOMS table
insert_symptoms = []

for symptom in symptoms:
    query = f"INSERT INTO SYMPTOMS (name) VALUES ('{symptom}');"
    insert_symptoms.append(query)

# Save the queries to a file
with open('database/dml/insert_symptoms.sql', 'w') as file:
    for query in insert_symptoms:
        file.write(query + '\n')

In [189]:
# List to store (condition, disease) tuples
diseases = []

# Iterate through the rows of the DataFrame
for condition, entities in zip(semantic_df['condition'], semantic_df['entities']):
    for text, label in entities:
        if label == 'DISEASE_DISORDER':
            diseases.append((condition, text))

len(diseases)

41952

In [190]:
# Remove duplicates in a case-insensitive way
unique_diseases = list({(condition, disease.lower()) for condition, disease in diseases})

len(unique_diseases)

13378

In [191]:
insert_diseases = []

for condition, disease in unique_diseases:
    query = f"INSERT INTO DISEASES (condition, name) VALUES ({condition}, '{disease}');"
    insert_diseases.append(query)

# Save the queries to a file
with open('database/dml/insert_diseases.sql', 'w') as file:
    for query in insert_diseases:
        file.write(query + '\n')

In [192]:
# Diagnostic Test entities
diagnostic_tests = []

for entities in semantic_df['entities']:
    for text, label in entities:
        if label == 'DIAGNOSTIC_PROCEDURE':
            diagnostic_tests.append(text)

diagnostic_tests

['Catheterization laboratory events',
 'hospital outcome',
 'catheterization laboratory',
 'hospital',
 'grade flow',
 'catheterization laboratory',
 'sigmoidoscopy',
 'colonoscopy',
 'sigmoidoscopy',
 'endo',
 'sigmoidoscopy',
 'colonoscopy',
 'sigmoid',
 'gamma-aminobutyric acid',
 'GABA',
 'heart rate',
 'blood pressure',
 'GABA',
 'taurine',
 'heart rate',
 'arterial pressure',
 'GABA',
 'aspartate',
 'glutamate',
 'taurine',
 'glycine',
 'alanine',
 'heart rate',
 'arterial pressure',
 'GABA',
 'heart rate',
 'GABA',
 'Pointing',
 'pointed',
 'serum',
 'Anti-pre-S antibodies',
 'enzyme immunoassays',
 'monoclonal antibodies',
 's',
 '-pre-S antibodies',
 'Anti-pre-S',
 'pre-S',
 'pre-S1',
 '-S2 proteins',
 'infectious',
 'standard workup',
 'Immunohistochemical',
 'P-glycoprotein',
 'P-glycoprotein',
 'Pgp',
 'multidrug',
 'Pgp',
 'Pgp',
 'Pgp',
 'Pgp',
 'Pgp',
 'Pgp',
 'avidin-biotin-complex',
 'immunohistochemical',
 'epitopes',
 'Pgp',
 'immunostaining',
 'MAb C219',
 '494',
 '

In [193]:
# Normalize to lowercase before creating the set
diagnostic_tests = list(set(d.lower() for d in diagnostic_tests))

In [194]:
insert_diagnostic_tests = []

for diagnostic_test in diagnostic_tests:
    query = f"INSERT INTO DIAGNOSTIC_TESTS (name) VALUES ('{diagnostic_test}');"
    insert_diagnostic_tests.append(query)

with open('database/dml/insert_diagnostic_tests.sql', 'w') as file:
    for query in insert_diagnostic_tests:
        file.write(query + '\n')

In [195]:
# Biological Structure entities
biological_structures = []

for entities in semantic_df['entities']:
    for text, label in entities:
        if label == 'BIOLOGICAL_STRUCTURE':
            biological_structures.append(text)

biological_structures

['left anterior descending',
 'coronary arteries',
 'left',
 'descending',
 'right',
 'left anterior descending',
 'left anterior descending coronary artery',
 'right coronary artery',
 'circumflex coronary artery',
 'left anterior descending coronary artery',
 'right coronary artery',
 'circumflex coronary artery',
 'left anterior descending coronary artery',
 'rec',
 'proximal sites',
 'dorsomedial hypothalamus',
 'dorsomedial hypothalamus',
 'dorsomedial hypothalamus',
 'dorsomedial hypothalamus',
 'dial hypothalamus',
 'catheter track',
 'exit site',
 'superficial',
 'catheter track',
 'epidural space',
 'deep track',
 'epidural space',
 'deep',
 'epidural',
 'spinal cord',
 'body part',
 'pre-S1 and pre-S2 regions',
 'hepatitis B virus',
 'pre-S1 and pre-S2 sequence',
 'serum',
 'pre',
 'spinal cord',
 'lumbosacral junction',
 'endometrial',
 'gravid',
 'secretory endometrium',
 'uterine secretory epithelium',
 'endometrial',
 'endometrium',
 'adjacent',
 'Endometrial capillaries'

In [196]:
# Normalize to lowercase before creating the set
biological_structures = list(set(b.lower() for b in biological_structures))

In [197]:
insert_biological_structures = []

for biological_structure in biological_structures:
    query = f"INSERT INTO BIOLOGICAL_STRUCTURE (name) VALUES ('{biological_structure}');"
    insert_biological_structures.append(query)

with open('database/dml/insert_biological_structures.sql', 'w') as file:
    for query in insert_biological_structures:
        file.write(query + '\n')

### Query Language

In [70]:
semantic_df[semantic_df['entities'].apply(
    lambda entities: any(label == 'BIOLOGICAL_STRUCTURE' for _, label in entities)
)]

Unnamed: 0,condition,abstract,entities
0,cardiovascular disease,Catheterization laboratory events and hospital...,"[(Catheterization laboratory events, DIAGNOSTI..."
2,digestive system disease,Hyperplastic polyps seen at sigmoidoscopy are ...,"[(Hyperplastic polyps, SIGN_SYMPTOM), (sigmoid..."
4,cardiovascular disease,Effect of local inhibition of gamma-aminobutyr...,"[(dorsomedial hypothalamus, BIOLOGICAL_STRUCTU..."
5,neoplasm,Infection during chronic epidural catheterizat...,"[(Infection, DISEASE_DISORDER), (chronic, DETA..."
8,nervous system disease,Multiple representations contribute to body kn...,"[(autotopagnosia, DISEASE_DISORDER), (schema, ..."
...,...,...,...
14431,nervous system disease,Early diagnosis and survival of ruptured abdom...,"[(ruptured, DISEASE_DISORDER), (abdominal aort..."
14432,cardiovascular disease,Embolization with particles in thoracic intram...,"[(Embolization, THERAPEUTIC_PROCEDURE), (parti..."
14434,neoplasm,Mammographic measurements before and after aug...,"[(Mammographic measurements, DIAGNOSTIC_PROCED..."
14436,digestive system disease,Complications of Tenckhoff catheters post remo...,"[(ckhoff, DISEASE_DISORDER), (January 1979 to,..."


In [71]:
display_entities(pipe, semantic_df["abstract"].iloc[3])

# FINALE FINALE GIURO (senza lemma)

In [170]:
import stanza
from itertools import product

# Initialize the NLP pipeline
nlp = stanza.Pipeline(lang='en', processors='tokenize,mwt,pos,lemma,depparse')

def find_nearest(anchor_text, candidates, words):
    """Finds the nearest candidate word to the anchor_text in a sentence."""
    if not candidates:
        return None  # No candidates available

    # Find closest match by iterating over words
    anchor_idx = None
    for idx, word in enumerate(words):
        if anchor_text.lower() in word.lower():
            anchor_idx = idx
            break

    if anchor_idx is None:
        return None  # Anchor word not found

    # Find nearest candidate
    nearest_candidate = None
    min_distance = float('inf')
    for cand in candidates:
        for idx, word in enumerate(words):
            if cand.lower() in word.lower():
                distance = abs(idx - anchor_idx)
                if distance < min_distance:
                    min_distance = distance
                    nearest_candidate = cand
    return nearest_candidate

def extract_relations_generic(text, entities, condition, rules):
    """Extracts relations from text using a list of rules."""
    results = {rule['relation_type']: [] for rule in rules}

    doc = nlp(text)
    for sentence in doc.sentences:
        words = [w.text for w in sentence.words]

        for rule in rules:
            role_candidates = {}
            for role_name, spec in rule["roles"].items():
                if role_name == "condition":
                    role_candidates[role_name] = [condition]  # Use actual condition value
                else:
                    role_candidates[role_name] = [
                        ent_text for ent_text, ent_label in entities
                        if ent_label == spec["label"] and ent_text in words
                    ]

            # Skip rule if any required role is missing
            if any(len(role_candidates[role]) == 0 for role, spec in rule["roles"].items() if not spec.get("optional", False)):
                continue

            # Process roles with selection functions
            fixed_roles = [role for role, spec in rule["roles"].items() if "select_func" not in spec]
            fixed_candidate_lists = [role_candidates[role] if role_candidates[role] else [None] for role in fixed_roles]

            for combo in product(*fixed_candidate_lists):
                relation = {role: combo[idx] for idx, role in enumerate(fixed_roles)}

                for role, spec in rule["roles"].items():
                    if "select_func" in spec:
                        anchor_role = spec["anchor"]
                        anchor_val = relation.get(anchor_role)
                        candidate_list = role_candidates.get(role, [])
                        relation[role] = spec["select_func"](anchor_val, candidate_list, words)

                results[rule["relation_type"]].append(relation)

    return results

# ----------------------------------------------
# Updated rules, ensuring `condition` is not constant but dynamically set.

rules = [
    {
        "relation_type": "med_disease",
        "roles": {
            "medication": {"label": "MEDICATION"},
            "dosage": {"label": "DOSAGE", "optional": True, "select_func": find_nearest, "anchor": "medication"},
            "disease": {"label": "DISEASE_DISORDER"},
            "condition": {}  # Now dynamically set
        }
    },
    {
        "relation_type": "disease_manifestation_symptom",
        "roles": {
            "disease": {"label": "DISEASE_DISORDER"},
            "symptom": {"label": "SIGN_SYMPTOM"},
            "severity": {
                "label": "SEVERITY",
                "optional": True,
                "select_func": lambda anchor, candidates, words: find_nearest(anchor, candidates, words) if candidates else None,
                "anchor": "symptom"
            },
            "condition": {}
        }
    },
    {
        "relation_type": "bio_disease",
        "roles": {
            "biological_structure": {"label": "BIOLOGICAL_STRUCTURE"},
            "disease": {"label": "DISEASE_DISORDER"},
            "condition": {}
        }
    },
    {
        "relation_type": "diagnosis",
        "roles": {
            "disease": {"label": "DISEASE_DISORDER"},
            "diagnostic_test": {"label": "DIAGNOSTIC_PROCEDURE"},
            "condition": {}
        }
    }
]

# ----------------------------------------------

all_results = {rule["relation_type"]: [] for rule in rules}

# Togliere head per farlo su tutto il dataframe
for idx, row in semantic_df.head(100).iterrows():
    text = row['abstract']
    entities = row['entities']
    condition_value = row['condition']  # Ensure correct condition value

    row_results = extract_relations_generic(text, entities, condition_value, rules)

    for rel_type in all_results:
        all_results[rel_type].extend(row_results.get(rel_type, []))

2025-02-06 19:03:37 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

2025-02-06 19:03:37 INFO: Downloaded file to C:\Users\rubin\stanza_resources\resources.json
2025-02-06 19:03:38 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |
| depparse  | combined_charlm   |

2025-02-06 19:03:38 INFO: Using device: cpu
2025-02-06 19:03:38 INFO: Loading: tokenize
2025-02-06 19:03:38 INFO: Loading: mwt
2025-02-06 19:03:38 INFO: Loading: pos
2025-02-06 19:03:43 INFO: Loading: lemma
2025-02-06 19:03:43 INFO: Loading: depparse
2025-02-06 19:03:44 INFO: Done loading processors!


In [171]:
# 1. Deduplicate symptoms.
unique_symptoms = set()
# 2. Deduplicate manifestations.
# Each manifestation record is a tuple:
# (disease_name, disease_condition, symptom_name, severity)
unique_manifestations = set()

for rel in all_results["disease_manifestation_symptom"]:
    # Handle disease, symptom, and manifestation extraction safely
    disease = rel.get("disease", "")
    symptom = rel.get("symptom", "")
    manifestation = rel.get("manifestation", {})

    disease_condition = rel.get("condition", "").strip().lower() if "condition" in rel else None

    disease_name = disease.strip().lower()

    symptom_name = symptom.strip().lower()

    #  Extract severity correctly
    if isinstance(rel.get("severity"), str) and rel["severity"].strip():
        severity = rel["severity"].strip().lower()
    else:
        severity = None
    # Add to sets for deduplication

    unique_symptoms.add(symptom_name)
    unique_manifestations.add((disease_name, disease_condition, symptom_name, severity))

# --- Generate INSERT queries for SYMPTOMS ---
symptom_inserts = []
for symptom_name in unique_symptoms:
    query = f"INSERT INTO SYMPTOMS (name) VALUES ('{symptom_name}');"
    symptom_inserts.append(query)

# --- Generate INSERT queries for MANIFESTATIONS ---
manifestation_inserts = []
for disease_name, disease_condition, symptom_name, severity in unique_manifestations:
    severity_value = f"'{severity}'" if severity is not None else "NULL"

    # Disease subquery with condition and name
    disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition = '{disease_condition}')"
    symptom_subquery = f"(SELECT id FROM SYMPTOMS WHERE name = '{symptom_name}')"

    query = (f"INSERT INTO MANIFESTATIONS (severity, disease_id, symptom_id) VALUES (\n"
             f"    {severity_value},\n"
             f"    {disease_subquery},\n"
             f"    {symptom_subquery}\n"
             f");")
    manifestation_inserts.append(query)

# --- Save the INSERT queries to files ---

# Save Symptom INSERT queries
with open('database/dml/insert_symptoms.sql', 'w') as sym_file:
    for query in symptom_inserts:
        sym_file.write(query + "\n")

# Save Manifestation INSERT queries
with open('database/dml/insert_manifestations.sql', 'w') as man_file:
    for query in manifestation_inserts:
        man_file.write(query + "\n")

In [201]:
# --- Deduplication for generating INSERT queries ---
unique_medications = set()
unique_treatments = set()

for triple in all_results["med_disease"]:
    med_name = triple.get("medication", "").strip().lower()
    med_dosage = triple.get("dosage", "").strip().lower() if triple.get("dosage") else None

    # Add the medication as a tuple (name, dosage)
    unique_medications.add((med_name, med_dosage))

    disease_name = triple.get("disease", "").strip().lower()
    disease_condition = triple.get("condition", "").strip().lower() if "condition" in triple else None

    # The treatment record is now a tuple: (medication name, medication dosage, disease name, disease condition)
    unique_treatments.add((med_name, med_dosage, disease_name, disease_condition))

# --- Generate INSERT queries for Medications ---
medication_inserts = []
for med_name, med_dosage in unique_medications:
    dosage_value = f"'{med_dosage}'" if med_dosage is not None else "NULL"
    query = f"INSERT INTO MEDICATIONS (name, dosage) VALUES ('{med_name}', {dosage_value});"
    medication_inserts.append(query)

# --- Generate INSERT queries for Treatments ---
treatment_inserts = []
for med_name, med_dosage, disease_name, disease_condition in unique_treatments:
    # Medication subquery:
    if med_dosage is not None:
        med_subquery = f"(SELECT id FROM MEDICATIONS WHERE name = '{med_name}' AND dosage = '{med_dosage}')"
    else:
        med_subquery = f"(SELECT id FROM MEDICATIONS WHERE name = '{med_name}' AND dosage IS NULL)"

    # Disease subquery:
    if disease_condition:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition = '{disease_condition}')"
    else:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition IS NULL)"

    query = (f"INSERT INTO TREATMENTS (medication_id, disease_id) VALUES (\n"
             f"    {med_subquery},\n"
             f"    {disease_subquery}\n"
             f");")
    treatment_inserts.append(query)

# --- Save the INSERT queries to files ---

# Save Medication INSERT queries
with open('database/dml/insert_medications.sql', 'w') as med_file:
    for query in medication_inserts:
        med_file.write(query + "\n")

# Save Treatment INSERT queries
with open('database/dml/insert_treatments.sql', 'w') as treat_file:
    for query in treatment_inserts:
        treat_file.write(query + "\n")

In [173]:
# --- Deduplication ---
# We build a set of tuples: (bio_struct_name, disease_name, disease_condition)
unique_relations = set()

for rel in all_results["bio_disease"]:
    bio_struct = rel.get("biological_structure", "").strip().lower()
    disease_name = rel.get("disease", "").strip().lower()
    disease_condition = rel.get("condition", "").strip().lower() if "condition" in rel else None

    unique_relations.add((bio_struct, disease_name, disease_condition))

# --- Generate INSERT queries for INVOLVEMENTS ---
# The INVOLVEMENTS table has foreign keys for BIOLOGICAL_STRUCTURE and DISEASES.
# We use subqueries to look up these IDs.
involvement_inserts = []
for bio_struct, disease_name, disease_condition in unique_relations:
    bio_struct_subquery = f"(SELECT id FROM BIOLOGICAL_STRUCTURE WHERE name = '{bio_struct}')"

    # Disease subquery:
    if disease_condition:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition = '{disease_condition}')"
    else:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition IS NULL)"

    query = (
        f"INSERT INTO INVOLVEMENTS (biological_structure_id, disease_id) VALUES (\n"
        f"    {bio_struct_subquery},\n"
        f"    {disease_subquery}\n"
        f");"
    )
    involvement_inserts.append(query)

# --- Save the INSERT queries to a file ---
with open('database/dml/insert_involvements.sql', 'w') as file:
    for query in involvement_inserts:
        file.write(query + "\n")

In [174]:
# --- Deduplication ---
# Each unique relation is represented as a tuple: (diagnostic_test, disease_name, disease_condition)
unique_diagnoses = set()

for rel in all_results["diagnosis"]:
    diag = rel.get("diagnostic_test", "").strip().lower()
    disease_name = rel.get("disease", "").strip().lower()
    disease_condition = rel.get("condition", "").strip().lower() if "condition" in rel else None

    if diag and disease_name:  # Ensure essential values exist
        unique_diagnoses.add((diag, disease_name, disease_condition))

# --- Generate INSERT queries for DIAGNOSIS ---
diagnosis_inserts = []
for diag, disease_name, disease_condition in unique_diagnoses:
    # Subquery for diagnostic test
    diag_subquery = f"(SELECT id FROM DIAGNOSTIC_TESTS WHERE name = '{diag}')"

    # Subquery for disease (handle NULL condition)
    if disease_condition:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition = '{disease_condition}')"
    else:
        disease_subquery = f"(SELECT id FROM DISEASES WHERE name = '{disease_name}' AND condition IS NULL)"

    query = (f"INSERT INTO DIAGNOSIS (diagnostic_test_id, disease_id) VALUES (\n"
             f"    {diag_subquery},\n"
             f"    {disease_subquery}\n"
             f");")
    diagnosis_inserts.append(query)

# --- Save the INSERT queries to a file ---
with open('database/dml/insert_diagnosis.sql', 'w') as diag_file:
    for query in diagnosis_inserts:
        diag_file.write(query + "\n")