In [17]:
import csv
import pandas as pd
import numpy as np
import re
import os

from tqdm.auto import tqdm

# Vectorization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Frequency Analysis
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder


INPUT_CSV = 'data/PMC-Patients.csv'
PUBMED_CSV = 'data/pubmed_data.csv'
CLEAN_OUT = 'outputs/PMC_clean.parquet'
TRANS_OUT = 'outputs/transactions.parquet'
PATTERN_OUT = 'outputs/patterns.parquet'
TS_OUT = 'outputs/timeseries.parquet'
SNIPPET_OUT = 'outputs/snippets.parquet'
DEMOG_OUT = 'outputs/demographics.parquet'

NGRAM_RANGE = (1, 2)
MIN_DF = 10 # Minimum document frequency for CountVectorizer
MAX_FEATURES = 5000 # Maximum number of features for CountVectorizer
MIN_SUPPORT = 0.01 # Minimum support for apriori algorithm
RANDOM_SAMPLE = 10000 # Set to an integer for limited testing


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       d

In [18]:
from custom_stopwords import cust_sw

patients_df = pd.read_csv(INPUT_CSV)

patients_df = patients_df.dropna(subset=['patient']).reset_index(drop=True)

if RANDOM_SAMPLE is not None:
    patients_df = patients_df.sample(RANDOM_SAMPLE, random_state=42).reset_index(drop=True)

print(f"Total records to process: {len(patients_df)}")

pubmed_df = pd.read_csv(PUBMED_CSV)

for row in patients_df.itertuples():
    pmid = row.PMID
    pubmed_info = pubmed_df[pubmed_df['PMID'] == pmid]
    if not pubmed_info.empty:
        patients_df.at[row.Index, 'Year'] = pubmed_info.iloc[0]['Year']
        print(pubmed_info.iloc[0]['Year'])
        patients_df.at[row.Index, 'Month'] = pubmed_info.iloc[0]['Month']
        print(pubmed_info.iloc[0]['Month'])
        patients_df.at[row.Index, 'Day'] = pubmed_info.iloc[0]['Day']
        print(pubmed_info.iloc[0]['Day'])

# Convert 'Year', 'Month', 'Day' to numeric, coercing errors to NaN
patients_df['Year'] = pd.to_numeric(patients_df['Year'], errors='coerce')
patients_df['Month'] = pd.to_numeric(patients_df['Month'], errors='coerce') 
patients_df['Day'] = pd.to_numeric(patients_df['Day'], errors='coerce')

# Remove rows with missing date components
patients_df = patients_df.dropna(subset=['Year', 'Month', 'Day']).reset_index(drop=True)

# convert 'Year', 'Month', 'Day' to datetime
patients_df['pub_date'] = pd.to_datetime(patients_df[['Year', 'Month', 'Day']], errors='coerce')
patients_df.drop(columns=['Year', 'Month', 'Day'], inplace=True)

print(f"Records after date processing: {len(patients_df)}")


# Text Cleaning and Preprocessing
stop_words = set(stopwords.words('english'))

stop_words = stop_words.union(cust_sw)

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def clean_and_lemmatize(text):
    s = str(text).lower()

    # pulizia base
    s = re.sub(r'\bn[\s]', ' ', s)                # rumore tipo "n the"
    s = re.sub(r'\b\d+[a-z]+\b', ' ', s)          # rimuove "10mg", "3cm", ecc.
    s = re.sub(r'\b\d+\b', ' ', s)                # rimuove numeri isolati
    s = re.sub(r'[^a-z\s]', ' ', s)               # rimuove caratteri non alfabetici
    s = re.sub(r'\s+', ' ', s).strip()

    # tokenizzazione e lemmatizzazione
    tokens = nltk.word_tokenize(s)
    pos_tags = nltk.pos_tag(tokens)

    lemmas = [
        lemmatizer.lemmatize(t, get_wordnet_pos(p))
        for t, p in pos_tags
    ]

    # rimozione stopword e token troppo corti
    filtered = [w for w in lemmas if w not in stop_words and len(w) > 2]

    return " ".join(filtered)


tqdm.pandas(desc="Cleaning and lemmatizing text")
patients_df['text_clean'] = patients_df['patient'].progress_apply(clean_and_lemmatize)
# Show first patient and first cleaned text
# Print full original and cleaned text (access as scalar to avoid pandas truncation)
print(patients_df.loc[0, 'patient'])
print("\n--- cleaned ---\n")
print(patients_df.loc[0, 'text_clean'])

patients_df.to_parquet(CLEAN_OUT, index=False)
print(f"Cleaned data saved to {CLEAN_OUT}")

Total records to process: 10000
2021.0
7.0
31.0
2020.0
10.0
5.0
2021.0
1.0
16.0
nan
nan
nan
nan
nan
nan
2018.0
7.0
23.0
2020.0
10.0
19.0
2014.0
11.0
25.0
2016.0
12.0
6.0
2014.0
9.0
30.0
nan
nan
nan
2017.0
3.0
9.0
nan
nan
nan
2020.0
9.0
22.0
2021.0
7.0
21.0
2020.0
5.0
19.0
2021.0
10.0
6.0
nan
nan
nan
2021.0
6.0
21.0
2019.0
11.0
21.0
2021.0
4.0
22.0
2020.0
7.0
1.0
2019.0
10.0
16.0
2020.0
8.0
28.0
nan
nan
nan
2020.0
11.0
29.0
2018.0
4.0
30.0
2016.0
2.0
20.0
2017.0
11.0
27.0
nan
nan
nan
2017.0
10.0
19.0
2015.0
2.0
28.0
2021.0
12.0
4.0
2018.0
10.0
4.0
2010.0
2.0
22.0
2017.0
2.0
22.0
2015.0
3.0
9.0
nan
nan
nan
2021.0
1.0
31.0
2012.0
4.0
1.0
2019.0
12.0
1.0
2014.0
5.0
16.0
2021.0
4.0
22.0
2020.0
6.0
3.0
2015.0
7.0
4.0
nan
nan
nan
2020.0
5.0
23.0
2016.0
11.0
4.0
nan
nan
nan
2014.0
3.0
7.0
nan
nan
nan
2007.0
10.0
18.0
2014.0
8.0
12.0
nan
nan
nan
2015.0
4.0
8.0
nan
nan
nan
2014.0
2.0
5.0
nan
nan
nan
2012.0
4.0
4.0
2020.0
8.0
5.0
2021.0
2.0
20.0
nan
nan
nan
2020.0
4.0
24.0
nan
nan
nan
2015.0
9.0


Cleaning and lemmatizing text: 100%|██████████| 7729/7729 [01:52<00:00, 68.78it/s]


A 65-year-old male, known to have three-vessel coronary artery disease, diabetes mellitus and hypertension, was referred to the neurosurgery department due to ptosis and anisocoria that developed 1 day after CABG. The patient was noted to have right ‘eyelid drop, dilated pupil and double vision’ upon elevation of the eyelid. The patient reported no history of headache, vomiting, loss of consciousness, symptoms suggestive of brainstem involvement or pituitary axis hypo/hypersecretion upon initial presentation.\nThe patient was vitally stable, alert and oriented to person, place and time with a Glasgow coma scale (GCS) of 15/15. There was right-sided paresis of the levator palpebrae superioris and the extra-ocular muscles; medial rectus, superior rectus, inferior rectus and inferior oblique muscles with a 5-mm dilated pupil. The left eye extra-ocular muscles were intact. The left pupil measured 3-mm and was reactive to light and accommodation with no gaze preference. The visual acuity wa

In [19]:
clean_df = pd.read_parquet(CLEAN_OUT)
texts = clean_df['text_clean'].astype(str).tolist()

vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, min_df=MIN_DF, max_features=MAX_FEATURES, binary=True)  # max_features=MAX_FEATURES

tqdm.pandas(desc="Vectorizing text")

X = vectorizer.fit_transform(texts)

sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()]
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)

pattern_df = pd.DataFrame(words_freq, columns=['Pattern', 'Frequency'])
pattern_df.to_csv('outputs/freq_terms.csv', index=False)
top_patterns = pattern_df.head(20)

import plotly.express as px

fig = px.bar(top_patterns, x='Pattern', y='Frequency', title='Top 20 N-grams in Patient Descriptions')
fig.update_layout(xaxis_tickangle=-45)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [20]:
pd.set_option('display.max_colwidth', None)  # Set max column width to None for better display

clean_df = pd.read_parquet(CLEAN_OUT)
texts = clean_df['text_clean'].astype(str).tolist()

vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, min_df=MIN_DF, max_features=MAX_FEATURES, binary=True)  # max_features=MAX_FEATURES

tqdm.pandas(desc="Vectorizing text")

X = vectorizer.fit_transform(texts)

vocab = vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(vocab)}")
print(f"First 1000 terms: {vocab[:1000]}")

# Build transactions - each document as a list of present n-grams
rows, cols = X.nonzero() # indices of rows and columns with non-zero entries

doc_to_ngrams = [[] for _ in range(X.shape[0])]
for r, c in zip(rows, cols):
    doc_to_ngrams[r].append(vocab[c])

# Transaction Example
print(f"Document 0 n-grams: {doc_to_ngrams[0]}")

Vocabulary size: 5000
First 1000 terms: ['abbott' 'abdomen' 'abdomen distend' 'abdomen pelvis' 'abdominopelvic'
 'abduction' 'aberrant' 'ability' 'ablation' 'abnormal' 'abnormalities'
 'abnormality' 'abnormally' 'abortion' 'abrupt' 'abscess' 'absence'
 'absent' 'absolute' 'absorbable' 'abundant' 'abuse' 'abut' 'accept'
 'acceptable' 'access' 'accessory' 'accident' 'accompany' 'accordance'
 'accordingly' 'account' 'accumulation' 'accurate' 'ace' 'acetaminophen'
 'acetate' 'acetylsalicylic' 'acetylsalicylic acid' 'ache' 'acid'
 'acid fast' 'acid schiff' 'acidosis' 'acoustic' 'acquire'
 'acquire pneumonia' 'across' 'act' 'acth' 'actin' 'activate'
 'activate thromboplastin' 'activation' 'active' 'active bleeding'
 'acuity' 'acuity bcva' 'acuity eye' 'acutely' 'acyclovir' 'add'
 'addition' 'additional' 'additional file' 'additionally' 'address'
 'adenocarcinoma' 'adenoma' 'adenopathy' 'adenovirus' 'adequate' 'adhere'
 'adherence' 'adherent' 'adhesion' 'adipose' 'adipose tissue' 'adjacent'
 

In [21]:
# Apriori Algorithm for Frequent Itemset Mining
te = TransactionEncoder()
te_ary = te.fit(doc_to_ngrams).transform(doc_to_ngrams)
trans_df = pd.DataFrame(te_ary, columns=te.columns_)

freq_is = fpgrowth(trans_df, min_support=MIN_SUPPORT, use_colnames=True, max_len=2)
freq_is = freq_is.sort_values(['support', 'itemsets'], ascending=[False, True]).reset_index(drop=True)
print(f"Itemsets found: {len(freq_is)}")

Itemsets found: 33471


In [22]:
clean_df = pd.read_parquet(CLEAN_OUT)
trans_df['id'] = clean_df.index
clean_df['id'] = clean_df.index
trans_df.to_parquet(TRANS_OUT, index=False)


In [24]:

# convert itemsets from frozenset to string for easier readability

def itemset_to_label(itemset):
    return ' || '.join(sorted(list(itemset)))

# Arrow/Parquet cannot serialize frozenset objects directly.
# Convert itemsets to lists of strings so they can be saved to Parquet.
freq_is['itemsets'] = freq_is['itemsets'].apply(lambda x: list(x))

freq_is['pattern_label'] = freq_is['itemsets'].apply(itemset_to_label)
freq_is['support_count'] = (freq_is['support'] * len(clean_df)).astype(int)

# add length of each itemset (was missing and caused KeyError)
freq_is['len'] = freq_is['itemsets'].apply(len)

freq_is = freq_is[['pattern_label', 'itemsets', 'support', 'support_count', 'len']].sort_values(by='support', ascending=False)
freq_is.to_parquet(PATTERN_OUT, index=False)
print(f"Frequent patterns saved to {PATTERN_OUT}")
print(freq_is.head(20))

Frequent patterns saved to outputs/patterns.parquet
    pattern_label         itemsets   support  support_count  len
0            cell           [cell]  0.386725           2989    1
1            pain           [pain]  0.385949           2983    1
2           three          [three]  0.268340           2074    1
3           first          [first]  0.256566           1983    1
4            area           [area]  0.235347           1819    1
5          severe         [severe]  0.228102           1763    1
6          tissue         [tissue]  0.221633           1713    1
7            note           [note]  0.214387           1657    1
8           tumor          [tumor]  0.214387           1657    1
9            make           [make]  0.206107           1593    1
10          large          [large]  0.205589           1589    1
11          serum          [serum]  0.199638           1543    1
12          range          [range]  0.194851           1506    1
13         second         [second]  0.

In [27]:
import numpy as np
from tqdm.auto import tqdm
import nltk, re
from collections import defaultdict
import ast

clean_df = pd.read_parquet(CLEAN_OUT)
pattern_df = pd.read_parquet(PATTERN_OUT)
trans_df = pd.read_parquet(TRANS_OUT)

print(clean_df.columns)

# Parse age function
def parse_age(value):
    
    if pd.isna(value):
        return None
    try:
        # a volte è una stringa rappresentante una lista
        data = ast.literal_eval(value)
        if isinstance(data, list) and len(data) > 0:
            entry = data[0]
            if isinstance(entry, (list, tuple)) and len(entry) == 2:
                num, unit = entry
                num = float(num)
                unit = str(unit).lower()
                if "year" in unit:
                    return num
                elif "month" in unit:
                    return num / 12.0
                elif "week" in unit:
                    return num / 52.0
                elif "day" in unit:
                    return num / 365.0
        return None
    except Exception:
        return None

# Age Buckets
def age_to_bin(age):
    try:
        age = float(age)
    except:
        return 'unknown'
    if age is None or np.isnan(age):
        return 'unknown'
    if age < 1:
        return '<1'
    if age < 18:
        return '0-17'
    elif age < 40:
        return '18-39'
    elif age < 60:
        return '40-59'
    elif age < 80:
        return '60-79'
    else:
        return '80+'

clean_df['age'] = clean_df['age'].apply(parse_age)
clean_df['age_bin'] = clean_df['age'].apply(age_to_bin)

rows_ts, rows_demog, rows_snip = [], [], []

# Columns from Transaction DataFrame to NumPy boolean matrix
cols_ngrams = [c for c in trans_df.columns if c != 'id']
mat = trans_df[cols_ngrams].to_numpy(dtype=np.bool_)

# Map pattern labels to column indices
col_indexes = {col: idx for idx, col in enumerate(cols_ngrams)}

print("Extracting time series, demographics, and snippets...")
for idx, row in tqdm(pattern_df.iterrows(), total=len(pattern_df)):
    itemset = row['itemsets']
    pattern_label = row['pattern_label']
    
    # Build mask for rows containing all n-grams in the itemset
    idxs = [col_indexes[tok] for tok in itemset if tok in col_indexes]
    if not idxs:
        continue

    mask = np.all(mat[:, idxs], axis=1)

    support_count = int(mask.sum())
    if support_count == 0:
        continue

    # Time Series
    if clean_df['pub_date'].notna().any():
        ts = (
            clean_df.loc[mask & clean_df['pub_date'].notna(), ['pub_date']]
            .groupby(pd.Grouper(key='pub_date', freq='M'))
            .size()
            .reset_index(name='count')
        )
        ts['pattern_label'] = pattern_label
        rows_ts.append(ts)

    # Demographics
    demog = (
        clean_df.loc[mask]
        .groupby(['age_bin', 'gender'])
        .size()
        .reset_index(name='count')
    )
    demog['pattern_label'] = pattern_label
    rows_demog.append(demog)

    # Snippets
    snippets = []
    sample_idxs = clean_df.index[mask][:3]
    for i in sample_idxs:
        raw_text = clean_df.at[i, 'patient']
        sents = nltk.sent_tokenize(raw_text)
        ctx = None
        for s in sents:
            if all(re.search(r'\b' + re.escape(tok) + r'\b', s, re.IGNORECASE) for tok in itemset):
                ctx = s
                break
        if not ctx:
            ctx = sents[0] if sents else raw_text[:200]
        snippets.append(ctx)
    rows_snip.append({'pattern_label': pattern_label, 'snippets': snippets})

# Combine and save Time Series
if rows_ts:
    ts_df = pd.concat(rows_ts, ignore_index=True)
    ts_df.to_parquet(TS_OUT, index=False)
    print(f"Time series data saved to {TS_OUT}")

# Combine and save Demographics
if rows_demog:
    demog_df = pd.concat(rows_demog, ignore_index=True)
    demog_df.to_parquet(DEMOG_OUT, index=False)
    print(f"Demographics data saved to {DEMOG_OUT}")

# Combine and save Snippets
if rows_snip:
    snip_df = pd.DataFrame(rows_snip)
    snip_df.to_parquet(SNIPPET_OUT, index=False)
    print(f"Snippets data saved to {SNIPPET_OUT}")

print("Processing complete.")


Index(['patient_id', 'patient_uid', 'PMID', 'file_path', 'title', 'patient',
       'age', 'gender', 'relevant_articles', 'similar_patients', 'pub_date',
       'text_clean'],
      dtype='object')
Extracting time series, demographics, and snippets...


100%|██████████| 33471/33471 [04:36<00:00, 120.87it/s]


Time series data saved to outputs/timeseries.parquet
Demographics data saved to outputs/demographics.parquet
Snippets data saved to outputs/snippets.parquet
Processing complete.
