In [1]:
import csv
import pandas as pd
import numpy as np
import re
import os

from tqdm.auto import tqdm

# Vectorization
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer

# Frequency Analysis
from mlxtend.frequent_patterns import apriori, association_rules, fpgrowth
from mlxtend.preprocessing import TransactionEncoder
import pyfpgrowth


INPUT_CSV = 'data/PMC-Patients.csv'
PUBMED_CSV = 'data/pubmed_data.csv'
CLEAN_OUT = 'outputs/PMC_clean.parquet'
TRANS_OUT = 'outputs/transactions.parquet'
PATTERN_OUT = 'outputs/patterns.parquet'
TS_OUT = 'outputs/timeseries.parquet'
SNIPPET_OUT = 'outputs/snippets.parquet'
DEMOG_OUT = 'outputs/demographics.parquet'

NGRAM_RANGE = (1, 2)
MIN_DF = 10 # Minimum document frequency for CountVectorizer
MAX_FEATURES = 100000 # Maximum number of features for CountVectorizer
MIN_SUPPORT = 0.02 # Minimum support for apriori algorithm
RANDOM_SAMPLE = None # Set to an integer for limited testing


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\teolo\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptro

In [None]:
from custom_stopwords import cust_sw

patients_df = pd.read_csv(INPUT_CSV)

patients_df = patients_df.dropna(subset=['patient']).reset_index(drop=True)

if RANDOM_SAMPLE is not None:
    patients_df = patients_df.sample(RANDOM_SAMPLE, random_state=42).reset_index(drop=True)

print(f"Total records to process: {len(patients_df)}")

pubmed_df = pd.read_csv(PUBMED_CSV)

for row in patients_df.itertuples():
    pmid = row.PMID
    pubmed_info = pubmed_df[pubmed_df['PMID'] == pmid]
    if not pubmed_info.empty:
        patients_df.at[row.Index, 'Year'] = pubmed_info.iloc[0]['Year']
        print(pubmed_info.iloc[0]['Year'])
        patients_df.at[row.Index, 'Month'] = pubmed_info.iloc[0]['Month']
        print(pubmed_info.iloc[0]['Month'])
        patients_df.at[row.Index, 'Day'] = pubmed_info.iloc[0]['Day']
        print(pubmed_info.iloc[0]['Day'])

# Convert 'Year', 'Month', 'Day' to numeric, coercing errors to NaN
patients_df['Year'] = pd.to_numeric(patients_df['Year'], errors='coerce')
patients_df['Month'] = pd.to_numeric(patients_df['Month'], errors='coerce') 
patients_df['Day'] = pd.to_numeric(patients_df['Day'], errors='coerce')

# Remove rows with missing date components
patients_df = patients_df.dropna(subset=['Year', 'Month', 'Day']).reset_index(drop=True)

# convert 'Year', 'Month', 'Day' to datetime
patients_df['pub_date'] = pd.to_datetime(patients_df[['Year', 'Month', 'Day']], errors='coerce')
patients_df.drop(columns=['Year', 'Month', 'Day'], inplace=True)

print(f"Records after date processing: {len(patients_df)}")


# Text Cleaning and Preprocessing
stop_words = set(stopwords.words('english'))

stop_words = stop_words.union(cust_sw)

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag): # nltk pos tag to wordnet pos tag
    if tag.startswith('J'): return wordnet.ADJ
    elif tag.startswith('V'): return wordnet.VERB
    elif tag.startswith('N'): return wordnet.NOUN
    elif tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def clean_and_lemmatize(text):
    s = str(text).lower()

    # pulizia base
    s = re.sub(r'\n(?=\w)', '', s)                # remove newline when immediately before a word char ("\nThe" -> "The")
    s = re.sub(r'\bn[\s]', ' ', s)                # nthe ecc.
    s = re.sub(r'\b\d+[a-z]+\b', ' ', s)          # remove "10mg", "3cm", ecc.
    s = re.sub(r'\b\d+\b', ' ', s)                # remove isolated numbers
    s = re.sub(r'[^a-z\s]', ' ', s)               # remove non-alphabetic characters
    s = re.sub(r'\s+', ' ', s).strip()            # remove extra spaces

    # tokenizzazione e lemmatizzazione
    tokens = nltk.word_tokenize(s)
    pos_tags = nltk.pos_tag(tokens)

    lemmas = [
        lemmatizer.lemmatize(t, get_wordnet_pos(p))
        for t, p in pos_tags
    ]

    # rimozione stopword e token troppo corti
    filtered = [w for w in lemmas if w not in stop_words and len(w) > 2]

    return " ".join(filtered)


tqdm.pandas(desc="Cleaning and lemmatizing text")
patients_df['text_clean'] = patients_df['patient'].progress_apply(clean_and_lemmatize)
# Show first patient and first cleaned text
# Print full original and cleaned text (access as scalar to avoid pandas truncation)
print(patients_df.loc[0, 'patient'])
print("\n--- cleaned ---\n")
print(patients_df.loc[0, 'text_clean'])

patients_df.to_parquet(CLEAN_OUT, index=False)
print(f"Cleaned data saved to {CLEAN_OUT}")

Total records to process: 167034
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
2021.0
11.0
15.0
2021.0
11.0
15.0
2021.0
11.0
15.0
2021.0
11.0
15.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
17.0
2021.0
11.0
15.0
2021.0
11.0
16.0
2021.0
12.0
15.0
2021.0
12.0
15.0
2021.0
11.0
26.0
2021.0
11.0
26.0
2021.0
11.0
17.0
2021.0
11.0
16.0
2021.0
11.0
24.0
2021.0
11.0
24.0
2021.0
11.0
24.0
2021.0
11.0
24.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
11.0
16.0
2021.0
12.0
21.0
2021.0
12.0
20.0
2021.0
12.0
21.0
2021.0
12.0
22.0
2021.0
12.0
22.0
2021.0
11.0
22.0
2021.0
12.0
20.0
2021.0
11.0
21.0
2021.0
11.0
21.0
2021.0
11.0
21.0
2021.0
11.0
22.0
2021.0
11.0
22.0
2021.0
12.0
15.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0
9.0
2021.0
12.0

Cleaning and lemmatizing text: 100%|██████████| 128748/128748 [32:34<00:00, 65.87it/s] 


A 45-year-old female was brought in by ambulance after collapsing at home secondary to a hypoglycemic event (capillary blood glucose of 1 mmol/L with paramedics). She had a history of restrictive AN, binge-purge behaviour, and an old traumatic brain injury, leaving her with memory problems. She was well known to mental health services, having been admitted multiple times to eating disorder centres for nasogastric feeding. She had never smoked in her life and denied any alcohol intake. The patient was on ferrous fumarate, fexofenadine, fluoxetine, ibuprofen, lansoprazole, quetiapine, supplemental vitamins, regular morphine (modified release), and gabapentin.\nOn admission, her blood pressure was 106/85 mmHg, respiratory rate was 20 breaths/minute, heart rate was 64 beats/minute, temperature was 35.1 °C, and capillary blood glucose was 6 mmol/L. Her weight on admission was 37.3 kg (body mass index [BMI] = 12.6). On examination, she was clearly malnourished, cachexic, and dehydrated. The 

In [None]:
clean_df = pd.read_parquet(CLEAN_OUT)
texts = clean_df['text_clean'].astype(str).tolist()

vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, min_df=MIN_DF, max_features=MAX_FEATURES, binary=True)  # max_features=MAX_FEATURES

tqdm.pandas(desc="Vectorizing text")

X = vectorizer.fit_transform(texts)

sum_words = X.sum(axis=0)
words_freq = [(word, sum_words[0, idx]) for word, idx in vectorizer.vocabulary_.items()] # get word frequencies
words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True) # sort by frequency, key=lambda x: x[1] makes sure we sort by frequency

pattern_df = pd.DataFrame(words_freq, columns=['Pattern', 'Frequency'])
pattern_df.to_csv('outputs/freq_terms.csv', index=False)
top_patterns = pattern_df.head(20)

import plotly.express as px

fig = px.bar(top_patterns, x='Pattern', y='Frequency', title='Top 20 N-grams in Patient Descriptions')
fig.update_layout(xaxis_tickangle=-45)
fig.show()


distutils Version classes are deprecated. Use packaging.version instead.


distutils Version classes are deprecated. Use packaging.version instead.



In [4]:
pd.set_option('display.max_colwidth', None)  # Set max column width to None for better display

clean_df = pd.read_parquet(CLEAN_OUT)
texts = clean_df['text_clean'].astype(str).tolist()

vectorizer = CountVectorizer(ngram_range=NGRAM_RANGE, min_df=MIN_DF, max_features=MAX_FEATURES, binary=True)  # max_features=MAX_FEATURES

tqdm.pandas(desc="Vectorizing text")

X = vectorizer.fit_transform(texts)

vocab = vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(vocab)}")
print(f"First 1000 terms: {vocab[:1000]}")

# Build transactions - each document as a list of present n-grams
rows, cols = X.nonzero() # indices of rows and columns with non-zero entries

doc_to_ngrams = [[] for _ in range(X.shape[0])]
for r, c in zip(rows, cols):
    doc_to_ngrams[r].append(vocab[c])

# Transaction Example
print(f"Document 0 n-grams: {doc_to_ngrams[0]}")

Vocabulary size: 100000
First 1000 terms: ['aaa' 'aaa repair' 'aac' 'aad' 'aag' 'aai' 'aast' 'aat' 'aav' 'abacavir'
 'abacavir lamivudine' 'abandon' 'abatacept' 'abate' 'abbot' 'abbott'
 'abbott abbott' 'abbott chicago' 'abbott molecular' 'abbott optic'
 'abbott park' 'abbott usa' 'abbott vascular' 'abbreviate' 'abbreviated'
 'abbvie' 'abc' 'abca' 'abcam' 'abcam cambridge' 'abcb' 'abcc' 'abcd'
 'abciximab' 'abdomen' 'abdomen abdomen' 'abdomen abnormal'
 'abdomen abnormality' 'abdomen abscess' 'abdomen absence'
 'abdomen accompany' 'abdomen active' 'abdomen additional'
 'abdomen adrenal' 'abdomen air' 'abdomen amount' 'abdomen angiography'
 'abdomen appendicitis' 'abdomen appendix' 'abdomen arm' 'abdomen ascites'
 'abdomen back' 'abdomen become' 'abdomen benign' 'abdomen bone'
 'abdomen bowel' 'abdomen brain' 'abdomen bulky' 'abdomen cervical'
 'abdomen cholelithiasis' 'abdomen circumferential' 'abdomen close'
 'abdomen collection' 'abdomen colonoscopy' 'abdomen complain'
 'abdomen comp

In [None]:
te = TransactionEncoder()
te_ary = te.fit(doc_to_ngrams).transform(doc_to_ngrams)
trans_df = pd.DataFrame(te_ary, columns=te.columns_)

#freq_is = fpgrowth(trans_df, min_support=MIN_SUPPORT, use_colnames=True, max_len=2)  # --- IGNORE ---

min_support_count = int(MIN_SUPPORT * len(doc_to_ngrams))

freq_is = pyfpgrowth.find_frequent_patterns(doc_to_ngrams, min_support_count)
#transform freq_is into dataframe as returned by fpgrowth library (columns: itemsets, support)
freq_is = pd.DataFrame(freq_is.items(), columns=['itemsets', 'support_count'])
freq_is = freq_is.sort_values(['support_count', 'itemsets'], ascending=[False, True]).reset_index(drop=True)
print(f"Itemsets found: {len(freq_is)}")

Itemsets found: 3859


In [None]:
clean_df = pd.read_parquet(CLEAN_OUT)
trans_df['id'] = clean_df.index
clean_df['id'] = clean_df.index
trans_df.to_parquet(TRANS_OUT, index=False)
# trans_df and clean_df with same 'id' column saved to parquet


In [None]:

# convert itemsets from frozenset to string for easier readability

def itemset_to_label(itemset):
    return ' || '.join(sorted(list(itemset)))

# Arrow/Parquet cannot serialize frozenset objects directly.
# Convert itemsets to lists of strings so they can be saved to Parquet.
freq_is['itemsets'] = freq_is['itemsets'].apply(lambda x: list(x))

freq_is['pattern_label'] = freq_is['itemsets'].apply(itemset_to_label)
freq_is['support'] = (freq_is['support_count'] / len(clean_df)).astype(float)

# add length of each itemset as a new column
freq_is['len'] = freq_is['itemsets'].apply(len)

freq_is = freq_is[['pattern_label', 'itemsets', 'support', 'support_count', 'len']].sort_values(by='support', ascending=False)
freq_is.to_parquet(PATTERN_OUT, index=False)
print(f"Frequent patterns saved to {PATTERN_OUT}")
print(freq_is.head(20))

Frequent patterns saved to outputs/patterns.parquet
                              pattern_label  \
0                                    tissue   
1                                     tumor   
2                       protein || reactive   
5              reactive || reactive protein   
3   protein || reactive || reactive protein   
4               protein || reactive protein   
6                                  evaluate   
7                                    injury   
8                                      hand   
9                           tissue || tumor   
10                                   trauma   
11                         protein || serum   
12                                    order   
13                                  consent   
14                                reduction   
15                                injection   
16                       resection || tumor   
17                                   relate   
18                         fever || protein   
19      

In [None]:
import numpy as np
from tqdm.auto import tqdm
import nltk, re
from collections import defaultdict
import ast

clean_df = pd.read_parquet(CLEAN_OUT)
pattern_df = pd.read_parquet(PATTERN_OUT)
trans_df = pd.read_parquet(TRANS_OUT)

print(clean_df.columns)

# Parse age function
def parse_age(value):
    
    if pd.isna(value):
        return None
    try:
        # a volte è una stringa rappresentante una lista
        data = ast.literal_eval(value)
        if isinstance(data, list) and len(data) > 0:
            entry = data[0]
            if isinstance(entry, (list, tuple)) and len(entry) == 2:
                num, unit = entry
                num = float(num)
                unit = str(unit).lower()
                if "year" in unit:
                    return num
                elif "month" in unit:
                    return num / 12.0
                elif "week" in unit:
                    return num / 52.0
                elif "day" in unit:
                    return num / 365.0
        return None
    except Exception:
        return None

# Age Buckets
def age_to_bin(age):
    try:
        age = float(age)
    except:
        return 'unknown'
    if age is None or np.isnan(age):
        return 'unknown'
    if age < 1:
        return '<1'
    if age < 18:
        return '0-17'
    elif age < 40:
        return '18-39'
    elif age < 60:
        return '40-59'
    elif age < 80:
        return '60-79'
    else:
        return '80+'

clean_df['age'] = clean_df['age'].apply(parse_age)
clean_df['age_bin'] = clean_df['age'].apply(age_to_bin)

rows_ts, rows_demog, rows_snip = [], [], []

# Columns from Transaction DataFrame to NumPy boolean matrix
cols_ngrams = [c for c in trans_df.columns if c != 'id']
mat = trans_df[cols_ngrams].to_numpy(dtype=np.bool_)

# Map pattern labels to column indices
col_indexes = {col: idx for idx, col in enumerate(cols_ngrams)}

print("Extracting time series, demographics, and snippets...")
for idx, row in tqdm(pattern_df.iterrows(), total=len(pattern_df)):
    itemset = row['itemsets']
    pattern_label = row['pattern_label']
    
    # Build mask for rows containing all n-grams in the itemset
    idxs = [col_indexes[tok] for tok in itemset if tok in col_indexes]
    if not idxs:
        continue

    mask = np.all(mat[:, idxs], axis=1)

    support_count = int(mask.sum()) # number of documents containing the itemset (all itemset tokens)
    if support_count == 0:
        continue

    # Time Series
    if clean_df['pub_date'].notna().any():
        ts = (
            clean_df.loc[mask & clean_df['pub_date'].notna(), ['pub_date']] #pub_date of rows matching the pattern and with non-na pub_date
            .groupby(pd.Grouper(key='pub_date', freq='M'))
            .size()
            .reset_index(name='count')
        )
        ts['pattern_label'] = pattern_label
        rows_ts.append(ts)

    # Demographics
    demog = (
        clean_df.loc[mask]
        .groupby(['age_bin', 'gender'])
        .size()
        .reset_index(name='count')
    )
    demog['pattern_label'] = pattern_label
    rows_demog.append(demog)

    # Snippets
    snippets = []
    sample_idxs = clean_df.index[mask][:3]
    for i in sample_idxs:
        raw_text = clean_df.at[i, 'patient']
        sents = nltk.sent_tokenize(raw_text)
        ctx = None
        for s in sents:
            if all(re.search(r'\b' + re.escape(tok) + r'\b', s, re.IGNORECASE) for tok in itemset):
                ctx = s
                break
        if not ctx:
            ctx = sents[0] if sents else raw_text[:200]
        snippets.append(ctx)
    rows_snip.append({'pattern_label': pattern_label, 'snippets': snippets})

# Combine and save Time Series
if rows_ts:
    ts_df = pd.concat(rows_ts, ignore_index=True)
    ts_df.to_parquet(TS_OUT, index=False)
    print(f"Time series data saved to {TS_OUT}")

# Combine and save Demographics
if rows_demog:
    demog_df = pd.concat(rows_demog, ignore_index=True)
    demog_df.to_parquet(DEMOG_OUT, index=False)
    print(f"Demographics data saved to {DEMOG_OUT}")

# Combine and save Snippets
if rows_snip:
    snip_df = pd.DataFrame(rows_snip)
    snip_df.to_parquet(SNIPPET_OUT, index=False)
    print(f"Snippets data saved to {SNIPPET_OUT}")

print("Processing complete.")


Index(['patient_id', 'patient_uid', 'PMID', 'file_path', 'title', 'patient',
       'age', 'gender', 'relevant_articles', 'similar_patients', 'pub_date',
       'text_clean'],
      dtype='object')
Extracting time series, demographics, and snippets...


100%|██████████| 3859/3859 [01:01<00:00, 63.14it/s]


Time series data saved to outputs/timeseries.parquet
Demographics data saved to outputs/demographics.parquet
Snippets data saved to outputs/snippets.parquet
Processing complete.
