In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data-loading cell reads the XML records from a folder under MyDrive
drive.mount("/content/drive")

In [None]:
# necessary imports
!pip install xmltodict
import os
import tarfile
import xmltodict
import pandas as pd
import xml.etree.ElementTree as ET

In [None]:
# Load the TEXT portion of every XML record in the project folder into a dataframe
folder_path = "drive/MyDrive/BAIS6100/Datasets/Project3_data"

# List all XML files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the row order reproducible between runs.
xml_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.xml'))

# One {'File Name', 'Text'} record per XML file
data_list = []

for file_name in xml_files:
    file_path = os.path.join(folder_path, file_name)

    # Parse the XML file
    tree = ET.parse(file_path)
    root = tree.getroot()

    # Extract 'TEXT' data. A missing <TEXT> child and an empty one (whose
    # .text is None) both fall back to the same placeholder string, so the
    # 'Text' column always holds a str.
    text_element = root.find('TEXT')
    text_data = text_element.text if text_element is not None else None
    if text_data is None:
        text_data = "No TEXT found"

    data_list.append({'File Name': file_name, 'Text': text_data})

# Name the columns explicitly so they exist even when no XML file was found
df = pd.DataFrame(data_list, columns=['File Name', 'Text'])

In [None]:
# Split the file name into patientID (the part before the first "-") and
# visit (the part between that "-" and the first "."), keeping the note text
df[['patientID', 'visit']] = df['File Name'].str.split("-", n=1, expand=True)

# Keep only the part of 'visit' before the first "."; the remainder of the
# file name is not needed, so it is never stored in a column
df['visit'] = df['visit'].str.split(".", n=1).str[0]

df = df.rename(columns={'Text': 'text'})

# Select relevant columns and sort by 'patientID', then 'visit'
df = df[['patientID', 'visit', 'text']].sort_values(by=['patientID', 'visit'])

In [None]:
print(df.iloc[10, 2]) # view the text column (position 2) of the row at position 10, i.e. the 11th record (example usage)

In [None]:
# total number of records (rows currently in df)
print(df.shape[0])

In [None]:
# number of distinct patients (unique patientID values)
print(df['patientID'].nunique())

# Blood Pressure

In [None]:
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Capture the first "<2-3 digits> / <2-3 digits>" pattern in each note as the
# systolic/diastolic pair and convert both captures to numbers in one pass
df[['systole', 'diastole']] = (
    df['text']
    .str.extract(r'(\d{2,3})\s*/\s*(\d{2,3})')
    .apply(pd.to_numeric, errors='coerce')
)

# Notes without a matching pattern have no reading; discard those rows
df = df.dropna(subset=['systole', 'diastole'])

# Categorize blood pressure levels
def classify_bp(systole, diastole):
    """Map a systolic/diastolic reading to a blood-pressure category.

    Categories are checked from most to least severe, so each branch only
    needs a lower bound: anything that reached it has already failed the
    stricter checks above. Using open-ended bounds (instead of closed integer
    ranges such as 130-139) means fractional readings like 129.5 or 139.5 are
    still classified rather than falling through.

    Args:
        systole: Systolic pressure (number).
        diastole: Diastolic pressure (number).

    Returns:
        One of "Crisis", "Hypertension 2", "Hypertension 1", "Elevated",
        "Normal", or "Unknown" when no category applies (e.g. a NaN value,
        for which every comparison is False).
    """
    if systole > 180 or diastole > 120:
        return "Crisis"
    if systole >= 140 or diastole >= 90:
        return "Hypertension 2"
    if systole >= 130 or diastole >= 80:
        return "Hypertension 1"
    if systole >= 120 and diastole < 80:
        return "Elevated"
    # Kept as an explicit check (not a bare fallback) so NaN inputs end up
    # as "Unknown" instead of being reported as "Normal"
    if systole < 120 and diastole < 80:
        return "Normal"
    return "Unknown"

# Label every record with its blood-pressure category. Walking the two numeric
# columns in parallel avoids building a Series per row (as df.apply(axis=1)
# does) and still yields a proper, empty column when df has no rows.
df.loc[:, 'level'] = pd.Series(
    [classify_bp(systole, diastole)
     for systole, diastole in zip(df['systole'], df['diastole'])],
    index=df.index,
    dtype=object,
)

# Remove "Unknown" values. The explicit .copy() makes the filtered frame
# independent of the one it was sliced from, so the column assignments made
# in later cells do not raise SettingWithCopyWarning.
df = df[df['level'] != "Unknown"].copy()



In [None]:
# head must be called: printing the bare attribute shows the bound method, not the first rows
print(df.head())

In [None]:
# save to csv
#df.to_csv('drive/MyDrive/BAIS6100/project_df.csv', index=False)

In [None]:
# plot counts of each blood pressure level, faceted by visit number
import seaborn as sns
import matplotlib.pyplot as plt

# Create factor for BP levels, in the order the bars should appear
bp_levels = ["Normal", "Elevated", "Hypertension 1", "Hypertension 2", "Crisis"]
df['level_fct'] = pd.Categorical(df['level'], categories=bp_levels, ordered=True)

# Same for visits: the column keeps the raw two-digit codes, and each code is
# paired (by position) with the readable label used in the facet titles
visit_codes = ["01", "02", "03", "04", "05"]
visit_labels = ["Visit 1", "Visit 2", "Visit 3", "Visit 4", "Visit 5"]
df['visit_fct'] = pd.Categorical(df['visit'], categories=visit_codes, ordered=True)

# Faceted bar plot by visit number
g = sns.catplot(
    data=df,
    x="level_fct",
    hue="level_fct",
    col="visit_fct",  # Facet by visit
    kind="count",
    palette="coolwarm",
    order=bp_levels,
    col_wrap=3,
    height=4,
    aspect=1.2
)

# Title each facet with its readable visit label instead of the raw code;
# axes_dict maps each value of the `col` variable to that facet's Axes
visit_title = dict(zip(visit_codes, visit_labels))
for visit_code, ax in g.axes_dict.items():
    ax.set_title(f"BP Distribution - {visit_title.get(visit_code, visit_code)}")
g.set_axis_labels("BP Level", "Num Observations")
g.set_xticklabels(rotation=45)

# Size the y-axis from the tallest bar actually drawn (the largest count for
# any single visit/level pair) plus a little headroom. Counting levels over
# all visits pooled together would overshoot every facet.
max_facet_count = df.groupby(['visit_fct', 'level_fct'], observed=True).size().max()
g.set(ylim=(0, max_facet_count + 5))
plt.show()





In [None]:
# Alternative view: a single grouped bar chart with visits on the x-axis and
# one bar per blood-pressure level within each visit
fig, ax = plt.subplots()
sns.countplot(data=df, x="visit", hue="level", ax=ax)
plt.show()

# Social History

In [None]:
import pandas as pd
import re

def extract_social_history(text):
    """Return the social-history portion of a note, or None if none is found.

    The section start is located by, in order of preference:
      1. the first case-insensitive occurrence of "social history"; the
         section begins one character after the phrase (skipping whatever
         single character follows it), or
      2. the first stand-alone "SH" or "SHx" token; the section begins three
         characters after the start of the token. Requiring word boundaries
         keeps words that merely contain the letters "SH" (for example
         "WASHINGTON") from being mistaken for the abbreviation.

    The section runs until the first run of three consecutive newlines; if
    there is none, at most the next 200 characters are returned.

    Args:
        text: The full note text. Non-string values (None, NaN) are treated
            as having no social history.

    Returns:
        The extracted substring (not stripped), or None when no start marker
        is present.
    """
    if not isinstance(text, str):
        return None

    # Find start index
    heading = re.search(r'(?i)social history', text)
    if heading:
        # same position as heading.start() + 15: the phrase is 14 characters long
        soc_startin = heading.end() + 1
    else:
        abbrev = re.search(r'\bSHx?\b', text)
        if abbrev is None:
            return None  # No recognizable start
        soc_startin = abbrev.start() + 3

    rest_of_text = text[soc_startin:]

    # Find end index: look for "\n\n\n"
    soc_endin = rest_of_text.find('\n\n\n')
    if soc_endin != -1:
        return rest_of_text[:soc_endin]
    return rest_of_text[:200]  # Default to 200 characters

# Pull the social-history section out of every note (None when absent)
df['soc_history'] = df['text'].apply(extract_social_history)

# Record-level flag: True where a section was extracted
df['has_history'] = df['soc_history'].notna()

# Patient-level count: distinct patients with at least one flagged record
num_patients_with_history = df.loc[df['has_history'], 'patientID'].nunique()

# One row per patient for topic modeling: that patient's extracted sections
# joined with single spaces and exposed under the column name 'history'
combined_hist = (
    df.loc[df['has_history']]
    .groupby('patientID')['soc_history']
    .apply(' '.join)
    .reset_index()
    .rename(columns={'soc_history': 'history'})
)


In [None]:
df

In [None]:
combined_hist

# Topic Modeling

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import nltk
# The stop-word corpus is needed by the stemmer below (ignore_stopwords=True)
# and by the stop-word list passed to the vectorizer later on
nltk.download("stopwords")
stemmer = nltk.stem.SnowballStemmer("english", ignore_stopwords=True)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer stems each token with the module-level `stemmer`."""

    def build_analyzer(self):
        """Return a callable that runs the parent analyzer and stems every token it yields."""
        base_analyzer = super().build_analyzer()

        def stemmed_analyzer(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_analyzer

Choose # of Topics

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the per-patient histories (fixed seed) and renumber both
# parts from 0 so they no longer carry their positions in combined_hist
df_train, df_test = (
    part.reset_index(drop=True)
    for part in train_test_split(combined_hist, test_size=0.33, random_state=2021)
)

In [None]:
# English stop-word list from NLTK, handed to the stemming vectorizer
global_stopwords = nltk.corpus.stopwords.words("english")

# The vocabulary (capped at 100 features) is learned from the training
# histories only; the held-out histories are encoded with that same vocabulary
vectorizer = StemmedCountVectorizer(
    stop_words=global_stopwords,
    max_features=100,
)
train_x = vectorizer.fit_transform(df_train["history"])
test_x = vectorizer.transform(df_test["history"])

In [None]:
# Refit the same LDA estimator for each candidate topic count on the training
# matrix and record its perplexity on the held-out matrix
lda = LatentDirichletAllocation(n_jobs=-1, random_state=6100)
num_topics = list(range(2, 11))
perplexity = []
for i in num_topics:
    print(i)
    lda.set_params(n_components=i).fit(train_x)
    perplexity.append(lda.perplexity(test_x))

# Held-out perplexity as a function of the number of topics
plt.plot(num_topics, perplexity)
plt.xlabel("Number of Topics")
plt.ylabel("Perplexity")

The perplexity values are high. That doesn't mean much in a vacuum, but it could suggest that there is high variance in the social histories.

In [None]:
# Refit the vectorizer on every patient's combined history (not just the
# training split) and fit a 3-topic LDA model on the resulting matrix
DTM = vectorizer.fit_transform(combined_hist["history"])
lda_params = {"n_components": 3, "n_jobs": -1, "random_state": 6100}
lda = LatentDirichletAllocation(**lda_params)
lda.fit(DTM)

Trying PyLDAvis

In [None]:
#!pip install pyLDAvis
#!pip install numpy==1.24.4
import pyLDAvis
import pyLDAvis.lda_model

# Prepare the pyLDAvis data for the fitted LDA model, its document-term matrix
# and the vectorizer. The result is only stored in `html`: this cell does not
# display it, and the save_html call below is commented out.
html = pyLDAvis.lda_model.prepare(lda, DTM, vectorizer)

# save html
#pyLDAvis.save_html(html, 'drive/MyDrive/BAIS6100/lda_sklearn.html')

Within-topic and between-topic similarity

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# get document-topic distribution matrix (rows = docs, cols = topics)
doc_topic_dist = lda.transform(DTM)  # shape: (num_docs, num_topics)
print(doc_topic_dist.shape)
# normalize each row to unit L2 length for cosine similarity
doc_topic_dist = normalize(doc_topic_dist, norm='l2')

# Assign each document to its dominant topic
doc_labels = np.argmax(doc_topic_dist, axis=1)

# within-cluster similarities: mean pairwise cosine similarity among the
# documents assigned to each topic
within_sims = []
for topic_id in range(lda.n_components):
    indices = np.where(doc_labels == topic_id)[0] # get indices for documents assigned to this topic
    if len(indices) < 2:
        # A topic with zero or one document has no pair to compare, so it is
        # skipped here, mirroring the empty-topic guard in the centroid loop
        # below (and never handing cosine_similarity an empty array).
        continue
    cluster_docs = doc_topic_dist[indices] # subset docs according to those indices
    sims = cosine_similarity(cluster_docs) # computes cosine similarity for every pair of documents
    upper_tri = sims[np.triu_indices_from(sims, k=1)] # matrix is necessarily symmetric, so take just the upper triangle
    within_sims.append(np.mean(upper_tri))

# NaN (rather than a warning from averaging nothing) when no topic had a pair
avg_within_similarity = np.mean(within_sims) if within_sims else np.nan

# between-cluster similarity: cosine similarity between the mean vectors
# (centroids) of the non-empty topics
topic_centroids = []
for topic_id in range(lda.n_components):
    indices = np.where(doc_labels == topic_id)[0]
    if len(indices) == 0:
        continue
    topic_centroids.append(doc_topic_dist[indices].mean(axis=0))

if len(topic_centroids) >= 2:
    centroid_sim_matrix = cosine_similarity(topic_centroids)
    between_sims = centroid_sim_matrix[np.triu_indices_from(centroid_sim_matrix, k=1)]
    avg_between_similarity = np.mean(between_sims)
else:
    # Fewer than two non-empty topics: there is no centroid pair to compare
    centroid_sim_matrix = np.ones((len(topic_centroids), len(topic_centroids)))
    between_sims = np.array([])
    avg_between_similarity = np.nan


In [None]:
print(within_sims) # mean pairwise within-cluster similarity, one score per topic that produced at least one document pair
print(avg_within_similarity) # mean of the per-topic within-cluster scores above
print(between_sims) # cosine similarity for each pair of topic centroids (upper triangle of the centroid similarity matrix)
print(avg_between_similarity) # mean of the pairwise centroid similarities