In [11]:
import pandas as pd
from nltk.tokenize import word_tokenize
import re
from gensim.parsing.preprocessing import remove_stopwords

# Read the training notes from CSV into the working DataFrame
TRAINING_TEXT_CSV = 'data/training_text.csv'
df = pd.read_csv(TRAINING_TEXT_CSV)

def preprocess_text(text):
    """Normalize one note and split it into word tokens.

    Pipeline: lowercase -> strip digits -> turn punctuation and underscores
    into spaces -> collapse whitespace -> drop stopwords -> tokenize.

    Args:
        text: Raw note text. Any non-string value (for example ``None`` or a
            float NaN coming from a missing cell) is treated as an empty
            document instead of raising ``AttributeError`` on ``.lower()``.

    Returns:
        list[str]: The remaining tokens; an empty list when nothing is left
        after cleaning.
    """
    # Missing / non-text cells have nothing to tokenize.
    if not isinstance(text, str):
        return []

    text = text.lower()  # Lowercase the text
    text = re.sub(r'\d+', '', text)  # Remove numbers

    # Replace punctuation with a space (not ''), so words separated only by
    # punctuation ("pain,fever", "c/b") are not glued into a single token.
    # '_' is matched explicitly because \w includes it; this drops the "___"
    # placeholder runs that appear in the notes.
    text = re.sub(r'[^\w\s]|_', ' ', text)

    # Collapse whitespace last, after the substitutions above introduced
    # extra spaces, and trim the ends.
    text = re.sub(r'\s+', ' ', text).strip()
    if not text:
        return []

    text = remove_stopwords(text)  # Remove stopwords
    tokens = word_tokenize(text)  # Tokenize the text
    return tokens

# Tokenize every note and keep the token lists in a new 'tokens' column
raw_notes = df['training_text']
df['tokens'] = raw_notes.map(preprocess_text)


In [12]:
# Preview the first two rows to confirm the new 'tokens' column is present
df.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,discharge_location,adm_text,severity_level,note_id,disc_charttime,disc_text,Chief Complaint,...,Brief Hospital Course,Medications on Admission,Discharge Medications,Discharge Disposition,Discharge Diagnosis,Discharge Condition,Discharge Instructions,Followup Instructions,training_text,tokens
0,10000032,22595853,2180-05-06 22:23:00,HOME,"Subject ID: 10000032, Hospital Admission ID: 2...",Minimal Care/Recovery at Home,10000032-DS-21,2180-05-07,"Subject ID: 10000032, HAdm ID: 22595853, Chart...",Worsening ABD distension and pain,...,"___ HCV cirrhosis c/b ascites, hiv on ART, h/o...",The Preadmission Medication list is accurate a...,1. Albuterol Inhaler 2 PUFF IH Q4H:PRN wheezin...,Home,Ascites from Portal HTN,Mental Status: Clear and coherent.\nLevel of C...,"Dear Ms. ___,\nIt was a pleasure taking care o...",___,"Subject ID: 10000032, Hospital Admission ID: 2...","[subject, id, hospital, admission, id, admissi..."
1,10000032,22841357,2180-06-26 18:27:00,HOME,"Subject ID: 10000032, Hospital Admission ID: 2...",Minimal Care/Recovery at Home,10000032-DS-22,2180-06-27,"Subject ID: 10000032, HAdm ID: 22841357, Chart...",abdominal fullness and discomfort,...,"___ with HIV on HAART, HCV cirrhosis with asci...",The Preadmission Medication list is accurate a...,"1. Acetaminophen 500 mg PO Q6H:PRN pain,fever ...",Home,Primary: diuretic refractory ascites\nSeconda...,Mental Status: Clear and coherent.\nLevel of C...,"Dear ___,\n\n___ was a pleasure to take care o...",___,"Subject ID: 10000032, Hospital Admission ID: 2...","[subject, id, hospital, admission, id, admissi..."


In [13]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Wrap each token list as a TaggedDocument tagged with its positional row number
documents = [
    TaggedDocument(tokens, [position])
    for position, tokens in enumerate(df['tokens'])
]

# Doc2Vec hyperparameters, gathered in one place for easier tuning
doc2vec_params = dict(
    vector_size=300,
    window=5,
    min_count=2,
    workers=10,
    epochs=40,
    alpha=0.025,
    min_alpha=0.0001,
    dm=1,          # PV-DM
    dbow_words=1,  # per the gensim docs this only matters for PV-DBOW (dm=0)
)

# Train the Doc2Vec model on the tagged documents
model = Doc2Vec(documents, **doc2vec_params)

# Infer one vector per note from its tokens
df['doc2vec_vector'] = df['tokens'].apply(model.infer_vector)


In [14]:
from sklearn.cluster import KMeans
import numpy as np

# Number of clusters -- a tunable choice, adjust to the data
num_clusters = 3

# Stack the per-note vectors into a single 2-D array (one row per note)
vectors = np.asarray(df['doc2vec_vector'].to_list())

# Fit K-Means, then store each note's assigned cluster label
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(vectors)
df['cluster'] = kmeans.labels_


In [15]:
# For every cluster id, look up the most frequent true severity level among
# the rows assigned to that cluster (first value returned by mode() on ties)
cluster_mapping = {
    cluster_id: df.loc[df['cluster'] == cluster_id, 'severity_level'].mode().values[0]
    for cluster_id in range(num_clusters)
}

# Label each row with the dominant severity level of its cluster
df['predicted_severity_level'] = df['cluster'].map(cluster_mapping)


In [16]:
from sklearn.metrics import classification_report

# Evaluate the cluster-derived labels against the true severity levels.
# Note: the cluster -> label mapping was built from these same rows, so this
# is an in-sample score, not a held-out estimate.
# zero_division=0 reports 0.00 for any class that is never predicted (its
# precision is undefined) without emitting an UndefinedMetricWarning per metric.
print(
    classification_report(
        df['severity_level'],
        df['predicted_severity_level'],
        zero_division=0,
    )
)


                                precision    recall  f1-score   support

 Minimal Care/Recovery at Home       0.74      1.00      0.85      9508
Moderate Care/Support Required       0.00      0.00      0.00      2723
                Severe Outcome       0.00      0.00      0.00       546

                      accuracy                           0.74     12777
                     macro avg       0.25      0.33      0.28     12777
                  weighted avg       0.55      0.74      0.63     12777



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
