In [None]:

!pip install --upgrade google-cloud


In [None]:
!pip install google-cloud-bigquery

!gcloud --version

In [None]:
# Show the accounts gcloud currently holds credentials for.
!gcloud auth list
# Run the application-default login command (interactive).
!gcloud auth application-default login

In [None]:
# Google Cloud project ID: passed to the BigQuery client and interpolated into
# the table references of the SQL queries in this notebook.
project_name = "data-project-455021"

In [None]:
# Run to test that everything's working with Google Cloud
from google.cloud import bigquery
from google.oauth2 import service_account
import pandas as pd
import matplotlib.pyplot as plt
import db_dtypes as db_dtypes

# BigQuery client bound to the configured project.
client = bigquery.Client(project=project_name)

# List datasets in the specified project (materialised so the list can be reused).
datasets = list(client.list_datasets())

# Print the dataset names as a quick connectivity check.
for dataset in datasets:
    print(dataset.dataset_id)


# Load the whole PATIENTS table into a DataFrame, one dict per result row.
sql = f'SELECT * FROM `{project_name}.mimic3_v1_4.PATIENTS`'
query_job = client.query(sql)
rows = query_job.result()
patients_df = pd.DataFrame(list(map(dict, rows)))

# Bar chart of the number of patients per GENDER value.
gender_counts = patients_df['GENDER'].value_counts()
fig, ax = plt.subplots(figsize=(8, 6))
gender_counts.plot(kind='bar', ax=ax)

ax.set_title('Gender Distribution in MIMIC-III Patient Dataset')
ax.set_xlabel('Gender')
ax.set_ylabel('Count')

plt.show()

In [None]:
# Join every admission to its coded diagnoses. The LEFT JOIN keeps admissions
# that have no DIAGNOSES_ICD row; those come back with a NULL ICD9_CODE.
sql = f"""
SELECT
    a.SUBJECT_ID,
    a.HADM_ID,
    d.ICD9_CODE,
    a.ADMITTIME
FROM
    `{project_name}.mimic3_v1_4.ADMISSIONS` as a
LEFT JOIN
    `{project_name}.mimic3_v1_4.DIAGNOSES_ICD` as d
ON
    a.SUBJECT_ID = d.SUBJECT_ID AND a.HADM_ID = d.HADM_ID
"""
query_job = client.query(sql)
rows = query_job.result()

diagnoses_df = pd.DataFrame([dict(row) for row in rows])

# True where a diagnosis code is present, False where the join found none.
# notnull() always yields plain booleans, so there is nothing to back-fill.
diagnoses_df['DIAGNOSIS_FLAG'] = diagnoses_df['ICD9_CODE'].notnull()

# Explicit copy: the ground-truth frame is independent of diagnoses_df, so later
# edits to it cannot raise SettingWithCopyWarning or write through to the source.
ground_truth_df = diagnoses_df[
    ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'ICD9_CODE', 'DIAGNOSIS_FLAG']
].copy()

print("Ground Truth DataFrame:", len(ground_truth_df))
print(ground_truth_df.head(5))

In [None]:
# Frequency of every ICD9_CODE, most common first.
icd9_counts = diagnoses_df['ICD9_CODE'].value_counts()

# Plot the 20 most frequent codes.
fig, ax = plt.subplots(figsize=(10, 6))
icd9_counts.head(20).plot(kind='bar', ax=ax)

ax.set_title('Top 20 Most Common ICD-9 Diagnoses')
ax.set_xlabel('ICD-9 Code')
ax.set_ylabel('Count')
# Tilt the tick labels so neighbouring codes stay readable.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')

plt.show()

In [None]:
# Load the ICD-9 dictionary table and drop rows that are empty in every column.
icd_d_sql = f'SELECT * FROM `{project_name}.mimic3_v1_4.D_ICD_DIAGNOSES`'
icd_df_job = client.query(icd_d_sql)
icd_df_rows = icd_df_job.result()
icd_df = pd.DataFrame([dict(row) for row in icd_df_rows])
icd_df = icd_df.dropna(how='all')

# Attach the dictionary columns to each diagnosis row; the left join keeps
# diagnosis rows whose code has no dictionary entry.
ground_truth_df = pd.merge(diagnoses_df, icd_df, on=['ICD9_CODE'], how='left')
print(ground_truth_df.head(5))

# Count by SHORT_TITLE so the chart shows readable names instead of raw codes.
icd9_counts = ground_truth_df['SHORT_TITLE'].value_counts()

plt.figure(figsize=(10, 6))
icd9_counts.head(20).plot(kind='bar')

plt.title('Top 20 Most Common ICD-9 Diagnoses')
# The bars are keyed by SHORT_TITLE, so label the axis accordingly.
plt.xlabel('Diagnosis Short Title')
plt.ylabel('Count')
plt.xticks(rotation=45, ha='right')

plt.show()

In [None]:

# Drop patient rows that are empty in every column.
patients_df = patients_df.dropna(how='all')

# Load the full ADMISSIONS table and apply the same all-empty row filter.
admissions_sql = f'SELECT * FROM `{project_name}.mimic3_v1_4.ADMISSIONS`'
admissions_query_job = client.query(admissions_sql)
admissions_rows = admissions_query_job.result()
admissions_df = pd.DataFrame(list(map(dict, admissions_rows))).dropna(how='all')

# True means every admission refers to a known patient, so filtering is unnecessary.
every_admission_has_patient = admissions_df['SUBJECT_ID'].isin(patients_df['SUBJECT_ID']).all()
print(every_admission_has_patient)

In [None]:
# One note per patient: rank each patient's notes (those with a HADM_ID) by
# CHARTDATE, newest first, and keep the row ranked first.
# NOTE(review): the ranking orders by CHARTDATE only, so when a patient has
# several notes on the latest date the query does not fix which one gets rn = 1.
note_events_sql = f"""
SELECT SUBJECT_ID, HADM_ID, DESCRIPTION, ISERROR, TEXT, CHARTDATE
FROM (
  SELECT *, ROW_NUMBER() OVER (PARTITION BY SUBJECT_ID ORDER BY CHARTDATE DESC) AS rn
  FROM `{project_name}.mimic3_v1_4.NOTEEVENTS`
  WHERE HADM_ID IS NOT NULL
)
WHERE rn = 1
"""

note_job = client.query(note_events_sql)
note_rows = note_job.result()
# Build the frame from one dict per row, then drop rows empty in every column.
note_df = pd.DataFrame(list(map(dict, note_rows))).dropna(how='all')
print(note_df.head(5))
print(len(note_df))

In [None]:
import csv

# Export (patient id, note text) pairs to a CSV file.
filename = 'new_patient_notes.csv'
header = ['patient_id', 'notes']
rows = note_df[['SUBJECT_ID', 'TEXT']].values.tolist()

# Pin the encoding: the notes are free text, and the platform default encoding
# can fail on, or silently change, non-ASCII characters. UTF-8 is also what
# pandas.read_csv assumes by default when the file is read back.
with open(filename, 'w', newline='', encoding='utf-8') as csvfile:
    csvwriter = csv.writer(csvfile)

    csvwriter.writerow(header)
    csvwriter.writerows(rows)

print(f"Created CSV file: {filename}")

In [None]:
# Report how many rows are held in `rows` (the header line is not part of it).
# NOTE(review): `rows` is reassigned by several cells; this count is only the
# CSV row count when the export cell was the last one to set it.
print("Number of entries in CSV file", len(rows))

In [None]:
# Primary diagnosis per admission: the DIAGNOSES_ICD row with SEQ_NUM = 1.
# The table path is back-quoted like every other query in this notebook, so the
# hyphenated project id is always treated as a single identifier.
diag_sql = f"""
SELECT SUBJECT_ID, HADM_ID, ICD9_CODE AS PRIMARY_DIAGNOSIS
FROM `{project_name}.mimic3_v1_4.DIAGNOSES_ICD`
WHERE SEQ_NUM = 1
"""
diag_job = client.query(diag_sql)
diag_rows = diag_job.result()
diag_df = pd.DataFrame([dict(row) for row in diag_rows])

# Keep only notes whose (patient, admission) pair has a primary diagnosis.
combined_df = pd.merge(note_df, diag_df, on=['SUBJECT_ID', 'HADM_ID'], how='inner')

In [None]:
# Number of distinct primary-diagnosis codes; missing values are not counted.
distinct_codes = combined_df['PRIMARY_DIAGNOSIS'].dropna().unique()
num_unique_diagnoses = len(distinct_codes)
print(f"Number of unique ICD-9 diagnosis: {num_unique_diagnoses}")


In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt


# Map each primary-diagnosis code to an integer class id and store it on the frame.
label_encoder = LabelEncoder()
label_encoder.fit(combined_df['PRIMARY_DIAGNOSIS'])
combined_df['diagnosis_label'] = label_encoder.transform(combined_df['PRIMARY_DIAGNOSIS'])
y_labels = combined_df['diagnosis_label']

In [None]:
x1 = pd.read_csv('compiled_important_notes.csv')

# Attach labels by patient id, not by row position. combined_df is an
# inner-merged subset of the notes, so its row order and length need not match
# the CSV; assigning the label Series directly would pair rows by index and
# could give a note another patient's diagnosis.
# NOTE(review): assumes the CSV's `patient_id` holds the same ids as SUBJECT_ID
# (this notebook also joins the file on that column) -- confirm.
patient_labels = (
    combined_df[['SUBJECT_ID', 'diagnosis_label']]
    .drop_duplicates(subset='SUBJECT_ID')
    .rename(columns={'SUBJECT_ID': 'patient_id'})
)
x1 = (
    x1.drop(columns=['diagnosis_label'], errors='ignore')
    .merge(patient_labels, on='patient_id', how='inner')
    .dropna(subset=['important_notes', 'diagnosis_label'])
)

# Unigram + bigram TF-IDF features, capped at the 5000 highest-ranked terms.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

X = vectorizer.fit_transform(x1['important_notes'])
y = x1['diagnosis_label']


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Logistic Regression baseline on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out split; `lg_acc` is reused by the comparison charts.
lg_acc = model.score(X_test, y_test)
print(f"Accuracy: {lg_acc:.4f}")

In [None]:
# Random Forest
# Hyper-parameters are gathered in one mapping so they are easy to scan and tweak.
rf_params = {
    'class_weight': 'balanced',
    'n_estimators': 200,
    'max_depth': 30,
    'random_state': 42,
    'n_jobs': -1,
}
model = RandomForestClassifier(**rf_params)
model.fit(X_train, y_train)
# NOTE(review): these predictions are made on the training rows, not on the
# held-out split.
y_pred = model.predict(X_train)


In [None]:
# Score the forest on the held-out split so the number is comparable with the
# logistic-regression accuracy, which is also measured on X_test / y_test.
# Accuracy on the rows the forest was fitted on would overstate its performance.
rf_accuracy = accuracy_score(y_test, model.predict(X_test))
print("Accuracy:", rf_accuracy)


In [None]:
# Side-by-side accuracy bars for the two TF-IDF models.
models = ['Logistic Regression', 'Random Forest']
accuracies = [lg_acc, rf_accuracy]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(models, accuracies)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.0
    label_y = bar.get_height() + 0.01
    ax.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()


In [None]:
# Second handle on the merged notes/diagnosis frame (same object, not a copy),
# followed by a preview of its first rows.
new_df = combined_df
print(new_df.head())

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModel
from torch.optim import AdamW
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import pandas as pd
import numpy as np
from collections import Counter

In [None]:
# Work on rows that have both a note text and a primary diagnosis.
df = new_df.copy()
df = df.dropna(subset=['TEXT', 'PRIMARY_DIAGNOSIS'])

# Restrict the task to the `top_n` most frequent primary diagnoses.
top_n = 10
top_labels = df['PRIMARY_DIAGNOSIS'].value_counts().nlargest(top_n).index
# .copy() makes the filtered frame independent, so adding the 'label' column
# below does not raise SettingWithCopyWarning.
df = df[df['PRIMARY_DIAGNOSIS'].isin(top_labels)].copy()

# Integer class ids for the remaining diagnosis codes.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['PRIMARY_DIAGNOSIS'])


tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

class MIMICDataset(Dataset):
    """Dataset that tokenizes all texts once and serves per-item tensors.

    Args:
        texts: the texts handed to ``tokenizer`` in a single call.
        labels: indexable collection of labels, one per text.
        tokenizer: callable invoked as
            ``tokenizer(texts, truncation=True, padding=True, max_length=...)``;
            its result must expose ``.items()`` yielding indexable values.
        max_length: forwarded to the tokenizer call.
    """

    def __init__(self, texts, labels, tokenizer, max_length=256):
        self.labels = labels
        # Tokenize everything once so __getitem__ only has to index.
        self.encodings = tokenizer(
            texts,
            truncation=True,
            padding=True,
            max_length=max_length,
        )

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the tokenizer fields plus ``'labels'`` for item ``idx`` as tensors."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = torch.tensor(val[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Stratified 80/20 split of note texts and encoded labels.
note_texts = df['TEXT'].tolist()
note_labels = df['label'].tolist()
X_train, X_val, y_train, y_val = train_test_split(
    note_texts,
    note_labels,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Tokenize each split once inside the dataset wrapper.
train_dataset = MIMICDataset(X_train, y_train, tokenizer)
val_dataset = MIMICDataset(X_val, y_val, tokenizer)

# Only the training batches are shuffled.
BERT_BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BERT_BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BERT_BATCH_SIZE)


# The notebook's import cells bring in AutoTokenizer and AutoModel only, so the
# classification wrapper is imported here to avoid a NameError.
from transformers import AutoModelForSequenceClassification

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# ClinicalBERT with a classification head sized to the number of kept labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(top_labels)
)
model.to(device)


# Fine-tune for a fixed number of epochs with AdamW.
optimizer = AdamW(model.parameters(), lr=2e-5)
epochs = 3

for epoch in range(epochs):
    model.train()
    running_loss = 0
    progress = tqdm(enumerate(train_loader), total=len(train_loader), leave=False)

    for batch_idx, batch in progress:
        # Move every tensor of the batch to the model's device.
        on_device = {name: tensor.to(device) for name, tensor in batch.items()}
        # The batch includes 'labels', so the model output carries the loss.
        loss = model(**on_device).loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        batch_loss = loss.item()
        running_loss += batch_loss

        # Refresh the progress-bar text every tenth step only.
        if batch_idx % 10 == 0:
            progress.set_description(f"Epoch {epoch+1}")
            progress.set_postfix(loss=f"{batch_loss:.4f}")

    avg_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch + 1} average loss: {avg_loss:.4f}")



# Collect predictions and true labels over the validation loader.
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for val_batch in val_loader:
        true_ids = val_batch['labels'].numpy()
        # Feed everything except the labels to the model.
        features = {
            name: tensor.to(device)
            for name, tensor in val_batch.items()
            if name != 'labels'
        }
        logits = model(**features).logits
        predicted_ids = logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(predicted_ids)
        all_labels.extend(true_ids)

val_accuracy = accuracy_score(all_labels, all_preds)
val_f1 = f1_score(all_labels, all_preds, average='weighted')
val_report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)

print("Accuracy:", val_accuracy)
print("F1 Score:", val_f1)
print("Classification Report:\n", val_report)

In [None]:
# Accuracy of the collected predictions against the collected labels; stored
# under its own name so the comparison charts can reuse it.
bert_acc = accuracy_score(all_labels, all_preds)

In [None]:
# Accuracy bars for the two TF-IDF models and the fine-tuned BERT model.
models = ['Logistic Regression', 'Random Forest', 'BERT Model']
accuracies = [lg_acc, rf_accuracy, bert_acc]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(models, accuracies)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.0
    label_y = bar.get_height() + 0.01
    ax.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

#basic data cleaning of notes

def clean_notes(text):
    """Normalise one note into a space-separated string of kept tokens.

    The input is stringified and lower-cased, then a fixed sequence of regex
    substitutions removes ``[** ... **]`` spans, date-like and time-like
    patterns, remaining digits, a list of section headers followed by a colon,
    and finally every character that is neither a word character nor whitespace.
    Tokens are kept when they are not in ``ENGLISH_STOP_WORDS``, are longer
    than two characters, and are purely alphabetic.
    """
    # (pattern, replacement) pairs, applied in this exact order.
    substitutions = (
        (r'\[\*\*.*?\*\*\]', ''),
        (r'\d{1,4}[-/]\d{1,2}[-/]\d{1,4}', ' '),
        (r'\d{1,2}:\d{2}(?: ?[ap]m)?', ' '),
        (r'\d+', ' '),
        (r'\b(?:chief complaint|hpi|assessment and plan|ros|history|allergies|labs|radiology|ecg|plan|exam|vitals|disposition|medications|home meds|past medical history|review of systems|comments|communication):', ' '),
        (r'[^\w\s]', ' '),
    )

    cleaned = str(text).lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    kept = []
    for token in cleaned.split():
        if token in ENGLISH_STOP_WORDS:
            continue
        if len(token) > 2 and token.isalpha():
            kept.append(token)

    return ' '.join(kept)

# NOTE(review): `clean_x` is not assigned anywhere else in this notebook, so on
# a fresh kernel this cell raised NameError. It is loaded here from
# new_patient_notes.csv, the file this notebook writes with a 'notes' column --
# confirm this is the intended source.
clean_x = pd.read_csv('new_patient_notes.csv')
clean_x['cleaned_notes'] = clean_x['notes'].apply(clean_notes)


In [None]:
# Preview the first rows of clean_x.
print(clean_x.head())

In [None]:
# Logistic regression on precomputed embeddings, limited to the most common labels.
# NOTE(review): this cell reads `embeddings` and `y` from notebook state. In this
# file `embeddings` is produced by a cell further down and `y` by the TF-IDF
# cell; confirm both refer to the same rows in the same order before trusting
# the result.
top_n = 10
label_counts = Counter(y)
top_labels = [code for code, _count in label_counts.most_common(top_n)]
mask = [code in top_labels for code in y]

X_filtered = embeddings[mask]
y_filtered = np.array(y)[mask]

# Re-encode the kept labels as consecutive integers.
label_encoder = LabelEncoder()
y_filtered_encoded = label_encoder.fit_transform(y_filtered)

X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y_filtered_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_filtered_encoded,
)

clf_params = {
    'max_iter': 1000,
    'class_weight': 'balanced',
    'solver': 'lbfgs',
    'n_jobs': -1,
}
clf = LogisticRegression(**clf_params)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

new_lg_acc = accuracy_score(y_test, y_pred)
print("Accuracy:", new_lg_acc)
report_text = classification_report(
    y_test, y_pred, target_names=label_encoder.classes_.astype(str))
print("Classification Report:\n", report_text)

In [None]:
# Reload the ClinicalBERT tokenizer and load the checkpoint through AutoModel;
# both replace the notebook-level `tokenizer` and `model`.
clinical_bert = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(clinical_bert)
model = AutoModel.from_pretrained(clinical_bert)

important_notes = pd.read_csv("compiled_important_notes.csv")

# NOTE(review): the join key is the patient only, while diag_df is selected per
# SUBJECT_ID and HADM_ID, so a patient with several admissions may match the
# same note more than once -- confirm this is intended.
combined_df = diag_df.merge(
    important_notes,
    left_on='SUBJECT_ID',
    right_on='patient_id',
    how='inner',
)

from tqdm import tqdm

def get_embeddings_batched(texts, batch_size=16):
    """Embed ``texts`` with the notebook-level ``tokenizer`` and ``model``.

    Texts are tokenized and run through the model ``batch_size`` at a time
    (truncated to 512 tokens, padded within each batch). The hidden state at
    sequence position 0 of each text is collected.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts per forward pass.

    Returns:
        A 2-D numpy array with one row per input text. For empty input an
        array with zero rows is returned.
    """
    if len(texts) == 0:
        # np.vstack raises on an empty list, so return a zero-row array instead.
        # NOTE(review): assumes the model config exposes `hidden_size` -- confirm
        # for the checkpoint in use.
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    # Run on whatever device the model lives on, so a GPU model works unchanged.
    target_device = next(model.parameters()).device
    all_embeddings = []

    for i in tqdm(range(0, len(texts), batch_size)):
        batch_texts = texts[i:i+batch_size]
        tokens = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
        tokens = {key: value.to(target_device) for key, value in tokens.items()}
        with torch.no_grad():
            outputs = model(**tokens)
        # .cpu() is required before .numpy() when the model is on a GPU.
        batch_embeddings = outputs.last_hidden_state[:, 0, :].cpu().numpy()
        all_embeddings.append(batch_embeddings)

    return np.vstack(all_embeddings)

# Missing notes become empty strings so every row still yields an embedding.
notes_as_text = combined_df['important_notes'].fillna('').astype(str)
texts = notes_as_text.tolist()
embeddings = get_embeddings_batched(texts, batch_size=16)


In [None]:
# Accuracy bars for the two TF-IDF baseline models.
models = ['Logistic Regression', 'Random Forest']
accuracies = [lg_acc, rf_accuracy]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(models, accuracies)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.0
    label_y = bar.get_height() + 0.01
    ax.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('TF-IDF Basic Model Accuracy Comparison')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()

In [None]:
# Accuracy bars for the embedding-based logistic regression and the BERT model.
models = ['Logistic Regression', 'BERT Model']
accuracies = [new_lg_acc, bert_acc]

fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(models, accuracies)

# Write each accuracy just above its bar, centred horizontally.
for bar, acc in zip(bars, accuracies):
    label_x = bar.get_x() + bar.get_width()/2.0
    label_y = bar.get_height() + 0.01
    ax.text(label_x, label_y, f'{acc:.3f}', ha='center', va='bottom')

ax.set_ylim(0, 1)
ax.set_ylabel('Accuracy')
ax.set_title('Model Accuracy Comparison (Top 10)')
ax.grid(axis='y', linestyle='--', alpha=0.7)
fig.tight_layout()
plt.show()