# Supervised Learning

In [195]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os
import tarfile
import glob
from datetime import datetime
import math
import csv
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import accuracy_score, classification_report

### Q2.1 Classic Machine Learning Methods (5 Pts)

In [270]:
# Retrieving the X matrix (can use)
df = pd.read_parquet('imputed-data/imputed-set-a.parquet')
X_normal = df.groupby("RecordID").max(numeric_only=True).reset_index()
X_normal = X_normal.drop(columns=["RecordID"])
X_normal = X_normal[sorted(X_normal.columns)]

df = pd.read_parquet('scaled-data/scaled-set-a.parquet')
X_scaled = df.groupby("RecordID").max(numeric_only=True).reset_index()
X_scaled = X_scaled.drop(columns=["RecordID"])
X_scaled = X_scaled[sorted(X_scaled.columns)]

In [226]:
# Retrieving the label vector
y_df = pd.read_parquet('processed-data/processed-outcomes-a.parquet')
y = y_df["In-hospital_death"].to_numpy().flatten()
y

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [227]:
# Models
# Logistic Regression (best scores using max_iter=4300)
model1 = LogisticRegression(max_iter=4300)
model1.fit(X_normal,y)

# Random Forest
model2 = RandomForestClassifier()
model2.fit(X_normal,y)

# KNN
model3 = KNeighborsClassifier()
model3.fit(X_normal,y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [228]:
# Test set C performance

# Loading test set C
df = pd.read_parquet('imputed-data/imputed-set-c.parquet')
X_test = df.groupby("RecordID").mean(numeric_only=True).reset_index()
X_test = X_test.drop(columns=["RecordID"])
X_test= X_test[sorted(X_test.columns)]

y_df = pd.read_parquet('processed-data/processed-outcomes-c.parquet')
y_test = y_df["In-hospital_death"].to_numpy().flatten()

y_pred1 = model1.predict(X_test)
y_pred2 = model2.predict(X_test)
y_pred3 = model3.predict(X_test)

# Calculation of AuROC and AuPRC for Logistic Regression
print("Logistic Regression results")
auroc = roc_auc_score(y_test, y_pred1)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred1)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred1)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for Random Forests
print("Random Forests results")
auroc = roc_auc_score(y_test, y_pred2)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred2)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred2)
print(f"Accuracy: {accuracy}", end="\n\n")

# Calculation of AuROC and AuPRC for KNN
print("KNN results")
auroc = roc_auc_score(y_test, y_pred3)
print(f"AUROC: {auroc}")
auprc = average_precision_score(y_test, y_pred3)
print(f"AUPRC: {auprc}")
accuracy = accuracy_score(y_test, y_pred3)
print(f"Accuracy: {accuracy}", end="\n\n")


Logistic Regression results
AUROC: 0.7200060066824341
AUPRC: 0.2747037297075142
Accuracy: 0.765

Random Forests results
AUROC: 0.5668831575127329
AUPRC: 0.18764396453959561
Accuracy: 0.8325

KNN results
AUROC: 0.5913441203338714
AUPRC: 0.19653379203379204
Accuracy: 0.80775



In [229]:
from tsfresh import extract_features, extract_relevant_features, select_features
from tsfresh.utilities.dataframe_functions import impute
from tsfresh.feature_extraction import ComprehensiveFCParameters

In [None]:
extraction_settings = ComprehensiveFCParameters()
df = pd.read_parquet('imputed-data/imputed-set-a.parquet')
numeric_cols = df.select_dtypes(include=['number']).columns
df[numeric_cols] = df[numeric_cols].apply(lambda col: col.fillna(col.median()))
#X = extract_features(df, column_id='RecordID', default_fc_parameters=extraction_settings)

Unnamed: 0,Time,Age,BUN,Creatinine,GCS,Gender,Glucose,HCO3,HCT,HR,...,FiO2,Lactate,MAP,MechVent,PaCO2,PaO2,SaO2,SysABP,TroponinI,pH
0,00:00,35.0,17.5,1.05,15.0,0.0,136.5,22.0,37.45,92.5,...,1.0,2.500,83.0,1.0,39.0,248.5,97.0,121.000000,3.25,7.34
1,01:00,35.0,20.0,1.00,14.0,0.0,142.0,23.0,32.50,88.5,...,1.0,2.350,84.0,1.0,41.0,261.5,97.0,121.000000,3.10,7.37
2,02:00,35.0,19.0,0.90,15.0,0.0,140.0,23.0,31.60,112.0,...,0.7,2.325,81.0,1.0,41.0,236.0,98.0,117.500000,4.20,7.37
3,03:00,35.0,68.0,2.30,15.0,0.0,603.0,11.0,25.50,113.0,...,0.6,2.300,80.0,1.0,41.0,193.5,98.0,116.666667,1.10,7.36
4,04:00,35.0,68.0,2.30,15.0,0.0,603.0,11.0,25.50,112.0,...,0.5,2.300,78.5,1.0,41.0,183.0,98.0,115.875000,1.40,7.36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195995,44:00,56.0,9.0,0.40,15.0,1.0,107.0,29.0,34.40,95.0,...,0.4,1.100,103.0,1.0,40.0,63.0,91.0,150.000000,3.25,7.40
195996,45:00,56.0,9.0,0.40,15.0,1.0,107.0,29.0,34.40,94.0,...,0.4,1.100,105.0,1.0,40.0,63.0,91.0,151.000000,5.00,7.40
195997,46:00,56.0,9.0,0.40,15.0,1.0,107.0,29.0,34.40,100.0,...,0.4,1.100,116.0,1.0,40.0,63.0,91.0,167.000000,3.30,7.40
195998,47:00,56.0,9.0,0.40,15.0,1.0,107.0,29.0,34.40,99.0,...,0.4,1.100,88.0,1.0,40.0,63.0,91.0,131.000000,1.50,7.40


### Q2.2 Recurrent Neural Networks (4 Pts)

### Q2.3a Transformers (3 Pts)

### Q2.3b Tokenizing Time-Series Data and Transformers (4 Pts)