In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np
# Read the pre-wrangled table from the CSV file in the working directory.
WRANGLED_CSV = 'admit_pat_serv_lab_proc_diag.csv'
wrangled_data = pd.read_csv(WRANGLED_CSV)

In [None]:
# Class balance: number of rows with a non-null HADM_ID for each OUTPUT_LABEL value.
wrangled_data['HADM_ID'].groupby(wrangled_data['OUTPUT_LABEL']).count()

In [None]:
# Total number of missing cells in the table: per-column NaN counts, then summed.
wrangled_data.isnull().sum(axis=0).sum()

In [None]:
# Display the column index of the loaded table; the next cell scans these
# names for stray 'Unnamed' columns.
wrangled_data.columns

In [None]:
# Collect every column whose name begins with 'Unnamed' so it can be dropped.
# (The original note expects two such columns in this file.)
col_names = list(wrangled_data.columns)

def _is_unnamed(name):
    """Return True when a column name starts with the 'Unnamed' prefix."""
    return name.startswith('Unnamed')

unwanted_cols = list(filter(_is_unnamed, col_names))
unwanted_cols

In [None]:
# Remove the stray 'Unnamed' columns found in the previous cell.
# The frame is rebound (not mutated in place) and errors='ignore' is passed so
# that re-running this cell does not raise KeyError once the columns are gone.
wrangled_data = wrangled_data.drop(columns=unwanted_cols, errors='ignore')

# Target vector: the OUTPUT_LABEL column as a NumPy array.
y = wrangled_data.OUTPUT_LABEL.values

# Feature matrix: every remaining column except the HADM_ID identifier and the label.
X = wrangled_data.drop(['HADM_ID','OUTPUT_LABEL'], axis=1).values

# Sanity check: both arrays should have the same number of rows.
print(y.shape)
print(X.shape)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Fix the seed (global NumPy RNG and the split below) so runs are repeatable.
SEED = 222
np.random.seed(SEED)

# Shuffle the rows and hold out 20% of them as the test set.
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=SEED,
    shuffle=True,
)

# Some algorithms (e.g. SVM, neural nets) need scaled inputs, so apply min-max
# scaling. The scaler learns its ranges from the training rows only and is
# then applied unchanged to both partitions.
scaler = MinMaxScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xtest_scaled = scaler.transform(Xtest)

In [None]:
import warnings
# Silences every warning (including solver convergence warnings) from here on.
warnings.filterwarnings("ignore")
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2
# (ConfusionMatrixDisplay replaces it); this import fails on newer versions.
# Kept because later cells may still rely on it.
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report

# Time the split + fit + evaluation below.
start_time = time.monotonic()

# NOTE(review): this re-split (30% test, seed 42) overwrites the Xtrain/Xtest/
# ytrain/ytest produced by the earlier 20% split, so Xtrain_scaled/Xtest_scaled
# no longer line up with ytrain/ytest after this cell runs - confirm intended.
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# Solvers accepted by LogisticRegression; only `solver` below is actually fitted.
solvers = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
# Define the solver once so the estimator and the printed label cannot drift
# apart (the print below previously referenced an undefined name -> NameError).
solver = 'liblinear'

# class_weight='balanced' reweights classes inversely to their frequency.
clf = LogisticRegression(solver=solver, random_state=SEED, class_weight='balanced', max_iter=200)
clf.fit(Xtrain,ytrain)
ypred = clf.predict(Xtest)

# Report which solver was used, then the evaluation on the held-out rows.
print("Solver: ",solver)
print(confusion_matrix(ytest, ypred))
print(classification_report(ytest, ypred))

# Elapsed seconds for this run, then reset the timer for whatever runs next.
print(time.monotonic()-start_time)
start_time = time.monotonic()