In [1]:
import pandas as pd
import numpy as np

In [9]:
# Set a seed for reproducibility
SEED = 222
np.random.seed(SEED)

# split the data into train and test sets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import time
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

In [14]:
def logreg_fit(csv_file):
    df = pd.read_csv('../data/processed/{}'.format(csv_file))
    df.drop('Unnamed: 0', axis=1, inplace=True)
    # check if there are empty values
    assert df.isna().sum().sum()==0, 'There are missing values'
    y = df.OUTPUT_LABEL.values
    X = df.drop(['OUTPUT_LABEL'], axis=1).values
    Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=SEED)
    scaler = MinMaxScaler()
    Xtrain_scaled = scaler.fit_transform(Xtrain)
    Xtest_scaled = scaler.transform(Xtest)
    start_time = time.monotonic()
    clf = LogisticRegression(C=100, solver='liblinear', random_state=SEED, class_weight='balanced', max_iter=200)
    clf.fit(Xtrain_scaled,ytrain)
    ypred = clf.predict(Xtest_scaled)
    roc_auc = roc_auc_score(ytest, ypred)
    print("File name:{}".format(csv_file))
    print("Confusion matrix")
    print(confusion_matrix(ytest, ypred))
    print("Classification report")
    print(classification_report(ytest, ypred))
    print("AUROC {}".format(roc_auc))
    print("Time elapsed {}s".format(time.monotonic()-start_time))
    start_time = time.monotonic()

In [15]:
logreg_fit('hosp_exp_incl_proc_excl_noselect.csv')

File name:hosp_exp_incl_proc_excl_noselect.csv
Confusion matrix
[[4817 2292]
 [ 154  380]]
Classification report
              precision    recall  f1-score   support

         0.0       0.97      0.68      0.80      7109
         1.0       0.14      0.71      0.24       534

    accuracy                           0.68      7643
   macro avg       0.56      0.69      0.52      7643
weighted avg       0.91      0.68      0.76      7643

AUROC 0.6946011359762879
Time elapsed 10.472655594000116s


In [17]:
logreg_fit('hosp_exp_excl_proc_excl_noselect.csv')

File name:hosp_exp_excl_proc_excl_noselect.csv
Confusion matrix
[[4251 1859]
 [ 202  336]]
Classification report
              precision    recall  f1-score   support

         0.0       0.95      0.70      0.80      6110
         1.0       0.15      0.62      0.25       538

    accuracy                           0.69      6648
   macro avg       0.55      0.66      0.53      6648
weighted avg       0.89      0.69      0.76      6648

AUROC 0.660139998418097
Time elapsed 11.02093409400004s
