## Imports

In [50]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import random


from sklearn.linear_model import LogisticRegression

import hw3_main
from helper import *

random.seed(69)

## Load Data

In [10]:
X_challenge, y_challenge, X_heldout, feature_names = get_challenge_data()

Loading files from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:05<00:00, 2008.45it/s]
Generating feature vectors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:06<00:00, 1961.40it/s]


## Exploration

Seeing if we should one-hot encode any categorical variables by looking at the number of unique values in each column:

In [51]:
df = make_df(X_challenge, feature_names)
print(df.columns[df.nunique() <= 5])

Index(['Gender', 'ICUType'], dtype='object')


As we can see, these 2 variables have less than 5 unique observations. We can make them into one-hot encoded variables.

## Setup

In [11]:
def generate_feature_vector_challenge(df):
    return hw3_main.generate_feature_vector(df)

In [12]:
def impute_missing_values_challenge(X):
    return hw3_main.impute_missing_values(X)

In [13]:
def normalize_feature_matrix_challenge(X):
    return hw3_main.normalize_feature_matrix(X)

In [14]:
def get_train_val_split(X: np.ndarray[float], y: np.ndarray[int]):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=69)
    return X_train, X_val, y_train, y_val

In [30]:
def make_df(X, names):
    assert X.shape[1] == len(names)

    tmp = dict()
    for i, k in enumerate(names):
        tmp[k] = X[:, i]
    return pd.DataFrame(tmp)

In [56]:
def preprocess(X: np.ndarray[float], feature_names):
    df = make_df(X, feature_names)

    df_new = pd.get_dummies(df, columns=['Gender', 'ICUType'])
    df_new.rename(columns={
        'Gender_0.0': 'gender_0',
        'Gender_0.5611444778111445': 'gender_1',
        'Gender_1.0': 'gender_2',
        'ICUType_0.0': 'CCU',
        'ICUType_0.3333333333333333': 'CSRU',
        'ICUType_0.6666666666666666': 'MedICU',
        'ICUType_1.0': 'SurgICU'
    }, inplace=True)
    print(df_new.columns)

    X_new = X
    feature_names_new = feature_names

    

    return X_new, feature_names_new

In [57]:
def run_challenge(X_challenge, y_challenge, X_heldout, feature_names):
    X_cha, feat_names = preprocess(X_challenge, feature_names)
    # clf = LogisticRegression()
    # clf.fit(X_challenge, y_challenge)

    # X_train, X_val, y_train, y_val = get_train_val_split(X_challenge, y_challenge)
    # assert X_train.shape[0] == y_train.size and X_val.shape[0] == y_val.size


    # C_range = np.logspace(-4, 4, 9)

    # y_score = clf.predict_proba(X_heldout)[:, 1]
    # y_label = clf.predict(X_heldout)
    # make_challenge_submission(y_label, y_score)
run_challenge(X_challenge, y_challenge, X_heldout, feature_names)

Index(['Age', 'Height', 'Weight', 'mean_ALP', 'mean_ALT', 'mean_AST',
       'mean_Albumin', 'mean_BUN', 'mean_Bilirubin', 'mean_Cholesterol',
       'mean_Creatinine', 'mean_DiasABP', 'mean_FiO2', 'mean_GCS',
       'mean_Glucose', 'mean_HCO3', 'mean_HCT', 'mean_HR', 'mean_K',
       'mean_Lactate', 'mean_MAP', 'mean_Mg', 'mean_NIDiasABP', 'mean_NIMAP',
       'mean_NISysABP', 'mean_Na', 'mean_PaCO2', 'mean_PaO2', 'mean_Platelets',
       'mean_RespRate', 'mean_SaO2', 'mean_SysABP', 'mean_Temp',
       'mean_TroponinI', 'mean_TroponinT', 'mean_Urine', 'mean_WBC', 'mean_pH',
       'gender_0', 'gender_1', 'gender_2', 'CCU', 'CSRU', 'MedICU', 'SurgICU'],
      dtype='object')


## Running

In [8]:
run_challenge(X_challenge, y_challenge, X_heldout, feature_names)
test_challenge_output()

Loading files from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:05<00:00, 2093.39it/s]
Generating feature vectors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:07<00:00, 1505.91it/s]


Part 3: Challenge
Saving challenge output...
challenge.csv saved
SUCCESS: csv file is valid.
