## Imports

In [49]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import random


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

import hw3_main
from helper import *

random.seed(69)

## Load Data

In [46]:
X_challenge, y_challenge, X_heldout, feature_names = local_get_challenge_data()




Loading files from disk:   0%|                                                                                                                               | 0/12000 [00:00<?, ?it/s][A[A[A


Loading files from disk:   1%|▉                                                                                                                    | 98/12000 [00:00<00:13, 902.67it/s][A[A[A


Loading files from disk:   2%|█▉                                                                                                                  | 194/12000 [00:00<00:12, 927.51it/s][A[A[A


Loading files from disk:   2%|██▊                                                                                                                 | 287/12000 [00:00<00:12, 919.35it/s][A[A[A


Loading files from disk:   3%|███▋                                                                                                                | 379/12000 [00:00<00:13, 842.73it/s][A[A[A


Loading files from dis

Loading files from disk:  34%|███████████████████████████████████████▍                                                                           | 4120/12000 [00:04<00:08, 973.76it/s][A[A[A


Loading files from disk:  35%|████████████████████████████████████████▍                                                                          | 4218/12000 [00:04<00:07, 974.80it/s][A[A[A


Loading files from disk:  36%|█████████████████████████████████████████▎                                                                         | 4316/12000 [00:04<00:07, 975.90it/s][A[A[A


Loading files from disk:  37%|██████████████████████████████████████████▎                                                                        | 4414/12000 [00:04<00:07, 973.59it/s][A[A[A


Loading files from disk:  38%|███████████████████████████████████████████▎                                                                       | 4520/12000 [00:04<00:07, 957.38it/s][A[A[A


Loading files from disk: 

Loading files from disk:  70%|████████████████████████████████████████████████████████████████████████████████▉                                  | 8445/12000 [00:09<00:03, 985.80it/s][A[A[A


Loading files from disk:  71%|█████████████████████████████████████████████████████████████████████████████████▉                                 | 8544/12000 [00:10<00:03, 942.62it/s][A[A[A


Loading files from disk:  72%|██████████████████████████████████████████████████████████████████████████████████▊                                | 8639/12000 [00:10<00:03, 937.82it/s][A[A[A


Loading files from disk:  73%|███████████████████████████████████████████████████████████████████████████████████▋                               | 8738/12000 [00:10<00:03, 951.92it/s][A[A[A


Loading files from disk:  74%|████████████████████████████████████████████████████████████████████████████████████▋                              | 8834/12000 [00:10<00:03, 953.96it/s][A[A[A


Loading files from disk: 

## Exploration

In [48]:
len(feature_names)

75

Seeing if we should one-hot encode any categorical variables by looking at the number of unique values in each column:

In [36]:
df = make_df(X_challenge, feature_names)
print(df.columns[df.nunique() <= 5])

Index(['Gender', 'ICUType'], dtype='object')


As we can see, these 2 variables have less than 5 unique observations. We can make them into one-hot encoded variables.

## Setup

In [37]:
def make_df(X, names):
    assert X.shape[1] == len(names)

    tmp = dict()
    for i, k in enumerate(names):
        tmp[k] = X[:, i]
    return pd.DataFrame(tmp)

In [38]:
def local_generate_feature_vector_challenge(df):
    static_variables = config['static']
    timeseries_variables = config['timeseries']
    feature_dict = {}
    
    for var in static_variables:
        val = df[df["Variable"] == var]["Value"].values[0]
        feature_dict[var] = val

        if val < 0:
            feature_dict[var] = np.nan
        else:
            feature_dict[var] = val
    
        # Time-varying variables
    for var in timeseries_variables:
        if (df['Variable'] == var).sum() == 0:
            fval = np.nan
            sval = np.nan
            norm_sd = np.nan
        else:
            fval = (df[(df["Variable"] == var) & (df["Time"].str[:2].astype(int) < 24)]["Value"]).mean()
            sval = (df[(df["Variable"] == var) & (df["Time"].str[:2].astype(int) >= 24)]["Value"]).mean()
            sd = (df[(df["Variable"] == var)]["Value"]).std()
            mean = (df[(df["Variable"] == var)]["Value"]).mean()
            norm_sd = sd/mean
            
        feature_dict[f"f24_mean_{var}"] = fval
        feature_dict[f"s24_mean_{var}"] = sval
        feature_dict[f"norm_sd_{var}"] = norm_sd
    
    return feature_dict

# local_generate_feature_vector_challenge(df)

In [39]:
def local_get_challenge_data():
    """
    This function is similar to helper.get_train_test_split, except that:
    - It reads in all 10,000 training examples
    - It does not return labels for the 2,000 examples in the heldout test set
    You should replace your preprocessing functions (generate_feature_vector,
    impute_missing_values, normalize_feature_matrix) with updated versions for the challenge
    """


    df_labels = pd.read_csv('data/labels.csv')
    df_labels = df_labels
    IDs = df_labels['RecordID']
    raw_data = {}
    for i in tqdm(IDs, desc='Loading files from disk'):
        raw_data[i] = pd.read_csv('data/files/{}.csv'.format(i))

    features = Parallel(n_jobs=16)(delayed(local_generate_feature_vector_challenge)(df) for _, df in tqdm(raw_data.items(), desc='Generating feature vectors'))
    df_features = pd.DataFrame(features)
    df_features = pd.get_dummies(df_features, columns=["Gender", "ICUType"])
    
    feature_names = df_features.columns.tolist()
    
    X, y = df_features.values, df_labels['30-day_mortality'].values
#     X = hw3_challenge.impute_missing_values_challenge(X)
#     X = hw3_challenge.normalize_feature_matrix_challenge(X)
    return X[:10000], y[:10000], X[10000:], feature_names

In [40]:
def impute_missing_values_challenge(X):
    return hw3_main.impute_missing_values(X)

In [41]:
def normalize_feature_matrix_challenge(X):
    return hw3_main.normalize_feature_matrix(X)

In [42]:
def get_train_val_split(X: np.ndarray[float], y: np.ndarray[int]):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=69)
    return X_train, X_val, y_train, y_val

In [43]:
def preprocess(X: np.ndarray[float], feature_names):
    df = make_df(X, feature_names)
    
    

    df_new = pd.get_dummies(df, columns=['Gender', 'ICUType'])
    df_new.rename(columns={
        'Gender_0.0': 'gender_0',
        'Gender_0.5611444778111445': 'gender_1',
        'Gender_1.0': 'gender_2',
        'ICUType_0.0': 'CCU',
        'ICUType_0.3333333333333333': 'CSRU',
        'ICUType_0.6666666666666666': 'MedICU',
        'ICUType_1.0': 'SurgICU'
    }, inplace=True)
    print(df_new.columns)

    X_new = X
    feature_names_new = feature_names

    

    return X_new, feature_names_new

In [44]:
def run_challenge(X_challenge, y_challenge, X_heldout, feature_names):
    X_cha, feat_names = preprocess(X_challenge, feature_names)
    # clf = LogisticRegression()
    # clf.fit(X_challenge, y_challenge)

    # X_train, X_val, y_train, y_val = get_train_val_split(X_challenge, y_challenge)
    # assert X_train.shape[0] == y_train.size and X_val.shape[0] == y_val.size


    # C_range = np.logspace(-4, 4, 9)

    # y_score = clf.predict_proba(X_heldout)[:, 1]
    # y_label = clf.predict(X_heldout)
    # make_challenge_submission(y_label, y_score)
run_challenge(X_challenge, y_challenge, X_heldout, feature_names)

Index(['Age', 'Height', 'Weight', 'mean_ALP', 'mean_ALT', 'mean_AST',
       'mean_Albumin', 'mean_BUN', 'mean_Bilirubin', 'mean_Cholesterol',
       'mean_Creatinine', 'mean_DiasABP', 'mean_FiO2', 'mean_GCS',
       'mean_Glucose', 'mean_HCO3', 'mean_HCT', 'mean_HR', 'mean_K',
       'mean_Lactate', 'mean_MAP', 'mean_Mg', 'mean_NIDiasABP', 'mean_NIMAP',
       'mean_NISysABP', 'mean_Na', 'mean_PaCO2', 'mean_PaO2', 'mean_Platelets',
       'mean_RespRate', 'mean_SaO2', 'mean_SysABP', 'mean_Temp',
       'mean_TroponinI', 'mean_TroponinT', 'mean_Urine', 'mean_WBC', 'mean_pH',
       'gender_0', 'gender_1', 'gender_2', 'CCU', 'CSRU', 'MedICU', 'SurgICU'],
      dtype='object')




Generating feature vectors:  38%|███████████████████████████████████████████▏                                                                     | 4592/12000 [01:27<01:49, 67.87it/s][A[A

## Running

In [8]:
run_challenge(X_challenge, y_challenge, X_heldout, feature_names)
test_challenge_output()

Loading files from disk: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:05<00:00, 2093.39it/s]
Generating feature vectors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:07<00:00, 1505.91it/s]


Part 3: Challenge
Saving challenge output...
challenge.csv saved
SUCCESS: csv file is valid.
