## Imports

In [5]:
import random

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from tqdm import tqdm

import hw3_main
from helper import *

# Seed both RNGs this notebook can draw from. Python's `random` alone is not
# enough: scikit-learn estimators created without an explicit `random_state`
# use NumPy's global generator, so it has to be seeded for repeatable runs.
random.seed(42)
np.random.seed(42)

## Challenge pipeline: feature extraction, preprocessing, and model selection

In [1]:
def generate_feature_vector_challenge(df):
    """Build the challenge feature dictionary from one long-format record frame.

    ``df`` must have the columns ``"Variable"``, ``"Value"`` and ``"Time"``;
    the first two characters of each ``"Time"`` string are parsed as the
    integer hour. The variable names come from the module-level ``config``
    mapping (keys ``'static'`` and ``'timeseries'``), which is not defined in
    this cell.

    Features produced, in this order:

    * one entry per static variable, holding the first recorded value; a
      negative value, or a variable that does not occur in ``df`` at all,
      becomes ``np.nan``;
    * three entries per time-series variable:
      ``f24_mean_<var>`` (mean of values with hour < 24),
      ``s24_mean_<var>`` (mean of values with hour >= 24) and
      ``norm_sd_<var>`` (standard deviation divided by the mean, or
      ``np.nan`` when the mean is exactly 0). All three are ``np.nan`` when
      the variable never occurs.

    Returns:
        dict mapping feature name to value.
    """
    static_variables = config['static']
    timeseries_variables = config['timeseries']
    feature_dict = {}

    for var in static_variables:
        recorded = df.loc[df["Variable"] == var, "Value"].values
        # A missing static variable is treated like an invalid (negative) one
        # instead of raising IndexError on `recorded[0]`.
        if len(recorded) == 0 or recorded[0] < 0:
            feature_dict[var] = np.nan
        else:
            feature_dict[var] = recorded[0]

    # Time-varying variables
    for var in timeseries_variables:
        # Filter once per variable; the original re-filtered the whole frame
        # (and re-parsed every timestamp) for each statistic.
        rows = df[df["Variable"] == var]
        if rows.empty:
            fval = np.nan
            sval = np.nan
            norm_sd = np.nan
        else:
            values = rows["Value"]
            hours = rows["Time"].str[:2].astype(int)

            # An empty half yields NaN from Series.mean().
            fval = values[hours < 24].mean()
            sval = values[hours >= 24].mean()

            mean = values.mean()
            # Avoid dividing by zero; the ratio is undefined for a zero mean.
            norm_sd = np.nan if mean == 0 else values.std() / mean

        feature_dict[f"f24_mean_{var}"] = fval
        feature_dict[f"s24_mean_{var}"] = sval
        feature_dict[f"norm_sd_{var}"] = norm_sd

    return feature_dict

In [2]:
def normalize_feature_matrix_challenge(X):
    """Normalize the challenge feature matrix.

    Thin wrapper: ``X`` is forwarded unchanged to
    ``hw3_main.normalize_feature_matrix`` and that function's result is
    returned as-is.
    """
    normalized = hw3_main.normalize_feature_matrix(X)
    return normalized

In [3]:
def impute_missing_values_challenge(X):
    """Replace NaN entries of ``X`` with the median of their column, in place.

    Args:
        X: 2-D NumPy array. It is modified in place.

    Returns:
        The same array object ``X``, with no NaN left in it.

    A column that consists only of NaN has no median; it is filled with 0.0
    so that no NaN survives imputation (``np.nanmedian`` would return NaN and
    emit a RuntimeWarning for such a column).
    """
    for col in range(X.shape[1]):
        missing = np.isnan(X[:, col])
        if not missing.any():
            # Nothing to impute in this column.
            continue

        if missing.all():
            fill = 0.0
        else:
            fill = np.nanmedian(X[:, col])
        X[missing, col] = fill

    return X

In [6]:
def get_train_val_split(X: np.ndarray, y: np.ndarray, test_size: float = 0.20, random_state: int = 69) -> tuple:
    """Split features and labels into stratified training and validation sets.

    Args:
        X: feature matrix, one row per sample.
        y: labels aligned with the rows of ``X``; also used for stratification
            so both parts keep the class proportions of ``y``.
        test_size: fraction of samples placed in the validation part
            (default 0.20, the value previously hard-coded).
        random_state: seed for the shuffle (default 69, the value previously
            hard-coded), making the split repeatable.

    Returns:
        ``(X_train, X_val, y_train, y_val)``.
    """
    # Plain `np.ndarray` annotations: `np.ndarray[float]` would put `float` in
    # the shape slot of the generic and fails at definition time on NumPy
    # versions where `ndarray` is not subscriptable.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

In [10]:
def run_challenge(X_challenge, y_challenge, X_heldout, feature_names, metric="auroc", random_state=42):
    """Select an SGD classifier by cross-validation and write the submission.

    Steps:
      1. Split the challenge data into training and validation parts with
         ``get_train_val_split``.
      2. Grid-search ``alpha`` (10^-4 .. 10^4) and the penalty type, scoring
         each combination with ``hw3_main.cv_performance`` (10 folds) on the
         training part using ``metric``.
      3. Refit the best combination on the training part and print its
         performance on the validation part for ``metric`` and for a fixed
         list of metrics.
      4. Predict labels and scores for ``X_heldout`` and pass them to
         ``make_challenge_submission``.

    Args:
        X_challenge: feature matrix of the labelled challenge data.
        y_challenge: labels for ``X_challenge``.
        X_heldout: feature matrix to produce the submission for.
        feature_names: accepted for interface compatibility; not used here.
        metric: metric name used for model selection.
        random_state: seed handed to every ``SGDClassifier`` so that the
            selection and the final model are repeatable.

    Returns:
        None. Results are printed and handed to ``make_challenge_submission``.
    """
    print("================= Part 3 ===================")
    print("Part 3: Challenge")

    X_train, X_val, y_train, y_val = get_train_val_split(X_challenge, y_challenge)
    assert X_train.shape[0] == y_train.size and X_val.shape[0] == y_val.size

    def make_classifier(alpha, penalty):
        # Single place that defines the model family, so the search and the
        # final fit cannot drift apart.
        return SGDClassifier(loss="modified_huber", alpha=alpha, penalty=penalty,
                             random_state=random_state)

    alpha_range = np.logspace(-4, 4, 9)
    penalties = ["l1", "l2", "elasticnet"]
    scores = []

    for alpha in alpha_range:
        for penalty in penalties:
            clf = make_classifier(alpha, penalty)
            # NOTE(review): kept from the original; if cv_performance refits
            # the classifier on every fold this fit is redundant — confirm
            # against hw3_main before removing.
            clf.fit(X_train, y_train)

            score = hw3_main.cv_performance(clf, X_train, y_train, 10, metric)
            print("alpha: {:.6f} \t penalty: {:10s} \t score: {:.4f}".format(alpha, penalty, score))
            scores.append((alpha, penalty, score))

    # max() returns the first best-scoring entry, the same one the original
    # stable descending sort put at index 0.
    best_alpha, best_penalty, _ = max(scores, key=lambda entry: entry[2])
    clf = make_classifier(best_alpha, best_penalty)
    clf.fit(X_train, y_train)

    # This score is computed on the validation split, so label it as such.
    val_perf = hw3_main.performance(clf, X_val, y_val, metric)
    print(f"alpha = {best_alpha!s}\npenalty = {best_penalty!s}"
          f"\nValidation Performance on metric {metric}: {val_perf:.4f}")

    metric_list = ["accuracy", "precision", "sensitivity", "specificity", "f1_score", "auroc", "auprc"]

    # Distinct loop name: the original reused `metric` and clobbered the
    # selection-metric parameter.
    for report_metric in metric_list:
        val_perf = hw3_main.performance(clf, X_val, y_val, report_metric)
        print("Validation Performance on metric " + report_metric + ": %.4f" % val_perf)

    y_score = clf.predict_proba(X_heldout)[:, 1]
    y_label = clf.predict(X_heldout)
    make_challenge_submission(y_label, y_score)

## Main

## Load Data

In [None]:
# Unpack the four objects returned by get_challenge_data(): the labelled
# challenge features and labels, the held-out features, and the feature names.
# get_challenge_data is not defined in this notebook — presumably it comes
# from `from helper import *`; TODO confirm.
X_challenge, y_challenge, X_heldout, feature_names = get_challenge_data()

## Run the challenge and validate the submission

In [8]:
# Question 3: select and fit a classifier using AUROC as the selection metric,
# then apply it to the held-out features. run_challenge hands the predicted
# labels and scores to make_challenge_submission.
run_challenge(X_challenge, y_challenge, X_heldout, feature_names, "auroc")

Loading files from disk: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:19<00:00, 605.21it/s]
Generating feature vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [06:20<00:00, 31.55it/s]


Part 3: Challenge
alpha: 0.000100 	 penalty: l1         	 score: 0.8285
alpha: 0.000100 	 penalty: l2         	 score: 0.8274
alpha: 0.000100 	 penalty: elasticnet 	 score: 0.8292
alpha: 0.001000 	 penalty: l1         	 score: 0.8348
alpha: 0.001000 	 penalty: l2         	 score: 0.8335
alpha: 0.001000 	 penalty: elasticnet 	 score: 0.8342
alpha: 0.010000 	 penalty: l1         	 score: 0.7959
alpha: 0.010000 	 penalty: l2         	 score: 0.8252
alpha: 0.010000 	 penalty: elasticnet 	 score: 0.8211
alpha: 0.100000 	 penalty: l1         	 score: 0.7196
alpha: 0.100000 	 penalty: l2         	 score: 0.7992
alpha: 0.100000 	 penalty: elasticnet 	 score: 0.7732
alpha: 1.000000 	 penalty: l1         	 score: 0.5000
alpha: 1.000000 	 penalty: l2         	 score: 0.7464
alpha: 1.000000 	 penalty: elasticnet 	 score: 0.5000
alpha: 10.000000 	 penalty: l1         	 score: 0.5000
alpha: 10.000000 	 penalty: l2         	 score: 0.6212
alpha: 10.000000 	 penalty: elasticnet 	 score: 0.5000
alpha: 

In [11]:
# Repeat the experiment with F1 score as the selection metric.
# NOTE(review): every run_challenge call ends with make_challenge_submission,
# so this run presumably replaces the submission produced by the AUROC run —
# confirm which one is meant to be submitted.
run_challenge(X_challenge, y_challenge, X_heldout, feature_names, "f1_score")

Part 3: Challenge
alpha: 0.000100 	 penalty: l1         	 score: 0.4115
alpha: 0.000100 	 penalty: l2         	 score: 0.3110
alpha: 0.000100 	 penalty: elasticnet 	 score: 0.3395
alpha: 0.001000 	 penalty: l1         	 score: 0.3019
alpha: 0.001000 	 penalty: l2         	 score: 0.3295
alpha: 0.001000 	 penalty: elasticnet 	 score: 0.2842
alpha: 0.010000 	 penalty: l1         	 score: 0.0854
alpha: 0.010000 	 penalty: l2         	 score: 0.2363
alpha: 0.010000 	 penalty: elasticnet 	 score: 0.1600
alpha: 0.100000 	 penalty: l1         	 score: 0.0000
alpha: 0.100000 	 penalty: l2         	 score: 0.0064
alpha: 0.100000 	 penalty: elasticnet 	 score: 0.0000
alpha: 1.000000 	 penalty: l1         	 score: 0.0000
alpha: 1.000000 	 penalty: l2         	 score: 0.0000
alpha: 1.000000 	 penalty: elasticnet 	 score: 0.0000
alpha: 10.000000 	 penalty: l1         	 score: 0.0000
alpha: 10.000000 	 penalty: l2         	 score: 0.0000
alpha: 10.000000 	 penalty: elasticnet 	 score: 0.0000
alpha: 

In [12]:
# Validate the generated submission; the recorded output reports that the csv
# file is valid. test_challenge_output is not defined in this notebook —
# presumably it comes from `from helper import *`; TODO confirm.
test_challenge_output()

SUCCESS: csv file is valid.
