## Imports

In [12]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from joblib import Parallel, delayed
import random
from sklearn.metrics import confusion_matrix


from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

import hw3_main
from helper import *

random.seed(42)

## Run

In [2]:
def generate_feature_vector_challenge(df):
    static_variables = config['static']
    timeseries_variables = config['timeseries']
    feature_dict = {}

    for var in static_variables:
        val = df[df["Variable"] == var]["Value"].values[0]
        feature_dict[var] = val

        if val < 0:
            feature_dict[var] = np.nan
        else:
            feature_dict[var] = val

    # Time-varying variables
    for var in timeseries_variables:
        if (df['Variable'] == var).sum() == 0:
            fval = np.nan
            sval = np.nan
            norm_sd = np.nan
        else:
            fval = (df[(df["Variable"] == var) & (df["Time"].str[:2].astype(int) < 24)]["Value"]).mean()
            sval = (df[(df["Variable"] == var) & (df["Time"].str[:2].astype(int) >= 24)]["Value"]).mean()

            mean = (df[(df["Variable"] == var)]["Value"]).mean()
            if mean == 0:
                norm_sd = np.nan
            else:
                sd = (df[(df["Variable"] == var)]["Value"]).std()
                norm_sd = sd / mean

        feature_dict[f"f24_mean_{var}"] = fval
        feature_dict[f"s24_mean_{var}"] = sval
        feature_dict[f"norm_sd_{var}"] = norm_sd

    return feature_dict

In [3]:
def normalize_feature_matrix_challenge(X):
    return hw3_main.normalize_feature_matrix(X)

In [4]:
def impute_missing_values_challenge(X):
    for col in range(X.shape[1]):
        median = np.nanmedian(X[:, col])
        X[:, col] = np.where(np.isnan(X[:, col]), median, X[:, col])

    return X

In [5]:
def get_train_val_split(X: np.ndarray[float], y: np.ndarray[int]):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.20, stratify=y, random_state=69)
    return X_train, X_val, y_train, y_val

In [8]:
def run_challenge(X_challenge, y_challenge, X_heldout, feature_names, metric="auroc"):
    print("================= Part 3 ===================")
    print("Part 3: Challenge")

    X_train, X_val, y_train, y_val = get_train_val_split(X_challenge, y_challenge)
    assert X_train.shape[0] == y_train.size and X_val.shape[0] == y_val.size

    alpha_range = np.logspace(-4, 4, 9)
    penalties = ["l1", "l2", "elasticnet"]
    scores = []

    for alpha in alpha_range:
        for penalty in penalties:
            clf = SGDClassifier(loss="modified_huber", alpha=alpha, penalty=penalty)
            clf.fit(X_train, y_train)

            score = hw3_main.cv_performance(clf, X_train, y_train, 10, metric)
            print("alpha: {:.6f} \t penalty: {:10s} \t score: {:.4f}".format(alpha, penalty, score))
            scores.append((alpha, penalty, score))

    best = sorted(scores, key=lambda x: x[2], reverse=True)[0]
    clf = SGDClassifier(loss="modified_huber", alpha=best[0], penalty=best[1])
    clf.fit(X_train, y_train)

    test_perf = hw3_main.performance(clf, X_val, y_val, metric)
    print("alpha = " + str(best[0]) + "\npenalty = " + str(best[1]) +
          "\nTest Performance on metric " + metric + ": %.4f" % test_perf)

    metric_list = ["accuracy", "precision", "sensitivity", "specificity", "f1_score", "auroc", "auprc"]

    for metric in metric_list:
        test_perf = hw3_main.performance(clf, X_val, y_val, metric)
        print("Validation Performance on metric " + metric + ": %.4f" % test_perf)

    y_score = clf.predict_proba(X_heldout)[:, 1]
    y_label = clf.predict(X_heldout)
    make_challenge_submission(y_label, y_score)
    
    return clf

## Main

## Load Data

In [7]:
X_challenge, y_challenge, X_heldout, feature_names = get_challenge_data()

Loading files from disk: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [00:24<00:00, 487.44it/s]
Generating feature vectors: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12000/12000 [06:42<00:00, 29.80it/s]


## Run tests

In [9]:
# TODO: Question 3: Apply a classifier to heldout features, and then use
#       generate_challenge_labels to print the predicted labels
auroc_clf = run_challenge(X_challenge, y_challenge, X_heldout, feature_names, "auroc")

Part 3: Challenge
alpha: 0.000100 	 penalty: l1         	 score: 0.8275
alpha: 0.000100 	 penalty: l2         	 score: 0.8310
alpha: 0.000100 	 penalty: elasticnet 	 score: 0.8291
alpha: 0.001000 	 penalty: l1         	 score: 0.8325
alpha: 0.001000 	 penalty: l2         	 score: 0.8337
alpha: 0.001000 	 penalty: elasticnet 	 score: 0.8341
alpha: 0.010000 	 penalty: l1         	 score: 0.7958
alpha: 0.010000 	 penalty: l2         	 score: 0.8245
alpha: 0.010000 	 penalty: elasticnet 	 score: 0.8213
alpha: 0.100000 	 penalty: l1         	 score: 0.6991
alpha: 0.100000 	 penalty: l2         	 score: 0.8000
alpha: 0.100000 	 penalty: elasticnet 	 score: 0.7740
alpha: 1.000000 	 penalty: l1         	 score: 0.5000
alpha: 1.000000 	 penalty: l2         	 score: 0.7504
alpha: 1.000000 	 penalty: elasticnet 	 score: 0.5000
alpha: 10.000000 	 penalty: l1         	 score: 0.5000
alpha: 10.000000 	 penalty: l2         	 score: 0.6304
alpha: 10.000000 	 penalty: elasticnet 	 score: 0.5000
alpha: 

In [10]:
f1_clf = run_challenge(X_challenge, y_challenge, X_heldout, feature_names, "f1_score")

Part 3: Challenge
alpha: 0.000100 	 penalty: l1         	 score: 0.2494
alpha: 0.000100 	 penalty: l2         	 score: 0.3783
alpha: 0.000100 	 penalty: elasticnet 	 score: 0.3171
alpha: 0.001000 	 penalty: l1         	 score: 0.2833
alpha: 0.001000 	 penalty: l2         	 score: 0.3193
alpha: 0.001000 	 penalty: elasticnet 	 score: 0.2795
alpha: 0.010000 	 penalty: l1         	 score: 0.0849
alpha: 0.010000 	 penalty: l2         	 score: 0.2010
alpha: 0.010000 	 penalty: elasticnet 	 score: 0.1708
alpha: 0.100000 	 penalty: l1         	 score: 0.0000
alpha: 0.100000 	 penalty: l2         	 score: 0.0064
alpha: 0.100000 	 penalty: elasticnet 	 score: 0.0000
alpha: 1.000000 	 penalty: l1         	 score: 0.0000
alpha: 1.000000 	 penalty: l2         	 score: 0.0000
alpha: 1.000000 	 penalty: elasticnet 	 score: 0.0000
alpha: 10.000000 	 penalty: l1         	 score: 0.0000
alpha: 10.000000 	 penalty: l2         	 score: 0.0000
alpha: 10.000000 	 penalty: elasticnet 	 score: 0.0000
alpha: 

In [11]:
test_challenge_output()

SUCCESS: csv file is valid.


In [17]:
cm = confusion_matrix(f1_clf.predict(X_challenge), y_challenge, normalize='true')

In [19]:
cm

array([[0.88210121, 0.11789879],
       [0.32924962, 0.67075038]])