In [11]:
from implementations import  *
import numpy as np
import matplotlib.pyplot as plt
from helpers import *
import seaborn as sns
plt.rcParams['text.usetex'] = False
import math
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

### Loading Data

In [2]:
# Load the full (non-subsampled) train/test feature matrices, the training
# labels, and the row ids of both sets.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(
    "data/dataset_to_release/",
    sub_sample=False,
)

In [3]:
# Map the labels to {0, 1}. Written as a comparison rather than (y + 1) / 2 so
# that re-running this cell leaves already-converted labels untouched.
# Assumes load_csv_data returns labels in {-1, 1} -- TODO confirm.
y_train = (y_train > 0).astype(float)

# Read only the header row of the training CSV and drop its first entry so the
# names line up with the columns of x_train (presumably the first entry is the
# Id column -- verify against load_csv_data).
# NOTE(review): this path ("data/dataset/") differs from the directory passed
# to load_csv_data ("data/dataset_to_release/") -- confirm both point at the
# same x_train.csv.
column_names = np.genfromtxt(
    "data/dataset/x_train.csv", delimiter=",", dtype=str, max_rows=1
)[1:]

# Drop every feature that is missing in more than a third of the training rows.
# The decision is made on the training set only and applied to both sets.
nan_counts = np.isnan(x_train).sum(axis=0)
columns_to_remove = np.where(nan_counts > x_train.shape[0] / 3)[0]

x_clean = np.delete(x_train, columns_to_remove, axis=1)
x_clean_test = np.delete(x_test, columns_to_remove, axis=1)
column_names_clean = np.delete(column_names, columns_to_remove, axis=0)

# Replace the remaining NaNs with the per-column median of the TRAINING data.
# The same training medians are used for the test set so that both sets go
# through an identical transformation (consistent with the standardisation
# step further down, which also reuses training statistics). This also keeps
# the imputation well defined if a test column happens to be entirely NaN.
medians = np.nanmedian(x_clean, axis=0)
x_clean = np.where(np.isnan(x_clean), medians, x_clean)
x_clean_test = np.where(np.isnan(x_clean_test), medians, x_clean_test)

assert np.isnan(x_clean).sum() == 0 and np.isnan(x_clean_test).sum() == 0, "There are still nan values in the data"



In [7]:
# Columns handed to one_hot_encoder below. The order of this list is kept
# exactly as originally written.
relevant_cat_features = [
    'BPHIGH4',
    '_RFHLTH', '_RFHYPE5', '_DRDXAR1', 'DIABETE3',
    'DIFFWALK', 'TOLDHI2', '_RFCHOL', 'HAVARTH3', 'QLACTLM2',
    'CVDSTRK3', 'PNEUVAC3',
    'CHCCOPD1', 'SMOKE100', 'SEX', 'DIFFALON', '_LMTACT1', 'CHCOCNCR', '_BMI5CAT',
    'DIFFDRES',
]

# Columns handed to concat_features below (used without one-hot encoding).
relevant_non_cat_features = [
    '_AGE80',
    'STRENGTH',
    'ALCDAY5',
    'WTKG3',
]

In [14]:
# One-hot encode the selected categorical columns, first for the cleaned
# training matrix and then for the cleaned test matrix, using the same column
# names and feature list for both.
# NOTE(review): the two matrices are encoded independently -- confirm that
# one_hot_encoder yields the same output columns for both.
x_train_one_hot, x_test_one_hot = (
    one_hot_encoder(features, column_names_clean, relevant_cat_features)
    for features in (x_clean, x_clean_test)
)


In [36]:
# Combine the one-hot matrices with the selected non-categorical columns taken
# from the cleaned data (presumably appended as extra columns -- see
# concat_features in helpers).
# NOTE(review): each result overwrites its own input, so running this cell
# twice feeds an already-combined matrix back into concat_features.
x_train_one_hot, x_test_one_hot = (
    concat_features(x_clean, x_train_one_hot, relevant_non_cat_features),
    concat_features(x_clean_test, x_test_one_hot, relevant_non_cat_features),
)

In [41]:
ratio = 0.8
seed = 1

# Hold out a validation split; `ratio` is the fraction kept for training.
(x_train_split, x_val_split,
 y_train_split, y_val_split) = train_test_split(
    x_train_one_hot,
    y_train,
    test_size=1 - ratio,
    random_state=seed,
)

# Apply SMOTE on the training split only (oversampling), then undersample the
# result. The validation split is left untouched by both steps.
x_train_smote, y_train_smote = SMOTE(
    x_train_split, y_train_split, k=5, ratio=0.5
)
x_train_under, y_train_under = undersample_majority(
    x_train_smote, y_train_smote, ratio=0.8
)

# Standardise the resampled training data, then scale the validation and test
# matrices with the mean/std returned for the training data.
x_stan_train, mean_x, std_x = standardize(x_train_under)
x_stan_val, x_stan_test = (
    (features - mean_x) / std_x
    for features in (x_val_split, x_test_one_hot)
)

(239373, 70)
undersampling over


In [54]:
# Build the design matrix used for training.
# NOTE(review): x_stan_train_extended is not defined in any cell shown above
# (the standardisation cell produces x_stan_train) -- make sure the cell that
# creates the *_extended matrices is part of the notebook so that
# Restart & Run All works.
y, tx_train = build_model_data(y_train_under, x_stan_train_extended)

# Draw the starting weights uniformly from [0, 1) using a generator seeded
# with the notebook-wide `seed`, so the optimisation trace and the resulting
# weights are reproducible from one run to the next.
rng = np.random.default_rng(seed)
initial_w = rng.random(tx_train.shape[1])

# Positional arguments after tx_train are 0.1, initial_w, 10000, 0.01
# (presumably lambda_, initial weights, max_iters, gamma -- confirm against
# reg_logistic_regression in implementations).
w, loss = reg_logistic_regression(y, tx_train, 0.1, initial_w, 10000, 0.01)

Initial loss=4.16434087847291
Current iteration=0, loss=8.293999789863093
Current iteration=100, loss=3.316741888822076
Current iteration=200, loss=2.230712704446871
Current iteration=300, loss=1.606999060060923
Current iteration=400, loss=1.2137099833866278
Current iteration=500, loss=0.9623781121465614
Current iteration=600, loss=0.8010096636372728
Current iteration=700, loss=0.6970369510892707
Current iteration=800, loss=0.6297492135826475
Current iteration=900, loss=0.5859579491260033
Current iteration=1000, loss=0.5573047328486493
Current iteration=1100, loss=0.5384677890623831
Current iteration=1200, loss=0.5260313999753541
Current iteration=1300, loss=0.517790877089902
Current iteration=1400, loss=0.5123150400847093
Current iteration=1500, loss=0.5086687069538858
Current iteration=1600, loss=0.5062370023901002
Current iteration=1700, loss=0.5046136205435933
Current iteration=1800, loss=0.503529072453409
Current iteration=1900, loss=0.5028041364659172
Current iteration=2000, loss

In [55]:
# Prepend a column of ones to the validation features.
# NOTE(review): x_stan_val_extended is not defined in any cell shown above --
# confirm where it is created.
val_tx = np.column_stack(
    (np.ones(x_stan_val_extended.shape[0]), x_stan_val_extended)
)

# Predict at a 0.5 threshold, then shift the predictions with (p + 1) / 2
# (presumably predict returns -1/1 and this maps them onto the 0/1 labels --
# TODO confirm).
y_pred_val = (predict(val_tx, w, threshold=0.5) + 1) / 2

print("Accuracy for Validation Set:", accuracy(y_val_split, y_pred_val))
print("F1 score for Validation Set:", compute_f1_score(y_val_split, y_pred_val))
# Sum of (prediction - label) over the validation set; with 0/1 values on both
# sides this is false positives minus false negatives.
print((y_pred_val - y_val_split).sum())

Accuracy for Validation Set: 0.7188047602358785
F1 score for Validation Set: 0.34759244856112564
16606.0


In [56]:
# Pick the decision threshold from the validation labels, validation design
# matrix and trained weights; the returned value is reused for the test
# predictions.
# NOTE(review): the threshold is tuned on the same validation split that the
# metrics are reported on, so those metrics are likely optimistic.
best_threshold = select_best_threshold(
    y_val_split,
    val_tx,
    w,
)

Best Threshold: 0.70
Accuracy for Validation Set at Best Threshold: 0.8556082100355037
F1 score for Validation Set at Best Threshold: 0.4099626400996264


In [58]:
# Prepend a column of ones to the test features, mirroring the validation
# design matrix.
# NOTE(review): x_stan_test_extended is not defined in any cell shown above --
# confirm where it is created.
test_tx = np.column_stack(
    (np.ones(x_stan_test_extended.shape[0]), x_stan_test_extended)
)

# Predict with the threshold chosen on the validation set and write the
# submission file.
y_pred = predict(test_tx, w, threshold=best_threshold)
create_csv_submission(test_ids, y_pred, 'submission.csv')