In [2]:
import numpy as np
import pandas as pd

# set url of dataset
train_values_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Values.csv"
train_labels_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Labels.csv"

In [3]:
# load dataset
values_df = pd.read_csv(train_values_url)
labels_df = pd.read_csv(train_labels_url)

# concatenate values and labels
# https://pandas.pydata.org/pandas-docs/stable/merging.html
# dataset_df = pd.merge(values_df, labels_df, on=['patient_id', 'patient_id'])

# convert to nd-array
# dataset_nd = dataset_df.values

# print(dataset_nd)
# print(len(dataset_nd))
# print(len(dataset_nd[0]))

values_nd = values_df.values
labels_nd = labels_df.values

# optional
# drop null value
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
# dataset_df.dropna()
# Actually, nothing was dropped here.

# dataset_ndarr = dataset_df.values
ids_nd = values_nd[:, 0]
X_nd = values_nd[:, 1:]
# ids_nd, X_nd = np.split(values_df.values, [1], axis=1)
y_nd = labels_nd[:, -1]

# print(ids_nd)
# print(type(ids_nd))

# print(X_nd)
# print(type(X_nd))

# print(y_nd)
# print(type(y_nd))

In [4]:
# separate into values and labels
# ids, train = np.split(dataset_ndarr, [1], axis=1)
# values, labels = np.split(train, [-1], axis=1)

# flatten labels
# labels_list = labels.flatten().tolist()

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.pipeline import Pipeline

# one hot encoder for categoricalize
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features
ct = ColumnTransformer(
    [('enc', OneHotEncoder(), [1])],
    remainder='passthrough'
)

# normalizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
norm = Normalizer()

# build the preprocessing pipeline
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
prep = Pipeline(
    [('ct', ct), 
     ('norm', norm)]
)

# fit the preprocessing pipeline
prep.fit(X_nd)

# split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_nd, y_nd, test_size=0.15)

# print(X_train)
# print(len(X_train))
# print(len(X_train[0]))
# print()

# print(X_test)
# print(len(X_test))
# print()

# print(y_train)
# print(len(y_train))
# print()

# print(y_test)
# print(len(y_test))
# print()

# transform on X_train and X_test
X_train_pp = prep.transform(X_train)
X_test_pp = prep.transform(X_test)

# print("pre-processed train X: ")
# print(X_train_pp)
# print(len(X_train_pp))
# print(len(X_train_pp[0]))
# print()

# convert y to list
y_train_list = list(y_train)
y_test_list = list(y_test)

In [5]:
# regressors
from sklearn.tree import DecisionTreeRegressor as DT
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor as NN
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
from sklearn.linear_model import LogisticRegression as LR
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression

from sklearn.neural_network import MLPClassifier as MLP
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.ensemble import BaggingClassifier as Bagging
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.ensemble import AdaBoostClassifier as AdaBoost
from sklearn.ensemble import GradientBoostingClassifier as GB
from xgboost import XGBClassifier as XGB
# conda install -c conda-forge xgboost

# metrics
from sklearn.metrics import log_loss, accuracy_score

models = {
    'DT': DT(),
    'NN': NN(),
    'LR': LR(),
    'MLP': MLP(),
    'SVC': SVC(),
    'NB': NB(),
    'KNN': KNN(),
    'Bagging': Bagging(),
    'RF': RF(),
    'AdaBoost': AdaBoost(),
    'GB': GB(),
    'XGB': XGB(),
}
param_dict = {
    # 0.67 {'max_depth': 1, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2}
    'DT': {
        'max_depth': [1,2,3,None],
        'max_leaf_nodes': [4,6,8,10,None],
        'min_samples_leaf': [1,2,3],
        'min_samples_split': [2,4,6]
    },
    # 0.40 {'activation': 'tanh', 'hidden_layer_sizes': (100, 100, 50), 'max_iter': 2000}
    'NN': {
        'hidden_layer_sizes': [(100,100,50)],
        'activation': ['relu', 'tanh'],
        'max_iter': [500, 1000, 2000]
    },
    # 12.79 {'C': 5.0, 'class_weight': 'balanced', 'fit_intercept': True, 'penalty': 'l2', 'solver': 'lbfgs'}
    'LR': {
        "solver": ['lbfgs', 'liblinear', 'sag', 'saga'],
        "penalty": ['l2'],
        "C": [1.0, 1.5, 2.0, 5.0],
        "fit_intercept": [True, False],
        "class_weight": [None, 'balanced']
    },
    # 10.23 {'activation': 'relu', 'early_stopping': False, 'hidden_layer_sizes': 5, 'learning_rate': 'adaptive', 'max_iter': 1000}
    'MLP': {
        "max_iter": [1000, 2000],
       "hidden_layer_sizes": [5],
       "activation": ['tanh', 'relu'],
       "learning_rate": ['constant', 'invscaling', 'adaptive'],
       "early_stopping": [True, False],
    },
    # 8.95 {'C': 10000, 'coef0': 0.0, 'gamma': 'scale', 'kernel': 'sigmoid', 'shrinking': True}
    'SVC': {
        "C": [5000, 10000, 20000, 30000],
        "kernel": ["poly", "rbf", "sigmoid"],
        "coef0": [0.0, 0.1, 0.2, 0.3, 0.5],
        "shrinking": [True, False],
        "gamma": ['scale', 'auto']
    },
    # 6.39 {}
    'NB': {
        # Nothing can be tuned
    },
    # 11.51 {'algorithm': 'auto', 'n_neighbors': 3, 'p': 1, 'weights': 'distance'}
    'KNN': {
        "n_neighbors": [1, 2, 3, 5, 7],
        "weights": ['uniform', 'distance'],
        "algorithm": ['auto', 'ball_tree', 'kd_tree', 'brute'],
        "p": [1, 2, 3]
    },
    # 8.95 {'max_features': 1.0, 'max_samples': 0.1, 'n_estimators': 50}
    'Bagging': {
          "n_estimators": [10, 20, 50, 100],
          "max_samples": [0.01, 0.1, 0.3],
          "max_features": [0.5, 0.8, 1.0]
    },
    # 7.67 {'criterion': 'entropy', 'max_depth': 20, 'max_features': 'auto', 'n_estimators': 20}
    'RF': {
        "n_estimators": [10, 20, 50],
        "criterion": ["gini", "entropy"],
        "max_depth": [None, 5, 10, 20],
        "max_features": [None, "auto", "log2"]
    },
    # 7.67 {'learning_rate': 0.5, 'max_depth': 10, 'max_features': 'log2', 'n_estimators': 100}
    'GB': {
          "learning_rate": [0.05, 0.1, 0.2, 0.5, 1.0],
          "n_estimators": [10, 50, 100, 200],
          "max_depth": [5, 10],
          "max_features": [None, "auto", "log2"]
    },
    # 7.67
    'AdaBoost': {
        "n_estimators": [20, 50, 100, 1000],
        "learning_rate": [0.01, 0.1, 1],
        "algorithm": ["SAMME", "SAMME.R"],
    },
    # 8.9 {'booster': 'gbtree', 'learning_rate': 0.1, 'min_child_weight': 5, 'n_estimators': 100}
    'XGB': {
        "learning_rate": [0.01, 0.1],
        "n_estimators": [5, 10, 20, 100],
        "min_child_weight": [3, 5, 10],
        "booster": ['gbtree', 'gblinear', 'dart']
    }
}

# grid search cross validation
from sklearn.model_selection import GridSearchCV
model_name = 'NN'
model = models[model_name]
scorer = 'neg_mean_squared_log_error'
gscv = GridSearchCV(model, param_dict[model_name], cv=5, scoring=None)
gscv.fit(X_train_pp, y_train_list)

print("Best parameters set found on development set:")
print(gscv.best_params_)

# test on test data
y_true, y_pred = y_test_list, gscv.predict(X_test_pp)

# print(y_true)
# print(type(y_true))
# print(y_pred)
# print(type(y_pred))

# get metrics (log loss)
print("Log Loss:", log_loss(y_true, y_pred))
# print("Accuracy Score:", accuracy_score(y_true, y_pred))



Best parameters set found on development set:
{'activation': 'tanh', 'hidden_layer_sizes': (100, 100, 50), 'max_iter': 1000}
Log Loss: 0.28924956662392204


In [6]:
# predict on test data
test_values_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Test_Values.csv"
test_df = pd.read_csv(test_values_url)
testset_ndarr = test_df.values
test_ids, test_values = np.split(testset_ndarr, [1], axis=1)

# test model
processed_test_values = prep.transform(test_values)

print(processed_test_values)
print()

test_pred = gscv.predict(processed_test_values)

print(test_pred)

[[0.         0.         0.00266674 ... 0.15733779 0.424012   0.        ]
 [0.         0.00339223 0.         ... 0.11872803 0.61738575 0.        ]
 [0.         0.         0.00401573 ... 0.17267649 0.48188788 0.00401573]
 ...
 [0.         0.00295011 0.         ... 0.12390478 0.52512026 0.        ]
 [0.         0.00311077 0.         ... 0.14309541 0.47283701 0.00311077]
 [0.         0.00295372 0.         ... 0.12700996 0.50508612 0.        ]]

[ 0.38815372  0.19595794  1.19935958  0.10200492  0.99612034  0.08124309
  0.25992542  0.89852452  0.21095232  0.28180369  0.57392963  0.53887786
  0.33316518  0.94602252  0.0328443   0.07228183 -0.09256281 -0.09747319
  0.81914079  0.02722626  0.95006084  0.35167875  0.15234588 -0.13384188
  0.39343903  1.04998085  0.43082913  0.22477232  0.47591777  0.04014866
  0.87753542  0.30488802  0.5091827   0.5825555   0.21816872  0.10633718
  0.28128012  0.3813328   0.13910868  0.07736251  0.85359593  0.04766525
  0.90002043  0.06665682  0.88960983  0.2100

In [7]:
# write to csv
header = labels_df.columns
col_id_name = header[0]
col_label_name = header[1]

test_ids_list = list(test_ids.flatten())

res_dict = {col_id_name: test_ids_list,
            col_label_name: test_pred
           }
res_df = pd.DataFrame.from_dict(res_dict)

import time
# export to df
millis = int(round(time.time() * 1000))
output_path = "./" + str(millis) + ".csv"
res_df.to_csv(output_path, index=False)

# print(millis)
# print(type(millis))