In [1]:
import numpy as np
import pandas as pd

# set url of dataset
train_values_url = "../dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Values.csv"
train_labels_url = "../dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Labels.csv"

In [2]:
from sklearn.model_selection import train_test_split

# load dataset
values_df = pd.read_csv(train_values_url)
labels_df = pd.read_csv(train_labels_url)

# concatenate values and labels
# https://pandas.pydata.org/pandas-docs/stable/merging.html
# dataset_df = pd.merge(values_df, labels_df, on=['patient_id', 'patient_id'])

# convert to nd-array
# dataset_nd = dataset_df.values

# print(dataset_nd)
# print(len(dataset_nd))
# print(len(dataset_nd[0]))

values_nd = values_df.values
labels_nd = labels_df.values

# optional
# drop null value
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
# dataset_df.dropna()
# Actually, nothing was dropped here.

# dataset_ndarr = dataset_df.values
ids_nd = values_nd[:, 0]
X_nd = values_nd[:, 1:]
# ids_nd, X_nd = np.split(values_df.values, [1], axis=1)
y_nd = labels_nd[:, -1]

# print(ids_nd)
# print(type(ids_nd))

# print(X_nd)
# print(type(X_nd))

# print(y_nd)
# print(type(y_nd))

In [3]:
# separate into values and labels
# ids, train = np.split(dataset_ndarr, [1], axis=1)
# values, labels = np.split(train, [-1], axis=1)

# flatten labels
# labels_list = labels.flatten().tolist()

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, Normalizer
from sklearn.pipeline import Pipeline

# one hot encoder for categoricalize
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features
ct = ColumnTransformer(
    [('enc', OneHotEncoder(), [1])],
    remainder='passthrough'
)

# normalizer
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
norm = Normalizer()

# build the preprocessing pipeline
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
prep = Pipeline(
    [('ct', ct), 
     ('norm', norm)]
)

# fit the preprocessing pipeline
prep.fit(X_nd)

# split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X_nd, y_nd, test_size=0.15)

# print(X_train)
# print(len(X_train))
# print(len(X_train[0]))
# print()

# print(X_test)
# print(len(X_test))
# print()

# print(y_train)
# print(len(y_train))
# print()

# print(y_test)
# print(len(y_test))
# print()

# transform on X_train and X_test
X_train_pp = prep.transform(X_train)
X_test_pp = prep.transform(X_test)

# print("pre-processed train X: ")
# print(X_train_pp)
# print(len(X_train_pp))
# print(len(X_train_pp[0]))
# print()

# convert y to list
y_train_list = list(y_train)
y_test_list = list(y_test)

In [16]:
# regressors
from sklearn.tree import DecisionTreeRegressor as DT
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor as NN
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
from sklearn.linear_model import LogisticRegression as LR
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression

# metrics
from sklearn.metrics import log_loss

models = {
    'DT': DT(),
    'NN': NN(),
    'LR': LR()
}
param_dict = {
    'DT': {
        'max_depth': [1,2,3,None],
        'max_leaf_nodes': [4,6,8,10,None],
        'min_samples_leaf': [1,2,3],
        'min_samples_split': [2,4,6]
    },
    'NN': {
        'hidden_layer_sizes': [1,3,5],
        'activation': ['logistic','tanh','relu'],
        'max_iter': [500,1000]
    },
    'LR': {
        'C': [0.0001, 0.001, 0.01, 1, 10],
        'penalty': ['l1','l2']
    }
}

# grid search cross validation
from sklearn.model_selection import GridSearchCV
model_name = 'DT'
model = models[model_name]
scorer = 'neg_mean_squared_log_error'
gscv = GridSearchCV(model, param_dict[model_name], cv=3, scoring=None)
gscv.fit(X_train_pp, y_train_list)

print("Best parameters set found on development set:")
print(gscv.best_params_)

# test on test data
y_true, y_pred = y_test_list, gscv.predict(X_test_pp)

print(y_true)
print(type(y_true))
print(y_pred)
print(type(y_pred))

# get metrics (log loss)
log_loss(y_true, y_pred)

Best parameters set found on development set:
{'max_depth': 1, 'max_leaf_nodes': 4, 'min_samples_leaf': 1, 'min_samples_split': 4}
[0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0]
<class 'list'>
[0.10769231 0.86956522 0.10769231 0.42307692 0.6875     0.86956522
 0.10769231 0.10769231 0.10769231 0.10769231 0.86956522 0.10769231
 0.10769231 0.10769231 0.10769231 0.10769231 0.86956522 0.10769231
 0.42307692 0.10769231 0.6875     0.86956522 0.10769231 0.42307692
 0.86956522 0.86956522 0.6875    ]
<class 'numpy.ndarray'>


0.5470294503877641

In [6]:
# predict on test data
test_values_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Test_Values.csv"
test_df = pd.read_csv(test_values_url)
testset_ndarr = test_df.values
test_ids, test_values = np.split(testset_ndarr, [1], axis=1)

# test model
processed_test_values = prep.transform(test_values)

print(processed_test_values)
print()

test_pred = gscv.predict(processed_test_values)

print(test_pred)

[[0.         0.         0.00266674 ... 0.15733779 0.424012   0.        ]
 [0.         0.00339223 0.         ... 0.11872803 0.61738575 0.        ]
 [0.         0.         0.00401573 ... 0.17267649 0.48188788 0.00401573]
 ...
 [0.         0.00295011 0.         ... 0.12390478 0.52512026 0.        ]
 [0.         0.00311077 0.         ... 0.14309541 0.47283701 0.00311077]
 [0.         0.00295372 0.         ... 0.12700996 0.50508612 0.        ]]

[0.86956522 0.10769231 0.86956522 0.10769231 0.6875     0.10769231
 0.10769231 0.86956522 0.10769231 0.10769231 0.10769231 0.86956522
 0.10769231 0.86956522 0.10769231 0.10769231 0.10769231 0.10769231
 0.86956522 0.10769231 0.86956522 0.10769231 0.6875     0.10769231
 0.42307692 0.86956522 0.86956522 0.10769231 0.6875     0.10769231
 0.86956522 0.86956522 0.42307692 0.10769231 0.10769231 0.10769231
 0.10769231 0.10769231 0.42307692 0.10769231 0.86956522 0.86956522
 0.86956522 0.10769231 0.42307692 0.10769231 0.10769231 0.10769231
 0.10769231 0.6875 

In [7]:
# write to csv
header = labels_df.columns
col_id_name = header[0]
col_label_name = header[1]

test_ids_list = list(test_ids.flatten())

res_dict = {col_id_name: test_ids_list,
            col_label_name: test_pred
           }
res_df = pd.DataFrame.from_dict(res_dict)

import time
# export to df
millis = int(round(time.time() * 1000))
output_path = "./" + str(millis) + ".csv"
res_df.to_csv(output_path, index=False)

# print(millis)
# print(type(millis))