In [None]:
import numpy as np
import pandas as pd

# Relative locations of the three input CSV files: the test values, the
# train labels, and the train values (names taken from the file names).
test_values_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Test_Values.csv"
labels_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Labels.csv"
values_url = "./dataset/Warm_Up_Machine_Learning_with_a_Heart_-_Train_Values.csv"

In [245]:
# Load the train values, train labels, and test values into DataFrames.
# The path variables are defined as `values_url` / `labels_url`; the
# `train_values_url` / `train_labels_url` names used here before were never
# defined and raised NameError on a fresh kernel.
values_df = pd.read_csv(values_url)
labels_df = pd.read_csv(labels_url)
test_df = pd.read_csv(test_values_url)

# Join values and labels row-by-row on the shared `patient_id` column.
# https://pandas.pydata.org/pandas-docs/stable/merging.html
dataset_df = pd.merge(values_df, labels_df, on='patient_id')

# Drop rows that contain missing values. `dropna` returns a new DataFrame
# rather than modifying in place, so the result must be assigned back for the
# drop to have any effect.
# https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.dropna.html
dataset_df = dataset_df.dropna()

# Switch from DataFrames to plain ndarrays.
testset_ndarr = test_df.values
dataset_ndarr = dataset_df.values

# Peel off the first column as the id column; everything after it is kept
# as a 2-D array.
ids, train = dataset_ndarr[:, :1], dataset_ndarr[:, 1:]
test_ids, test_values = testset_ndarr[:, :1], testset_ndarr[:, 1:]

# For the training matrix, the last remaining column is the label and the
# columns before it are the feature values.
values, labels = train[:, :-1], train[:, -1:]

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer, OneHotEncoder

# Preprocessing pipeline:
#   1. one-hot encode feature column 1 and pass every other column through,
#   2. rescale the result with Normalizer.
# References:
# https://stackoverflow.com/questions/43588679/issue-with-onehotencoder-for-categorical-features
# https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html#sklearn.preprocessing.OneHotEncoder
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
# https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
#
# NOTE(review): Normalizer rescales each sample (row) to unit norm; it does
# not scale per feature column. Confirm that row-wise scaling is what is
# wanted here.
ct = ColumnTransformer(
    transformers=[('enc', OneHotEncoder(), [1])],
    remainder='passthrough',
)
norm = Normalizer()
ppl = Pipeline(steps=[('ct', ct), ('norm', norm)])

# Fit the pipeline on the training features and keep the transformed matrix.
preprocessed = ppl.fit_transform(values)

# Collapse the label array into a flat Python list.
labels_list = labels.ravel().tolist()

# Show the transformed training matrix.
print("preprocessed: ")
print(preprocessed)

preprocessed: 
[[0.         0.00265201 0.         ... 0.11934031 0.45084116 0.        ]
 [0.         0.00341409 0.         ... 0.18436095 0.53942649 0.        ]
 [0.         0.00267026 0.         ... 0.20560986 0.43258178 0.00267026]
 ...
 [0.         0.         0.00274843 ... 0.17589971 0.36004471 0.00274843]
 [0.         0.00297026 0.         ... 0.14257265 0.51979612 0.        ]
 [0.         0.00323596 0.         ... 0.17474173 0.52746114 0.        ]]


In [277]:
# Candidate estimators and their hyper-parameter grids.
# Note that DT and NN are regressors, while LR is a classifier.
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
from sklearn.linear_model import LogisticRegression as LR
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor as NN
from sklearn.tree import DecisionTreeRegressor as DT

# Fixed seed so that repeated runs (Restart & Run All) select the same model.
# Without it the tree's tie-breaking between equally good splits, the
# network's weight initialisation, and the sag/saga solvers all vary from run
# to run.
SEED = 0

models = {
    'DT': DT(random_state=SEED),
    'NN': NN(random_state=SEED),
    'LR': LR(random_state=SEED)
}
param_dict = {
    'DT': {
        'max_depth': [1,2,3,None],
        'max_leaf_nodes': [4,6,8,10,None],
        'min_samples_leaf': [1,2,3],
        'min_samples_split': [2,4,6]
    },
    'NN': {
        'hidden_layer_sizes': [1,3,5],
        'activation': ['logistic','tanh','relu'],
        'early_stopping': [True, False],
        'max_iter': [500,1000]
    },
    'LR': {
        'penalty': ['l2'],
        'solver': ['newton-cg','lbfgs','liblinear','sag','saga'],
        'max_iter': [100,200,500,1000]
    }
}

# Exhaustive grid search with 5-fold cross validation over the grid of the
# selected model, scored by negative mean squared error.
model_name = 'DT'
model = models[model_name]
gscv = GridSearchCV(model, param_dict[model_name],
                    cv=5, scoring='neg_mean_squared_error')
gscv.fit(preprocessed, labels_list)

print("Best parameters set found on development set:")
print(gscv.best_params_)


Best parameters set found on development set:
{'max_depth': 2, 'max_leaf_nodes': 6, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [278]:
# Run the already-fitted preprocessing pipeline on the test features
# (transform only, so nothing is re-fitted on the test set).
processed_test_values = ppl.transform(test_values)

# Show the transformed test matrix. The name printed here before,
# `processed_train_values`, is never defined and raised NameError on a
# fresh kernel.
print(processed_test_values)

# Predict with the fitted grid-search object.
test_pred = gscv.predict(processed_test_values)

print(test_pred)

[[0.         0.         0.00266674 ... 0.15733779 0.424012   0.        ]
 [0.         0.00339223 0.         ... 0.11872803 0.61738575 0.        ]
 [0.         0.         0.00401573 ... 0.17267649 0.48188788 0.00401573]
 ...
 [0.         0.00295011 0.         ... 0.12390478 0.52512026 0.        ]
 [0.         0.00311077 0.         ... 0.14309541 0.47283701 0.00311077]
 [0.         0.00295372 0.         ... 0.12700996 0.50508612 0.        ]]
[0.91304348 0.09090909 0.91304348 0.09090909 0.86666667 0.09090909
 0.09090909 0.91304348 0.09090909 0.09090909 0.09090909 0.91304348
 0.09090909 0.91304348 0.09090909 0.09090909 0.09090909 0.09090909
 0.91304348 0.09090909 0.91304348 0.09090909 0.26666667 0.09090909
 0.125      0.86666667 0.91304348 0.26666667 0.86666667 0.09090909
 0.91304348 0.91304348 0.86666667 0.09090909 0.09090909 0.09090909
 0.09090909 0.09090909 0.125      0.09090909 0.91304348 0.09090909
 0.91304348 0.09090909 0.125      0.09090909 0.09090909 0.09090909
 0.09090909 0.866666

In [280]:
# Build the result table, reusing the first two column names of the
# labels CSV as the output header.
header = labels_df.columns
col_id_name, col_label_name = header[0], header[1]

# Flatten the id array into a plain list.
test_ids_list = list(test_ids.ravel())

res_dict = {
    col_id_name: test_ids_list,
    col_label_name: test_pred,
}
res_df = pd.DataFrame(res_dict)

# Write the result table to CSV without the DataFrame index.
res_df.to_csv('./output.csv', index=False)