Submission for heart-disease prediction program 

In [14]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [7]:
# functions to one hot encode
def categorize_label(x):
    """Return ``x`` converted to the pandas ``category`` dtype.

    ``x`` must expose ``astype`` (for example a pandas Series); the
    conversion result is returned and ``x`` itself is not reassigned here.
    """
    categorical = x.astype(dtype='category')
    return categorical

def one_hot_encode(df):
    """One-hot encode a frame that contains a ``thal`` column.

    When ``thal`` is present it is cast to the ``category`` dtype and the
    frame is passed through ``pd.get_dummies(..., drop_first=True)``, so the
    first level of every encoded column is dropped. The caller's frame is
    left untouched: the cast is applied to a copy produced by ``assign``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        A new, encoded frame when ``thal`` is a column; otherwise ``df``
        itself, unchanged.
    """
    if 'thal' not in df.columns:
        return df
    # ``assign`` returns a new frame, so the input keeps its original dtype
    # and re-running a cell that calls this function sees the same input.
    with_categorical_thal = df.assign(thal=df['thal'].astype('category'))
    return pd.get_dummies(with_categorical_thal, drop_first=True)

In [8]:
# Read the training features and the training labels; both CSV files are
# loaded with patient_id as the index.
train_values, train_labels = (
    pd.read_csv(csv_name, index_col='patient_id')
    for csv_name in ('train_values.csv', 'train_labels.csv')
)
train_values.head()

Unnamed: 0_level_0,slope_of_peak_exercise_st_segment,thal,resting_blood_pressure,chest_pain_type,num_major_vessels,fasting_blood_sugar_gt_120_mg_per_dl,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0z64un,1,normal,128,2,0,0,2,308,0.0,1,45,170,0
ryoo3j,2,normal,110,3,0,0,0,214,1.6,0,54,158,0
yt1s1x,1,normal,125,4,3,0,2,304,0.0,1,77,162,1
l2xjde,1,reversible_defect,152,4,0,0,0,223,0.0,1,40,181,0
oyt4ek,3,reversible_defect,178,1,0,0,2,270,4.2,1,59,145,0


In [9]:
# Remove the columns listed in to_drop, then one-hot encode what is left.
# to_drop stays a notebook-level name because the test data is cut down with
# the same list later on.
to_drop = ['fasting_blood_sugar_gt_120_mg_per_dl', 'slope_of_peak_exercise_st_segment']
train_values = one_hot_encode(train_values.drop(columns=to_drop))
train_values.head()

Unnamed: 0_level_0,resting_blood_pressure,chest_pain_type,num_major_vessels,resting_ekg_results,serum_cholesterol_mg_per_dl,oldpeak_eq_st_depression,sex,age,max_heart_rate_achieved,exercise_induced_angina,thal_normal,thal_reversible_defect
patient_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0z64un,128,2,0,2,308,0.0,1,45,170,0,1,0
ryoo3j,110,3,0,0,214,1.6,0,54,158,0,1,0
yt1s1x,125,4,3,2,304,0.0,1,77,162,1,1,0
l2xjde,152,4,0,0,223,0.0,1,40,181,0,0,1
oyt4ek,178,1,0,2,270,4.2,1,59,145,0,0,1


In [12]:
# Model selection: standardise the features, then fit a logistic regression
# whose hyper-parameters are picked by a 10-fold cross-validated grid search
# scored on negative log loss.
pipe = Pipeline(steps=[('scale', StandardScaler()),
                       ('logistic', LogisticRegression(solver='liblinear'))])

# NOTE(review): the C grid jumps from 0.01 straight to 1 (0.1 is absent) --
# confirm this is intentional.
param_grid = {'logistic__C': [0.0001, 0.001, 0.01, 1, 10],
              'logistic__penalty': ['l2'],
              'logistic__solver': ['liblinear', 'lbfgs']}
# `iid` is deliberately not passed: the parameter was deprecated and later
# removed from GridSearchCV, so supplying it raises TypeError on current
# scikit-learn releases.
cv = GridSearchCV(pipe, param_grid, scoring='neg_log_loss', cv=10)
X = train_values
# Pick labels by patient_id rather than by row position: train_test_split
# pairs X and y positionally, so a labels file in a different order would
# otherwise be matched to the wrong patients without any error.
y = train_labels.loc[X.index, 'heart_disease_present']
x_train, x_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.3, random_state=42, stratify=y)
cv.fit(x_train, y_train)

# Despite the variable name, these are predictions for the held-out 30% split
# (x_test), which the grid search never saw. The name is kept in case other
# cells refer to it.
in_sample_preds = np.round(cv.predict_proba(x_test), decimals=2)
print('Log Loss score {}'.format(log_loss(y_test, in_sample_preds)))

Log Loss score 0.3401689043685209


In [15]:
# Create the submission file: apply the training-time preprocessing to the
# test features, refit on all labelled data, and write the predictions.
test_values = pd.read_csv('test_values.csv', index_col='patient_id')
test_values_subset = test_values.drop(to_drop, axis=1)
test_values_subset = one_hot_encode(test_values_subset)
# Re-run the grid search on the full training data (X, y) before predicting.
cv.fit(X, y)
# Column 1 of predict_proba is the probability of the second entry in the
# fitted classes_ -- presumably heart_disease_present == 1; verify.
# NOTE(review): rounding to 2 decimals can turn small probabilities into
# exactly 0.0 or 1.0, which a log-loss metric penalises heavily when wrong --
# confirm the rounding is wanted.
predictions = np.round(cv.predict_proba(test_values_subset)[:, 1], decimals=2)
submission_format = pd.read_csv('submission_format.csv', index_col='patient_id')
# Label every prediction with the patient_id of the row it was computed from,
# then select rows in the order the submission format expects. Building the
# frame directly on submission_format.index would attach predictions by
# position and mislabel them if the two files were ordered differently;
# .loc raises KeyError if a required patient_id has no prediction.
my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=test_values_subset.index)
my_submission = my_submission.loc[submission_format.index]
my_submission.to_csv('submission.csv')
print(my_submission.head())


            heart_disease_present
patient_id                       
olalu7                       0.20
z9n6mx                       0.07
5k4413                       0.95
mrg7q5                       0.05
uki4do                       0.93
