In [12]:
from sklearn.linear_model import Perceptron
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

This notebook predicts cardiovascular disease from two features: systolic and diastolic blood pressure. A label of 1 means the person has cardiovascular disease, and 0 means they do not. The dataset contains approximately 70,000 records, which are split 70/30 into training and test sets.

In [21]:
# Read the raw table and take its NumPy view so columns can be picked by position.
train = pd.read_csv("cardio_train.csv")
a = train.to_numpy()

# Positional column picks.
# NOTE(review): the notebook text says the features are systolic/diastolic
# blood pressure and the target is the 0/1 disease flag -- confirm that
# indices 5, 6 and 12 match those columns in the CSV header.
feature_cols = [5, 6]
target_col = 12
X = a[:, feature_cols]
y = a[:, target_col].astype('int')

# Stratified 70/30 hold-out split with a fixed seed: stratifying on y keeps the
# class proportions the same in both parts, and the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=1, test_size=0.3
)


In [22]:

# Three candidate models. Every pipeline standardises the features first and
# then hands them to its classifier.
pipe1 = make_pipeline(
    StandardScaler(),
    Perceptron(eta0=0.1, random_state=1),
)
pipe2 = make_pipeline(
    StandardScaler(),
    LogisticRegression(C=2000, random_state=5, solver='lbfgs'),
)
pipe3 = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=20, p=3, metric='minkowski'),
)

# Display names, in the same order as the pipelines above.
clf_labels = ['Perceptron', 'Logestic Regression', 'KNN']

# Score each pipeline with 10-fold cross-validation on the training split only;
# the test split is not touched here.
print('10-fold cross validation:\n')
for clf, label in zip((pipe1, pipe2, pipe3), clf_labels):
    scores = cross_val_score(clf, X_train, y_train, scoring='accuracy', cv=10)
    mean_acc = round(scores.mean(), 2)
    spread = round(scores.std(), 3)
    # !s forces str() so the rounded values print without padded zeros.
    print(f"Accuracy: {mean_acc!s} Stdev: {spread!s} [{label}]")

10-fold cross validation:

Accuracy: 0.57 Stdev: 0.092 [Perceptron]
Accuracy: 0.71 Stdev: 0.006 [Logestic Regression]
Accuracy: 0.71 Stdev: 0.01 [KNN]


In [23]:

# Majority-vote ensemble over the three pipelines.
# - pipe2 wraps LogisticRegression, so it is registered as 'lr' rather than the
#   misleading key 'dt'.
# - Hard voting is spelled out because Perceptron exposes no predict_proba,
#   which rules out soft (probability-averaging) voting for this combination.
mv_clf = VotingClassifier(
    estimators=[('p', pipe1), ('lr', pipe2), ('kn', pipe3)],
    voting='hard',
)

# Guarded append keeps this cell idempotent: re-running it must not pile up
# duplicate 'Majority voting' entries in the shared clf_labels list.
if 'Majority voting' not in clf_labels:
    clf_labels.append('Majority voting')
all_clf = [pipe1, pipe2, pipe3, mv_clf]

# 10-fold cross-validated accuracy on the training split for every base
# pipeline and for the ensemble, printed in one comparable table.
for clf, label in zip(all_clf, clf_labels):
    scores = cross_val_score(estimator=clf,
                             X=X_train,
                             y=y_train,
                             cv=10,
                             scoring='accuracy')
    print("Accuracy: " + str(round(scores.mean(), 2)) +
          " Stdev: " + str(round(scores.std(), 3)) +
          " [" + label + "]")

Accuracy: 0.57 Stdev: 0.092 [Perceptron]
Accuracy: 0.71 Stdev: 0.006 [Logestic Regression]
Accuracy: 0.71 Stdev: 0.01 [KNN]
Accuracy: 0.71 Stdev: 0.009 [Majority voting]


In [24]:
# Fit the perceptron pipeline on the training split, then evaluate it on the
# held-out test split.
pipe1.fit(X_train, y_train)

y_pred = pipe1.predict(X_test)
# Accuracy is taken from the predictions already in hand: calling
# pipe1.score(X_test, y_test) would repeat the whole prediction pass only to
# arrive at the same fraction of correct labels.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', y_test.shape[0])
print('Accuracy:', (y_test == y_pred).mean())

Misclassified test set examples: 8642
Out of a total of: 21000
Accuracy: 0.5884761904761905


In [25]:
# Fit the logistic-regression pipeline on the training split, then evaluate it
# on the held-out test split.
pipe2.fit(X_train, y_train)

y_pred = pipe2.predict(X_test)
# Accuracy is taken from the predictions already in hand: calling
# pipe2.score(X_test, y_test) would repeat the whole prediction pass only to
# arrive at the same fraction of correct labels.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', y_test.shape[0])
print('Accuracy:', (y_test == y_pred).mean())

Misclassified test set examples: 5999
Out of a total of: 21000
Accuracy: 0.7143333333333334


In [26]:
# Fit the k-nearest-neighbours pipeline on the training split, then evaluate it
# on the held-out test split.
pipe3.fit(X_train, y_train)

y_pred = pipe3.predict(X_test)
# Accuracy is taken from the predictions already in hand: calling
# pipe3.score(X_test, y_test) would repeat the neighbour search for every test
# row only to arrive at the same fraction of correct labels.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', y_test.shape[0])
print('Accuracy:', (y_test == y_pred).mean())

Misclassified test set examples: 6451
Out of a total of: 21000
Accuracy: 0.6928095238095238


In [27]:
# Fit the voting ensemble (which refits each member pipeline) on the training
# split, then evaluate it on the held-out test split.
mv_clf.fit(X_train, y_train)

y_pred = mv_clf.predict(X_test)
# Accuracy is taken from the predictions already in hand: calling
# mv_clf.score(X_test, y_test) would make every member predict the whole test
# set again only to arrive at the same fraction of correct labels.
print('Misclassified test set examples:', (y_test != y_pred).sum())
print('Out of a total of:', y_test.shape[0])
print('Accuracy:', (y_test == y_pred).mean())

Misclassified test set examples: 5988
Out of a total of: 21000
Accuracy: 0.7148571428571429


The majority-voting ensemble and the logistic regression pipeline performed best on the test set (about 71.5% and 71.4% accuracy, respectively), with the k-nearest neighbours pipeline close behind at about 69.3%.