In [221]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Models
In this section, different machine learning algorithms are evaluated and compared. I first split the data, before fitting any model, so that the models can be compared fairly on the same training and the same test data.

In [191]:
# Read 'prepeddata.csv' into the working DataFrame.
df = pd.read_csv(filepath_or_buffer='prepeddata.csv')

In [192]:
# Show the loaded frame as the cell output to eyeball its columns and values.
df

Unnamed: 0,age,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,39,195,106.0,70.0,26.97,80,77,0
1,46,250,121.0,81.0,28.73,95,76,0
2,48,245,127.5,80.0,25.34,75,70,0
3,61,225,150.0,95.0,28.58,65,103,1
4,46,285,130.0,84.0,23.10,85,85,0
...,...,...,...,...,...,...,...,...
3650,58,187,141.0,81.0,24.96,80,81,0
3651,68,176,168.0,97.0,23.14,60,79,1
3652,50,313,179.0,92.0,25.97,66,86,1
3653,51,207,126.5,80.0,19.71,65,68,0


In [193]:
# Feature matrix: every column except the target. `columns=` replaces the
# positional axis argument (`drop(label, 1)`), which pandas 2.0 no longer accepts.
X = df.drop(columns='TenYearCHD')
# 'Unnamed: 0' is the row index that was written out with the CSV, not a
# measurement, so it must not be fed to the models as a predictor.
# errors='ignore' keeps the cell working if the file is re-exported without it.
X = X.drop(columns='Unnamed: 0', errors='ignore')
# Target vector: the TenYearCHD label column.
yval = df['TenYearCHD']

In [194]:
# Hold out 30% of the rows as a test set (a single hold-out split, not
# cross-validation). The seed is fixed so that the split, and every score
# computed from it, is reproducible after a kernel restart.
Xtr, Xts, ytr, yts = train_test_split(X, yval, test_size=0.3, random_state=0)

## Logistic Regression

In [195]:
# Build the logistic-regression classifier (iteration cap raised to 1000)
# and train it on the training split. fit() returns the estimator itself,
# so construction and training can be chained.
model = LogisticRegression(max_iter=1000).fit(Xtr, ytr)
# Echo the fitted estimator as the cell output.
model

LogisticRegression(max_iter=1000)

In [196]:
# Predict labels for the training rows first, then for the held-out rows.
predictionmodel, predictiontest = (model.predict(part) for part in (Xtr, Xts))

In [197]:
# Accuracy on the training split. accuracy_score is documented as
# accuracy_score(y_true, y_pred), so the ground-truth labels go first
# (same argument order as the other evaluation cells in this notebook).
print('Training Accuracy:')
print(accuracy_score(ytr, predictionmodel))
# Count how many rows were predicted for each class; a near-empty positive
# class means the model is mostly predicting the majority label.
unique, counts = np.unique(predictionmodel, return_counts=True)
print(dict(zip(unique, counts)))
# Same two figures for the held-out test split.
print('Training Accuracy on Test Data:')
print(accuracy_score(yts, predictiontest))
unique, counts = np.unique(predictiontest, return_counts=True)
print(dict(zip(unique, counts)))

Training Accuracy:
0.8483189992181391
{0: 2531, 1: 27}
Training Accuracy on Test Data:
0.8523245214220602
{0: 1082, 1: 15}


## PCA
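PCA (Principal Component Analysis) is a dimensionality-reduction technique: it projects the features onto a small number of orthogonal directions that capture most of the variance. It is sometimes used as a preprocessing step for machine learning classification models, where it can help to increase the precision and recall of the algorithm.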

In [198]:
from sklearn.decomposition import PCA

# Rebuild the feature matrix from df instead of reading the X that this cell
# overwrites; otherwise re-running the cell would apply PCA to its own output.
X = df.drop(columns='TenYearCHD')
# Drop the saved row index: PCA is driven by variance, and this column's range
# is far larger than that of any real measurement in the frame.
X = X.drop(columns='Unnamed: 0', errors='ignore')
# NOTE(review): the projection is fitted on all rows before the train/test
# split, so the test rows influence the components. Fitting on the training
# split only would need the split cell to move above this one.
pca = PCA(n_components=2)
pca.fit(X)
# Replace X with its two-component projection, which the next cells consume.
X = pca.transform(X)

In [199]:
# Put the two PCA components and the label side by side in one frame.
# np.c_ stacks them column-wise into a single array, so the label column
# takes on the same dtype as the components.
pcadf = pd.DataFrame(
    np.c_[X, yval],
    columns=['Feature 1', 'Feature 2', 'Label'],
)

In [200]:
# 70/30 split of the PCA features and their labels with a fixed seed.
# `columns=` replaces the positional axis argument (`drop(label, 1)`),
# which pandas 2.0 no longer accepts.
X_train, X_test, y_train, y_test = train_test_split(
    pcadf.drop(columns='Label'),
    pcadf['Label'],
    test_size=0.3,
    random_state=0,
)

In [201]:
# Logistic regression with default settings, trained on the two PCA features.
model2 = LogisticRegression()
# Last expression of the cell: the fitted estimator is shown as the output.
model2.fit(X=X_train, y=y_train)

LogisticRegression()

In [202]:
# Training split: predict, score, and count the predictions per class.
# accuracy_score is documented as accuracy_score(y_true, y_pred), so the
# ground-truth labels go first (same order as the later evaluation cells).
predictstrain = model2.predict(X_train)
print('Training Accuracy:')
print(accuracy_score(y_train, predictstrain))
unique, counts = np.unique(predictstrain, return_counts=True)
print(dict(zip(unique, counts)))
# Held-out split: the same three steps.
predictstest = model2.predict(X_test)
print('Training Accuracy on Test Data:')
print(accuracy_score(y_test, predictstest))
unique, counts = np.unique(predictstest, return_counts=True)
print(dict(zip(unique, counts)))

Training Accuracy:
0.8451915559030493
{0.0: 2531, 1.0: 27}
Training Accuracy on Test Data:
0.8587055606198724
{0.0: 1091, 1.0: 6}


Dimensionality reduction did not help to solve the problem.

# Random Forest

In [203]:
from sklearn.ensemble import RandomForestClassifier

In [204]:
# Read 'booldata.csv'; from here on `df` refers to this file's contents.
df = pd.read_csv(filepath_or_buffer='booldata.csv')

In [205]:
# Feature matrix: every column except the target. `columns=` replaces the
# positional axis argument (`drop(label, 1)`), which pandas 2.0 no longer accepts.
# NOTE(review): if this file also carries a saved row-index column
# (e.g. 'Unnamed: 0'), it should be dropped here as well — confirm against the CSV.
X = df.drop(columns='TenYearCHD')
# Target vector: the TenYearCHD label column.
yval = df['TenYearCHD']

In [206]:
# 50/50 train/test split. The seed is fixed so that every model below is
# trained and scored on the same rows each time the notebook is re-run.
Xtr, Xts, ytr, yts = train_test_split(X, yval, test_size=0.5, random_state=0)

In [207]:
# Shallow random forest (trees limited to depth 2). Random forests draw
# bootstrap samples and feature subsets at random, so the seed is fixed to
# make the reported scores reproducible.
model = RandomForestClassifier(max_depth=2, random_state=0)

In [208]:
# Train on the training split; the fitted estimator is echoed as the cell output.
model.fit(X=Xtr, y=ytr)

RandomForestClassifier(max_depth=2)

In [209]:
# Predicted labels for the training rows and for the held-out rows, in that order.
prediction, predictiontest = (model.predict(rows) for rows in (Xtr, Xts))

In [210]:
# Training split: accuracy, then how many rows were assigned to each class.
print('Training Accuracy:', accuracy_score(ytr, prediction), sep='\n')
unique, counts = np.unique(prediction, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})
# Held-out split: the same two figures.
print('Training Accuracy on Test Data:', accuracy_score(yts, predictiontest), sep='\n')
unique, counts = np.unique(predictiontest, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})

Training Accuracy:
0.8483853311439519
{0: 1827}
Training Accuracy on Test Data:
0.8473741794310722
{0: 1828}


# Hyperparameter Tuning

max_depth indicates how deep the tree can be. The deeper the tree, the more splits it has and it captures more information about the data.

In [211]:
# Random forest with trees limited to depth 4. The seed is fixed so that a
# change in score between depth settings reflects max_depth, not a different
# random draw of bootstrap samples and feature subsets.
model = RandomForestClassifier(max_depth=4, random_state=0)
model.fit(Xtr, ytr)
# Predictions on the training rows and on the held-out rows.
prediction = model.predict(Xtr)
predictiontest = model.predict(Xts)

In [212]:
# Report for the training rows: accuracy followed by per-class prediction counts.
print('Training Accuracy:')
print(accuracy_score(y_true=ytr, y_pred=prediction))
unique, counts = np.unique(prediction, return_counts=True)
print({cls: cnt for cls, cnt in zip(unique, counts)})
# Report for the held-out rows.
print('Training Accuracy on Test Data:')
print(accuracy_score(y_true=yts, y_pred=predictiontest))
unique, counts = np.unique(predictiontest, return_counts=True)
print({cls: cnt for cls, cnt in zip(unique, counts)})

Training Accuracy:
0.8516694033935414
{0: 1821, 1: 6}
Training Accuracy on Test Data:
0.8479212253829321
{0: 1827, 1: 1}


In [213]:
# Random forest with trees limited to depth 16. The seed is fixed so that a
# change in score between depth settings reflects max_depth, not a different
# random draw of bootstrap samples and feature subsets.
model = RandomForestClassifier(max_depth=16, random_state=0)
model.fit(Xtr, ytr)
# Predictions on the training rows and on the held-out rows.
prediction = model.predict(Xtr)
predictiontest = model.predict(Xts)

In [214]:
# Training rows: accuracy, then the number of predictions per class.
print('Training Accuracy:', accuracy_score(ytr, prediction), sep='\n')
unique, counts = np.unique(prediction, return_counts=True)
print({cls: cnt for cls, cnt in zip(unique, counts)})
# Held-out rows: accuracy, then the number of predictions per class.
print('Training Accuracy on Test Data:', accuracy_score(yts, predictiontest), sep='\n')
unique, counts = np.unique(predictiontest, return_counts=True)
print({cls: cnt for cls, cnt in zip(unique, counts)})

Training Accuracy:
0.9939792008757526
{0: 1561, 1: 266}
Training Accuracy on Test Data:
0.8435448577680525
{0: 1789, 1: 39}


# Overfitting!
After this point the model has an overfitting problem: training accuracy keeps rising while accuracy on the test data does not improve.

In [215]:
# Random forest with trees limited to depth 32. The seed is fixed so that a
# change in score between depth settings reflects max_depth, not a different
# random draw of bootstrap samples and feature subsets.
model = RandomForestClassifier(max_depth=32, random_state=0)
model.fit(Xtr, ytr)
# Predictions on the training rows and on the held-out rows.
prediction = model.predict(Xtr)
predictiontest = model.predict(Xts)

In [216]:
# Score the training predictions and tally them per class.
print('Training Accuracy:')
print(accuracy_score(y_true=ytr, y_pred=prediction))
unique, counts = np.unique(prediction, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})
# Score the held-out predictions and tally them per class.
print('Training Accuracy on Test Data:')
print(accuracy_score(y_true=yts, y_pred=predictiontest))
unique, counts = np.unique(predictiontest, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})

Training Accuracy:
1.0
{0: 1550, 1: 277}
Training Accuracy on Test Data:
0.8419037199124726
{0: 1786, 1: 42}


# KNeighborsClassifier

Since this is a binary classification problem, we do not need many neighbors. Increasing the number of neighbors did not help to improve model performance.

In [226]:
from sklearn.neighbors import KNeighborsClassifier

In [227]:
# Two-nearest-neighbours classifier trained on the training split.
# fit() returns the estimator itself, so construction and training are chained.
model = KNeighborsClassifier(n_neighbors=2).fit(Xtr, ytr)
# Echo the fitted estimator as the cell output.
model

KNeighborsClassifier(n_neighbors=2)

In [228]:
# Predicted labels for the training rows, then for the held-out rows.
prediction, predictiontest = map(model.predict, (Xtr, Xts))

In [229]:
# Training rows: accuracy and the number of predictions per class.
print('Training Accuracy:', accuracy_score(ytr, prediction), sep='\n')
unique, counts = np.unique(prediction, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})
# Held-out rows: accuracy and the number of predictions per class.
print('Training Accuracy on Test Data:', accuracy_score(yts, predictiontest), sep='\n')
unique, counts = np.unique(predictiontest, return_counts=True)
print({label: n_rows for label, n_rows in zip(unique, counts)})

Training Accuracy:
0.8680897646414888
{0: 1791, 1: 36}
Training Accuracy on Test Data:
0.8386214442013129
{0: 1798, 1: 30}
