In [None]:
# Walk the data directory and print the full path of every file found in it
# (these are the train/test data files available to the notebook).
import os
for root, _, names in os.walk('D:/Machine-Learning/data'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

D:/Machine-Learning/data\data.csv


## Import the libraries and the dataset

In [None]:
# Import the pandas and NumPy libraries, used to manipulate our data
import pandas as pd
import numpy as np

# To preprocess our data
from sklearn.preprocessing import LabelEncoder

# To fill missing values
from sklearn.impute import SimpleImputer

# To Split our train data
from sklearn.model_selection import train_test_split

# To Visualize Data
import matplotlib.pyplot as plt
import seaborn as sns

# To Train our data
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB

# To evaluate end result we have
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score


# Load the dataset into a pandas DataFrame.
# NOTE(review): the file read here is data.csv; presumably this is the
# coronary-artery dataset under a different file name -- confirm.
csv_path = "D:/Machine-Learning/data/data.csv"
df = pd.read_csv(csv_path)

## Exploring the dataset that we have

In [None]:
# Preview the first five rows of the dataset
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,Male,asymptomatic,145,233,high,normal,150,no,2.3,0,0,fixed deffect,1
1,37,Male,non anginal pain,130,250,normal,S-T abnormanilty,187,no,3.5,0,0,reversible deffect,1
2,41,Female,atypical angina,130,204,normal,normal,172,no,1.4,2,0,reversible deffect,1
3,56,Male,atypical angina,120,236,normal,S-T abnormanilty,178,no,0.8,2,0,reversible deffect,1
4,57,Female,typical angina,120,354,normal,S-T abnormanilty,163,yes,0.6,2,0,reversible deffect,1


In [None]:
# Report how many rows (samples) the dataset holds
print("Rows:", df.shape[0])

Rows: 303


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
# The quartiles are spelled out explicitly; they are the same ones describe() uses by default.
df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,131.623762,246.264026,149.646865,1.039604,1.39934,0.729373,0.544554
std,9.082101,17.538143,51.830751,22.905161,1.161075,0.616226,1.022606,0.498835
min,29.0,94.0,126.0,71.0,0.0,0.0,0.0,0.0
25%,47.5,120.0,211.0,133.5,0.0,1.0,0.0,0.0
50%,55.0,130.0,240.0,153.0,0.8,1.0,0.0,1.0
75%,61.0,140.0,274.5,166.0,1.6,2.0,1.0,1.0
max,77.0,200.0,564.0,202.0,6.2,2.0,4.0,1.0


In [None]:
# Summary (count, unique, top, freq) of the categorical (object-dtype) columns.
# The "object" dtype string replaces the deprecated `np.object` alias, which
# emits a DeprecationWarning since NumPy 1.20 and no longer exists in later
# NumPy releases.
df.describe(include=["object"])

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  df.describe(include=[np.object])


Unnamed: 0,sex,cp,fbs,restecg,exang,thal
count,303,303,303,303,303,303
unique,2,4,2,3,2,4
top,Male,typical angina,normal,S-T abnormanilty,no,reversible deffect
freq,207,143,258,152,204,166


### Splitting Numerical and categorical variables

In [None]:
# Partition the column names by dtype: object-dtype columns are treated as
# categorical, every other column as numerical.
non_object_frame = df.select_dtypes(exclude="object")
object_frame = df.select_dtypes(include="object")
numerical_column = list(non_object_frame.columns)
categorical_column = list(object_frame.columns)

print("Numerical Columns:", numerical_column)
print("****************")
print("Categorical Columns:", categorical_column)

Numerical Columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'target']
****************
Categorical Columns: ['sex', 'cp', 'fbs', 'restecg', 'exang', 'thal']


### Exploring the categorical columns

In [None]:
# Count / unique / most frequent value for each categorical column
df.loc[:, categorical_column].describe()

Unnamed: 0,sex,cp,fbs,restecg,exang,thal
count,303,303,303,303,303,303
unique,2,4,2,3,2,4
top,Male,typical angina,normal,S-T abnormanilty,no,reversible deffect
freq,207,143,258,152,204,166


### Splitting the columns for one hot encoding and label encoding 

In [None]:
# Categorical columns with more than 2 and at most 10 distinct values
# are one-hot encoded.
to_one_hot_encoding = [
    name for name in categorical_column
    if 2 < df[name].nunique() <= 10
]

# Every remaining categorical column is label encoded instead.
to_label_encoding = [
    name for name in categorical_column
    if name not in to_one_hot_encoding
]

print("To One Hot Encoding:", to_one_hot_encoding)
print("To Label Encoding:", to_label_encoding)

To One Hot Encoding: ['cp', 'restecg', 'thal']
To Label Encoding: ['sex', 'fbs', 'exang']


### Investigating the missing values 

In [None]:
# Number of missing values in each column
df.isna().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

Our data does not have any missing values

### One hot encoding and label encoding

In [None]:
# One-hot encode the selected columns with pandas' built-in get_dummies():
# each distinct value becomes its own indicator column named "<column>_<value>".
columns_to_expand = df.loc[:, to_one_hot_encoding]
one_hot_encoded_columns = pd.get_dummies(columns_to_expand)
one_hot_encoded_columns

Unnamed: 0,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,cp_typical angina,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect
0,1,0,0,0,0,0,1,0,1,0,0
1,0,0,1,0,0,1,0,0,0,0,1
2,0,1,0,0,0,0,1,0,0,0,1
3,0,1,0,0,0,1,0,0,0,0,1
4,0,0,0,1,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
298,0,0,0,1,0,1,0,1,0,0,0
299,1,0,0,0,0,1,0,1,0,0,0
300,0,0,0,1,0,1,0,1,0,0,0
301,0,0,0,1,0,1,0,1,0,0,0


In [None]:
# Label Encoding
#
# Each column listed in `to_label_encoding` is replaced by integer codes.
# Changes compared with building one DataFrame per column and concatenating:
#   * the result reuses df.index, so the later column-wise concat with the
#     other feature frames stays row-aligned even if df does not carry a
#     default 0..n-1 index;
#   * `label_encoded_columns` is only ever a DataFrame (it is no longer a
#     list first and a DataFrame afterwards);
#   * the fitted encoders are kept in `label_encoders`, so codes can be mapped
#     back to the original labels with
#     label_encoders[col].inverse_transform(...).

label_encoders = {}
encoded_values = {}
for col in to_label_encoding:
    # A separate encoder per column: every column has its own set of labels
    le = LabelEncoder()
    encoded_values[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Assemble all encoded columns at once, keyed by their original column names
label_encoded_columns = pd.DataFrame(encoded_values, index=df.index)
label_encoded_columns

Unnamed: 0,sex,fbs,exang
0,1,0,0
1,1,1,0
2,0,1,0
3,1,1,0
4,0,1,1
...,...,...,...
298,0,1,1
299,1,1,0
300,1,0,0
301,1,1,1


### Bring the data together

In [None]:
# Build the feature frame: start from the original data without its raw
# categorical columns (drop() returns a new frame, df itself is untouched),
# then append the one-hot encoded and label encoded versions column-wise.
non_categorical_part = df.drop(columns=categorical_column)
X = pd.concat(
    [non_categorical_part, one_hot_encoded_columns, label_encoded_columns],
    axis=1,
)
print("All columns:", X.columns.tolist())
X

All columns: ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca', 'target', 'cp_asymptomatic', 'cp_atypical angina', 'cp_non anginal pain', 'cp_typical angina', 'restecg_2', 'restecg_S-T abnormanilty', 'restecg_normal', 'thal_deffect-3', 'thal_fixed deffect', 'thal_normal', 'thal_reversible deffect', 'sex', 'fbs', 'exang']


Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,target,cp_asymptomatic,cp_atypical angina,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,1,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,1,0,0,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,1,0,1,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,1,0,1,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,1,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,0,1,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


### Splitting the data to train and test sets

In [None]:
# Define y (this is the value we will predict)
y = df["target"]

# Remove the label column "target" from the features so the models cannot
# see the answer. errors="ignore" keeps this cell idempotent: re-running it
# after "target" has already been removed no longer raises a KeyError.
X = X.drop(columns=["target"], errors="ignore")
X

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,slope,ca,cp_asymptomatic,cp_atypical angina,cp_non anginal pain,...,restecg_2,restecg_S-T abnormanilty,restecg_normal,thal_deffect-3,thal_fixed deffect,thal_normal,thal_reversible deffect,sex,fbs,exang
0,63,145,233,150,2.3,0,0,1,0,0,...,0,0,1,0,1,0,0,1,0,0
1,37,130,250,187,3.5,0,0,0,0,1,...,0,1,0,0,0,0,1,1,1,0
2,41,130,204,172,1.4,2,0,0,1,0,...,0,0,1,0,0,0,1,0,1,0
3,56,120,236,178,0.8,2,0,0,1,0,...,0,1,0,0,0,0,1,1,1,0
4,57,120,354,163,0.6,2,0,0,0,0,...,0,1,0,0,0,0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,140,241,123,0.2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,1
299,45,110,264,132,1.2,1,0,1,0,0,...,0,1,0,1,0,0,0,1,1,0
300,68,144,193,141,3.4,1,2,0,0,0,...,0,1,0,1,0,0,0,1,0,0
301,57,130,131,115,1.2,1,1,0,0,0,...,0,1,0,1,0,0,0,1,1,1


Now it is time to split the data into train and test sets.

In [None]:
# Hold out 33% of the rows as the test set (adjust test_size as needed).
# random_state pins the shuffle so that every "Restart & Run All" yields the
# same split, which makes the reported scores reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Training the models

* Random Forest (It's my favorite)
* Decision Tree
* Logistic Regression Classifier
* Bernoulli Naive Bayes
* Gaussian Naive Bayes
* KNN (K-Nearest Neighbors)
* XGBoost (It's new and has accurate predictions)

### Random Forest

In [None]:
# Define Random Forest Model
# random_state fixes the bootstrap sampling and feature sub-sampling, so the
# fitted forest (and its score) is the same on every run.
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# We fit our model with our train data
rf.fit(X_train, y_train)

# Then predict results from X_test data
pred_rf = rf.predict(X_test)

# See the first 10 predictions and their actual values
print("Predicted:", pred_rf[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 1 1 0 0 0 1 1 0 0]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


### Decision Tree

In [None]:
# Define Decision Tree Model
# random_state makes the tree reproducible: without it, ties between equally
# good splits are broken randomly and the result can differ between runs.
dt = DecisionTreeClassifier(random_state=42)
# We fit our model with our train data
dt.fit(X_train, y_train)
# Then predict results from X_test data
pred_dt = dt.predict(X_test)

# See the first 10 predictions and their actual values
print("Predicted:", pred_dt[0:10])
print("Actual:", y_test[0:10])

Predicted: [0 1 1 0 0 0 1 1 0 0]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


### Logistic Regression

In [None]:
# Define Logistic Regression Model
# max_iter is raised above the library default: with the default limit the
# solver stopped at the iteration cap on these unscaled features and emitted
# a ConvergenceWarning, i.e. the fitted coefficients were not a converged
# solution. (Scaling the features is the alternative remedy suggested by the
# warning itself.)
log = LogisticRegression(max_iter=1000)
# We fit our model with our train data
log.fit(X_train, y_train)
# Then predict results from X_test data
pred_log = log.predict(X_test)

# See the first 10 predictions and their actual values
print("Predicted:", pred_log[0:10])
print("Actual:", y_test[0:10])

Predicted: [1 1 1 0 0 1 1 1 0 0]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Bernoulli Naive Bayes

In [None]:
# Bernoulli Naive Bayes: build the model and fit it on the training split
# (fit() returns the fitted estimator, so both steps can be chained).
bnb = BernoulliNB().fit(X_train, y_train)
# Predict the class of every row in the test split
pred_bnb = bnb.predict(X_test)

# Compare the first ten predictions with the true labels
print("Predicted:", pred_bnb[:10])
print("Actual:", y_test.head(10))

Predicted: [1 1 1 0 0 1 1 1 0 0]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


### Gaussian Naive Bayes

In [None]:
# Gaussian Naive Bayes: build the model and fit it on the training split
# (fit() returns the fitted estimator, so both steps can be chained).
gnb = GaussianNB().fit(X_train, y_train)
# Predict the class of every row in the test split
pred_gnb = gnb.predict(X_test)

# Compare the first ten predictions with the true labels
print("Predicted:", pred_gnb[:10])
print("Actual:", y_test.head(10))

Predicted: [1 1 1 1 0 1 1 1 0 1]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


### KNN - K-Nearest Neighbours

In [None]:
# K-Nearest Neighbours with k=3 and the Minkowski distance: build the model
# and fit it on the training split (fit() returns the fitted estimator).
knn = KNeighborsClassifier(n_neighbors=3, metric="minkowski").fit(X_train, y_train)
# Predict the class of every row in the test split
pred_knn = knn.predict(X_test)

# Compare the first ten predictions with the true labels
print("Predicted:", pred_knn[:10])
print("Actual:", y_test.head(10))

Predicted: [1 0 1 1 1 1 1 1 0 0]
Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


### XGBoost

In [None]:
# Define XGBoost Model
xgb = XGBClassifier(n_estimators=1000, learning_rate=0.05)

# Early stopping needs a held-out set to monitor. Using the TEST set for that
# leaks test information into training (the number of boosting rounds gets
# tuned on the very rows we later score on), which makes the reported test
# metrics optimistic. So we carve a validation set out of the training data
# instead and keep the test set untouched until prediction time.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

# We fit our model with our train data
# NOTE(review): recent XGBoost releases expect `early_stopping_rounds` as a
# constructor argument of XGBClassifier rather than a fit() argument; move it
# there if fit() rejects it on your installed version.
xgb.fit(
    X_fit, y_fit,
    # If the validation score does not improve for 5 consecutive rounds,
    # stop adding trees: saves time and avoids over-training the model.
    early_stopping_rounds=5,
    # Validation data (taken from the training split) used to monitor performance
    eval_set=[(X_val, y_val)],
    verbose=False
)
# Then predict results from X_test data
pred_xgb = xgb.predict(X_test)

# See the first 10 predictions and their actual values
print("Predicted:", pred_xgb[0:10])
print("Actual:", y_test[0:10])



Actual: 271    0
177    0
75     1
205    0
284    0
59     1
182    0
160    1
269    0
240    0
Name: target, dtype: int64


## Compare the models performances

### Confusion Matrices

In [None]:
# Confusion matrices for every trained model.
# confusion_matrix takes the actual values first and the predicted values second;
# rows of the result are the actual classes, columns the predicted classes.
cm_rf, cm_dt, cm_log, cm_bnb, cm_gnb, cm_knn, cm_xgb = (
    confusion_matrix(y_test, predicted)
    for predicted in (pred_rf, pred_dt, pred_log, pred_bnb, pred_gnb, pred_knn, pred_xgb)
)

# Display label -> matrix, in the order the models were trained
labelled_matrices = [
    ("Random Forest", cm_rf),
    ("Desicion Tree", cm_dt),
    ("Logistic Regression", cm_log),
    ("Bernouilli Naive Bias", cm_bnb),
    ("Gaussian Naive Bias", cm_gnb),
    ("KNN (K-Nearest Neighbors)", cm_knn),
    ("XGBoost", cm_xgb),
]

banner = "***********************"
print(banner)
print("Confusion Matrixes")
print(banner)
for label, matrix in labelled_matrices:
    print(f"{label}:\n", matrix)

***********************
Confusion Matrixes
***********************
Random Forest:
 [[42  8]
 [ 9 41]]
Desicion Tree:
 [[37 13]
 [10 40]]
Logistic Regression:
 [[41  9]
 [ 5 45]]
Bernouilli Naive Bias:
 [[39 11]
 [ 8 42]]
Gaussian Naive Bias:
 [[20 30]
 [ 4 46]]
KNN (K-Nearest Neighbors):
 [[20 30]
 [10 40]]
XGBoost:
 [[40 10]
 [ 8 42]]


### Accuracy scores

In [None]:
# Accuracy (share of correctly classified test rows) for every trained model.
# accuracy_score takes the actual values first and the predicted values second.
acc_rf, acc_dt, acc_log, acc_bnb, acc_gnb, acc_knn, acc_xgb = (
    accuracy_score(y_test, predicted)
    for predicted in (pred_rf, pred_dt, pred_log, pred_bnb, pred_gnb, pred_knn, pred_xgb)
)

# Display label -> score, in the order the models were trained
labelled_scores = [
    ("Random Forest", acc_rf),
    ("Desicion Tree", acc_dt),
    ("Logistic Regression", acc_log),
    ("Bernouilli Naive Bias", acc_bnb),
    ("Gaussian Naive Bias", acc_gnb),
    ("KNN (K-Nearest Neighbors)", acc_knn),
    ("XGBoost", acc_xgb),
]

banner = "***********************"
print(banner)
print("Accuracy Scores")
print(banner)
for label, score in labelled_scores:
    print(f"{label}:", score)

***********************
Accuracy Scores
***********************
Random Forest: 0.83
Desicion Tree: 0.77
Logistic Regression: 0.86
Bernouilli Naive Bias: 0.81
Gaussian Naive Bias: 0.66
KNN (K-Nearest Neighbors): 0.6
XGBoost: 0.82


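As you can see, in this run Logistic Regression (0.86), Random Forest (0.83), XGBoost (0.82) and Bernoulli Naive Bayes (0.81) give the best results, while Gaussian Naive Bayes (0.66) and KNN (0.60) lag well behind.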