#### Import useful libraries  : 

In [1]:
import os
import numpy as np
import pandas as pd
import numpy as np, pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics , model_selection

## Import the Classifier.
from sklearn.naive_bayes import GaussianNB

#### Import the data : 

In [2]:
# Column labels applied to the CSV, in file order.
column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target',
]
# NOTE(review): passing `names` without `header` makes pandas treat the first
# line of the file as data -- confirm heart.csv really has no header row.
data = pd.read_csv('heart.csv', names=column_names)
# Preview the first five rows.
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


_______________________________________________________________________________________________________

### Data - Attribute information

###### age - age in years
###### sex - (1 = male; 0 = female)
###### cp - chest pain type ( On a scale of 0, 1 , 2 , 3 )
###### trestbps - resting blood pressure (in mm Hg on admission to the hospital)
###### chol - serum cholesterol in mg/dl
###### fbs - (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
###### restecg - resting electrocardiographic results
###### thalach - maximum heart rate achieved
###### exang - exercise induced angina (1 = yes; 0 = no)
###### oldpeak - ST depression induced by exercise relative to rest
###### slope - the slope of the peak exercise ST segment
###### ca - number of major vessels (0-3) colored by fluoroscopy
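###### thal - 3 = normal; 6 = fixed defect; 7 = reversible defect (codes as given in the original dataset description; the rows previewed above contain the values 1 and 2, so this file may use a different encoding)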
###### target - have disease or not (1=yes, 0=no)

_______________________________________________________________________________________________________

In [3]:
# Summarize the frame: number of rows, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


_______________________________________________________________________________________________________

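#### The target variable is stored in the `target` column { target - have disease or not (1=yes, 0=no) } of the data frame. 
#### The algorithm requires the class labels to be integer codes. According to `data.info()` above, the values are already integers (int64) rather than strings, so the conversion below is not strictly necessary here.
#### The cell below still re-encodes the column with the factorize method of the pandas library. Note that factorize assigns codes in order of first appearance, so the original label 1 becomes code 0 and the original label 0 becomes code 1 (see `class_names` printed below).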

In [4]:
# Replace the target column with integer codes and keep the original labels.
# factorize numbers the labels in order of first appearance; class_names[code]
# recovers the original value for a given code.
target_codes, class_names = pd.factorize(data['target'])
data['target'] = target_codes

In [5]:
# Original target values, positioned by the integer code factorize gave them.
print(class_names)
# Distinct codes now stored in the target column.
encoded_targets = data['target'].unique()
print(encoded_targets)

Int64Index([1, 0], dtype='int64')
[0 1]


In [6]:
# Re-encode every feature column as integer codes (0, 1, 2, ... in order of
# first appearance), overwriting the original values in `data`.
# NOTE(review): data.info() reports these columns as numeric already, and the
# codes discard the original magnitudes of age, trestbps, chol, thalach and
# oldpeak -- confirm this is intended before fitting GaussianNB on them.
feature_columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                   'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
for column in feature_columns:
    data[column], _ = pd.factorize(data[column])
data.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,1,1,1,1,1,0,1,0,0,1,0
2,2,1,2,1,2,1,0,2,0,2,1,0,1,0
3,3,0,2,2,3,1,1,3,0,3,1,0,1,0
4,4,1,3,2,4,1,1,4,1,4,1,0,1,0


In [7]:
# Re-inspect dtypes and non-null counts after the feature columns were re-encoded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype
---  ------    --------------  -----
 0   age       303 non-null    int64
 1   sex       303 non-null    int64
 2   cp        303 non-null    int64
 3   trestbps  303 non-null    int64
 4   chol      303 non-null    int64
 5   fbs       303 non-null    int64
 6   restecg   303 non-null    int64
 7   thalach   303 non-null    int64
 8   exang     303 non-null    int64
 9   oldpeak   303 non-null    int64
 10  slope     303 non-null    int64
 11  ca        303 non-null    int64
 12  thal      303 non-null    int64
 13  target    303 non-null    int64
dtypes: int64(14)
memory usage: 33.3 KB


_______________________________________________________________________________________________________

#### Prepare a Train Dataset and Test Dataset : 

In [8]:
# Features are every column except the last; the last column is the label.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [9]:
# Hold out 30% of the rows for testing and train on the remaining 70%;
# the fixed random_state makes the shuffle repeatable.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

#### Fit the model and Train it using the Gaussian Naive Bayes Classifier : 

In [10]:
# Build the Gaussian Naive Bayes classifier and train it on the training split.
# fit() returns the estimator itself, so the chained call binds the fitted model.
model = GaussianNB().fit(X_train, y_train)
# Show the fitted estimator as the cell output.
model

GaussianNB(priors=None, var_smoothing=1e-09)

_______________________________________________________________________________________________________

In [11]:
# Predict labels for the held-out test rows.
y_pred = model.predict(X_test)

# Element-wise comparison marks each test row whose prediction is wrong.
mismatch_mask = y_test != y_pred
count_misclassified = mismatch_mask.sum()
# Fraction of test rows predicted correctly.
accuracy = metrics.accuracy_score(y_test, y_pred)

# Report both figures.
print('Misclassified samples: {}'.format(count_misclassified))
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 11
Accuracy: 0.88


#### This model reaches an accuracy of 88% on the held-out 30% test split (11 test samples misclassified), which suggests that the two classes are reasonably well separated by these features in this small dataset.