In [1]:
import numpy as np
import pandas as pd

# to standardize the data to a common range
from sklearn.preprocessing import StandardScaler

# to split the data into optimum training and test data set
from sklearn.model_selection import train_test_split

# importing Support Vector Machine Model
from sklearn import svm 

# Accuracy Score
from sklearn.metrics import accuracy_score

In [2]:
# Read the heart-disease CSV into a pandas DataFrame.
# The path is relative to the notebook's working directory.
HEART_CSV_PATH = '../input/heart-disease-uci/heart.csv'
heartDisDataSet = pd.read_csv(HEART_CSV_PATH)

In [3]:
# Dimensions of the dataset, displayed as (rows, columns).
n_rows, n_cols = heartDisDataSet.shape
(n_rows, n_cols)

In [4]:
# Summary statistics for every numeric column:
#   count - number of non-null values
#   mean  - arithmetic mean
#   std   - standard deviation
#   min / max - smallest / largest value
#   25% / 50% / 75% - quartiles; e.g. a 25% value of 47.5 for 'age' means
#                     a quarter of the people are younger than 47.5
summary_stats = heartDisDataSet.describe()
summary_stats

In [5]:
# Column names, their dtypes, and the count of non-null values per column.
heartDisDataSet.info()

In [6]:
# How many rows carry each target label
# (1 = heart disease, 0 = no heart disease).
target_counts = heartDisDataSet['target'].value_counts()
target_counts

In [7]:
# Per-class column means: one row for each distinct value of 'target'
# (0 and 1), which makes it easy to compare the two groups feature by feature.
by_target = heartDisDataSet.groupby('target')
by_target.mean()

In [8]:
# Separate the features from the label:
#   X - every column except 'target'
#   Y - the 'target' column itself
# Passing columns= already tells drop() to remove a column, so no axis argument is needed.
target_column = 'target'
X = heartDisDataSet.drop(columns=target_column)
Y = heartDisDataSet[target_column]

In [9]:
# Preview the first feature rows using the notebook's rich table display
# instead of printing the whole DataFrame as plain text.
X.head()

In [10]:
# Preview the first target labels instead of printing the whole Series.
Y.head()

In [11]:
# The feature columns span very different numeric ranges, so they are
# standardized to a common scale before modelling.
# StandardScaler rescales each column to zero mean and unit variance
# (it does not squeeze the values into the [0, 1] range).
# NOTE(review): this scaler sees every row of the dataset; a scaler used for
# model evaluation should be fitted on the training rows only.
scaler = StandardScaler()

# fit_transform learns each column's mean/std and applies the scaling in one call.
stdData = scaler.fit_transform(X)
print(stdData)

In [12]:
# From here on, X refers to the standardized feature values (stdData)
# rather than the raw feature columns.
X = stdData

In [13]:
# Split BEFORE scaling so that the test rows never influence the scaler's
# mean/std. Fitting the scaler on all rows and splitting afterwards leaks
# test-set statistics into the training features.
#   test_size=0.3   -> 30 % of the rows are held out for testing
#   stratify=Y      -> both splits keep the same proportion of 0/1 targets;
#                      without it one class could be over-represented in a split
#   random_state=2  -> fixed seed: the same value always yields the same split
raw_features = heartDisDataSet.drop(columns='target')
X_train_raw, X_test_raw, Y_train, Y_test = train_test_split(
    raw_features, Y, test_size=0.3, stratify=Y, random_state=2
)

# Learn the scaling parameters from the training rows only, then apply that
# same transformation to both splits. `scaler` is rebound to the train-fitted
# instance so any later scaling of new inputs matches what the model saw.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [14]:
# Shapes of the full feature matrix, the training split and the test split.
split_shapes = [features.shape for features in (X, X_train, X_test)]
print(*split_shapes)

In [15]:
# Create the model: a Support Vector Classifier (SVC) from sklearn's svm
# module, using a linear kernel so the decision boundary is linear.
classifier = svm.SVC(kernel='linear')

In [16]:
# Fit the support vector classifier on the training features and labels.
classifier.fit(X=X_train, y=Y_train)

In [17]:
# Model evaluation: accuracy on the held-out test data.
# Original author's rule of thumb: above 75 % is good for a dataset this
# small, and the model could still be tuned for higher accuracy.

# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
Y_test_pred = classifier.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, Y_test_pred)
print('accuracy score on test data: ', test_data_accuracy*100)

In [18]:
# Accuracy on the training data, for comparison with the test accuracy.
# accuracy_score expects (y_true, y_pred): true labels first, predictions second.
Y_train_pred = classifier.predict(X_train)
train_data_accuracy = accuracy_score(Y_train, Y_train_pred)
print('accuracy score on train data: ', train_data_accuracy*100)

In [19]:
# In the recorded run the accuracy score was about 86 on the training data and about 84 on the test data.
# That is a good sign: the two scores are close, so the model is not overtrained.
# Overfitting is when a model is overtrained on the training data and therefore scores low on the test data.