# load data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import calendar

from pandas.api.types import CategoricalDtype
from sklearn.preprocessing import StandardScaler

In [2]:
train_data=r"C:\machinelearningprojects\heartdeseasepread\heart_disease_dataset.csv"
df_train = pd.read_csv(train_data)
print(df_train.shape)

(1000, 16)


In [3]:
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

# know abou data

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1000 non-null   int64 
 1   Gender                   1000 non-null   object
 2   Cholesterol              1000 non-null   int64 
 3   Blood Pressure           1000 non-null   int64 
 4   Heart Rate               1000 non-null   int64 
 5   Smoking                  1000 non-null   object
 6   Alcohol Intake           660 non-null    object
 7   Exercise Hours           1000 non-null   int64 
 8   Family History           1000 non-null   object
 9   Diabetes                 1000 non-null   object
 10  Obesity                  1000 non-null   object
 11  Stress Level             1000 non-null   int64 
 12  Blood Sugar              1000 non-null   int64 
 13  Exercise Induced Angina  1000 non-null   object
 14  Chest Pain Type          1000 non-null   

In [5]:
df_train.head(5)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,,6,No,Yes,No,2,183,Yes,Asymptomatic,0


In [6]:
df_msv=df_train.copy()

# changing Alcohol Intake of null values

In [7]:
df_msv["Alcohol Intake"].replace(np.nan,"NO",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_msv["Alcohol Intake"].replace(np.nan,"NO",inplace=True)


In [8]:
df_msv.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Age                      1000 non-null   int64 
 1   Gender                   1000 non-null   object
 2   Cholesterol              1000 non-null   int64 
 3   Blood Pressure           1000 non-null   int64 
 4   Heart Rate               1000 non-null   int64 
 5   Smoking                  1000 non-null   object
 6   Alcohol Intake           1000 non-null   object
 7   Exercise Hours           1000 non-null   int64 
 8   Family History           1000 non-null   object
 9   Diabetes                 1000 non-null   object
 10  Obesity                  1000 non-null   object
 11  Stress Level             1000 non-null   int64 
 12  Blood Sugar              1000 non-null   int64 
 13  Exercise Induced Angina  1000 non-null   object
 14  Chest Pain Type          1000 non-null   

In [9]:
df_msv.head(5)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,Current,Heavy,1,No,No,Yes,8,119,Yes,Atypical Angina,1
1,48,Male,204,165,62,Current,NO,5,No,No,No,9,70,Yes,Typical Angina,0
2,53,Male,234,91,67,Never,Heavy,3,Yes,No,Yes,5,196,Yes,Atypical Angina,1
3,69,Female,192,90,72,Current,NO,4,No,Yes,No,7,107,Yes,Non-anginal Pain,0
4,62,Female,172,163,93,Never,NO,6,No,Yes,No,2,183,Yes,Asymptomatic,0


# feature transformation

### Smoking

In [10]:
df_msv["Smoking"].unique()

array(['Current', 'Never', 'Former'], dtype=object)

In [11]:
df_msv["Smoking"]=df_msv["Smoking"].astype(CategoricalDtype(categories=[ 'Never', 'Former','Current'], ordered=True)).cat.codes

### Alcohol Intake

In [12]:
df_msv["Alcohol Intake"].unique()

array(['Heavy', 'NO', 'Moderate'], dtype=object)

In [13]:
df_msv["Alcohol Intake"]=df_msv["Alcohol Intake"].astype(CategoricalDtype(categories=['NO', 'Moderate','Heavy'], ordered=True)).cat.codes

### excercise hours

In [14]:
df_msv["Exercise Hours"].unique()

array([1, 5, 3, 4, 6, 0, 8, 9, 7, 2], dtype=int64)

In [15]:
df_msv["Exercise Hours"]=df_msv["Exercise Hours"].astype(CategoricalDtype(categories=[9,8,7,6,5,4,3,2,1,0], ordered=True)).cat.codes

### "Family History","Diabetes","Obesity","Exercise Induced Angina"

In [16]:
df_msv["Family History"].unique()

array(['No', 'Yes'], dtype=object)

In [17]:
fease=["Family History","Diabetes","Obesity","Exercise Induced Angina"]

In [18]:
for i in fease:
    df_msv[i]=df_msv[i].astype(CategoricalDtype(categories=['No', 'Yes'], ordered=True)).cat.codes

### Blood Sugar

In [19]:
df_msv["Blood Sugar"].unique()

array([119,  70, 196, 107, 183, 122, 120, 113, 114,  85, 129, 192, 163,
        97, 121, 150,  76, 157, 139, 104, 112, 170, 187, 151,  92,  95,
       176, 166,  73,  75,  99, 111, 174, 135,  93, 109,  87, 137, 162,
        78, 160, 154, 172,  96, 180, 124, 152, 141,  88, 156, 177, 134,
       158,  86,  82, 105, 155,  74, 143, 127, 145, 198, 159, 148, 171,
       110, 103, 115, 193, 132, 140,  72, 147, 161, 144, 116,  90,  91,
        79, 173,  77, 142,  89, 178, 175, 118, 108, 149, 126, 101, 128,
       167, 184, 179, 197, 186, 146,  98,  84, 123,  94,  71, 181, 189,
       195, 138, 185, 106, 194, 188, 164, 102, 168, 169, 117, 165, 153,
       191, 136,  80, 133,  83, 199, 182, 100,  81, 131, 130, 190, 125],
      dtype=int64)

In [20]:
def encode_blood_sugar(value):
    if value < 140:
        return 0
    else:
        return 1
df_msv['Blood Sugar'] = df_msv['Blood Sugar'].apply(lambda x: encode_blood_sugar(x))


In [21]:
df_msv.head(5)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,Female,228,119,66,2,2,8,0,0,1,8,0,1,Atypical Angina,1
1,48,Male,204,165,62,2,0,4,0,0,0,9,0,1,Typical Angina,0
2,53,Male,234,91,67,0,2,6,1,0,1,5,1,1,Atypical Angina,1
3,69,Female,192,90,72,2,0,5,0,1,0,7,0,1,Non-anginal Pain,0
4,62,Female,172,163,93,0,0,3,0,1,0,2,1,1,Asymptomatic,0


### chese pain 

In [22]:
df_msv["Chest Pain Type"].unique()

array(['Atypical Angina', 'Typical Angina', 'Non-anginal Pain',
       'Asymptomatic'], dtype=object)

In [23]:
df_msv["Chest Pain Type"]=df_msv["Chest Pain Type"].astype(CategoricalDtype(categories=['Asymptomatic','Non-anginal Pain','Atypical Angina','Typical Angina'], ordered=True)).cat.codes

In [24]:
df_msv["Gender"]=df_msv["Gender"].astype(CategoricalDtype(categories=["Male","Female"], ordered=True)).cat.codes

In [25]:
df_msv.head(5)

Unnamed: 0,Age,Gender,Cholesterol,Blood Pressure,Heart Rate,Smoking,Alcohol Intake,Exercise Hours,Family History,Diabetes,Obesity,Stress Level,Blood Sugar,Exercise Induced Angina,Chest Pain Type,Heart Disease
0,75,1,228,119,66,2,2,8,0,0,1,8,0,1,2,1
1,48,0,204,165,62,2,0,4,0,0,0,9,0,1,3,0
2,53,0,234,91,67,0,2,6,1,0,1,5,1,1,2,1
3,69,1,192,90,72,2,0,5,0,1,0,7,0,1,1,0
4,62,1,172,163,93,0,0,3,0,1,0,2,1,1,0,0


# model selection

In [36]:
X_train=df_msv[:850].drop("Heart Disease",axis=1)
y_train=df_msv["Heart Disease"][:850]
X_test=df_msv[850:].drop("Heart Disease",axis=1)
y_test=df_msv["Heart Disease"][850:]

In [37]:
X_train = X_train.astype(int)
X_test = X_test.astype(int)

In [38]:
sc=StandardScaler()
sc.fit(X_train)

In [39]:
X_train=sc.transform(X_train)
X_test=sc.transform(X_test)

## decision tree classifier

In [42]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)
print(dt.score(X_train, y_train), ": Training_Accuracy")
print(dt.score(X_test, y_test), ": testing accuracy")

1.0 : Training_Accuracy
1.0 : testing accuracy


## using svc classifier

In [43]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train), ": Training_Accuracy")
print(clf.score(X_test, y_test), ": testing accuracy")

0.9705882352941176 : Training_Accuracy
0.9133333333333333 : testing accuracy


## using sgd classifier

In [44]:
from sklearn.linear_model import SGDClassifier
sg = SGDClassifier(loss="hinge", penalty="l2", max_iter=80)
sg.fit(X_train, y_train)
print(sg.score(X_train, y_train), ": Training_Accuracy")
print(sg.score(X_test, y_test), ": testing accuracy")

0.8623529411764705 : Training_Accuracy
0.86 : testing accuracy


## using neighbour claaification

In [45]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X_train, y_train)
print(neigh.score(X_train, y_train), ": Training_Accuracy")
print(neigh.score(X_test, y_test), ": testing accuracy")

0.8988235294117647 : Training_Accuracy
0.8 : testing accuracy
