In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df = pd.read_csv('heart.csv')

In [3]:
df
df.shape

(918, 12)

In [4]:
df.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


In [5]:
df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [6]:
X= df.drop('HeartDisease',axis=1)
y= df['HeartDisease']

In [7]:
cat_cols =[ c for c in X.select_dtypes(["category", "object"])]
cat_cols

['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']

In [8]:
num_cols= [ c for c in X.select_dtypes([int,float])]
num_cols

['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

In [9]:
left_inter=df[num_cols].mean() - (3 * df[num_cols].std())
# left_interval = df[num_cols].mean()
left_inter

Age             25.213044
RestingBP       76.854052
Cholesterol   -129.352869
FastingBS       -1.036021
MaxHR           60.428366
Oldpeak         -2.312347
dtype: float64

In [10]:
right_inter = df[num_cols].mean() + (3* df[num_cols].std())
right_inter

Age             81.808743
RestingBP      187.938977
Cholesterol    526.951998
FastingBS        1.502252
MaxHR          213.190371
Oldpeak          4.087074
dtype: float64

In [11]:
def compare(row):
    for c in num_cols:
        if row.loc[c] < left_inter.loc[c]:
           return False
        if row.loc[c]> right_inter.loc[c]:
            return False
        
    return True

In [12]:
new_df= df.apply(compare,axis= 'columns')
new_df= df[new_df]
new_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [16]:
X_new_df= new_df.drop('HeartDisease',axis=1)
y_new_df = new_df['HeartDisease']
X_new_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up
...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat


In [14]:
arr=[]
for c in cat_cols:
    uni=new_df.loc[:,c].unique()
    arr.append({c:uni})
arr    

[{'Sex': array(['M', 'F'], dtype=object)},
 {'ChestPainType': array(['ATA', 'NAP', 'ASY', 'TA'], dtype=object)},
 {'RestingECG': array(['Normal', 'ST', 'LVH'], dtype=object)},
 {'ExerciseAngina': array(['N', 'Y'], dtype=object)},
 {'ST_Slope': array(['Up', 'Flat', 'Down'], dtype=object)}]

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer


cat_pipeline = Pipeline([('encoder',OneHotEncoder())])
num_pipeline= Pipeline([('scaler',StandardScaler())])

preprocessor = ColumnTransformer([
    ('cat',cat_pipeline,cat_cols),
    ('num',num_pipeline,num_cols)
])
final_pipeline= Pipeline([('preprocessor',preprocessor)])
X_preprocessed= final_pipeline.fit_transform(X_new_df)
X_preprocessed

array([[ 0.        ,  1.        ,  0.        , ..., -0.5503622 ,
         1.38431998, -0.85546862],
       [ 1.        ,  0.        ,  0.        , ..., -0.5503622 ,
         0.7529728 ,  0.13751561],
       [ 0.        ,  1.        ,  0.        , ..., -0.5503622 ,
        -1.53566071, -0.85546862],
       ...,
       [ 0.        ,  1.        ,  1.        , ..., -0.5503622 ,
        -0.86485434,  0.33611246],
       [ 1.        ,  0.        ,  0.        , ..., -0.5503622 ,
         1.46323838, -0.85546862],
       [ 0.        ,  1.        ,  0.        , ..., -0.5503622 ,
         1.42377918, -0.85546862]])

In [32]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X_preprocessed,y_new_df,test_size=0.2)

In [20]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
scores = cross_val_score(SVC(),X_train,y_train,cv=5)
scores.mean()

0.8664238539238539

In [22]:
from sklearn.ensemble import BaggingClassifier
bag_model=BaggingClassifier(
    estimator= SVC(),
    n_estimators=100,
    # max_samples=0.8
)
scores = cross_val_score(bag_model,X_train,y_train,cv=5)
scores.mean()

0.8622668997668999

In [23]:
from sklearn.tree import DecisionTreeClassifier
scores = cross_val_score(DecisionTreeClassifier(),X_train,y_train,cv=5)
scores.mean()

0.8122377622377621

In [29]:
bag_tree_model = BaggingClassifier(
    estimator = DecisionTreeClassifier(),
    n_estimators=100,
    max_samples=0.8,
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_tree_model,X_train,y_train,cv=5)
scores.mean()


0.853923853923854

In [33]:
bag_tree_model.score(X_test,y_test)

NotFittedError: This BaggingClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [27]:
from sklearn.ensemble import RandomForestClassifier
scores = cross_val_score(RandomForestClassifier(),X_train,y_train,cv=5)
scores.mean()

0.8552836052836053