In [1]:
import pandas as pd
import numpy as np
from scipy import stats

In [28]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [17]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Bagging

This is an exercise completed using the following link:https://github.com/codebasics/py/blob/master/ML/19_Bagging/bagging_exercise.md

# Load heart disease dataset in pandas dataframe

In [3]:
df_heart = pd.read_csv(r'heart.csv')

In [4]:
df_heart.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [5]:
df_heart.describe()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease
count,918.0,918.0,918.0,918.0,918.0,918.0,918.0
mean,53.510893,132.396514,198.799564,0.233115,136.809368,0.887364,0.553377
std,9.432617,18.514154,109.384145,0.423046,25.460334,1.06657,0.497414
min,28.0,0.0,0.0,0.0,60.0,-2.6,0.0
25%,47.0,120.0,173.25,0.0,120.0,0.0,0.0
50%,54.0,130.0,223.0,0.0,138.0,0.6,1.0
75%,60.0,140.0,267.0,0.0,156.0,1.5,1.0
max,77.0,200.0,603.0,1.0,202.0,6.2,1.0


# Remove outliers using Z score. Usual guideline is to remove anything that has Z score > 3 formula or Z score < -3

In [6]:
cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'Oldpeak']

In [7]:
def zscore_outliers(df, col, score):
    upper = df[col].mean() + score*df[col].std()

    lower = df[col].mean() - score*df[col].std()
    
    df_filtered = df[(df[col]<=upper)&(df[col]>=lower)]
    
    return df_filtered

In [8]:
score = 3

counter = 0

for i in cols:
    if counter == 0:
        df_data = zscore_outliers(df_heart, i, score)
        counter+=1
    else:
        df_data = zscore_outliers(df_data, i, score)
        counter+=1

In [9]:
df_data.shape

(900, 12)

In [10]:
df_data.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


# Convert text columns to numbers using label encoding and one hot encoding

In [11]:
df4 = df_data.copy()
df4.ExerciseAngina.replace(
    {
        'N': 0,
        'Y': 1
    },
    inplace=True)

df4.ST_Slope.replace(
    {
        'Down': 1,
        'Flat': 2,
        'Up': 3
    },
    inplace=True
)

df4.RestingECG.replace(
    {
        'Normal': 1,
        'ST': 2,
        'LVH': 3
    },
    inplace=True)

df4.head()

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,1,172,0,0.0,3,0
1,49,F,NAP,160,180,0,1,156,0,1.0,2,1
2,37,M,ATA,130,283,0,2,98,0,0.0,3,0
3,48,F,ASY,138,214,0,1,108,1,1.5,2,1
4,54,M,NAP,150,195,0,1,122,0,0.0,3,0


In [12]:
df_data_dummies = pd.get_dummies(df4, drop_first=True)

In [13]:
X = df_data_dummies.drop("HeartDisease",axis='columns')
y = df_data_dummies.HeartDisease

X.head()

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,Sex_M,ChestPainType_ATA,ChestPainType_NAP,ChestPainType_TA
0,40,140,289,0,1,172,0,0.0,3,1,1,0,0
1,49,160,180,0,1,156,0,1.0,2,0,0,1,0
2,37,130,283,0,2,98,0,0.0,3,1,1,0,0
3,48,138,214,0,1,108,1,1.5,2,0,0,0,0
4,54,150,195,0,1,122,0,0.0,3,1,0,1,0


# Apply scaling

In [14]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build a classification model

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=30)

## Using SVM

In [29]:
scores = cross_val_score(svm.SVC(C=1, kernel='rbf'), X, y, cv=5)
scores.mean()

0.69

## Bagging model

In [30]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=svm.SVC(C=1, kernel='rbf'),
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)

scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.6822222222222222

# Build a classification model

## Using decision tree classifier

In [32]:
scores = cross_val_score(DecisionTreeClassifier(), X, y, cv=5)
scores.mean()

0.7144444444444444

## Bagging model

In [33]:
bag_model = BaggingClassifier(
    base_estimator=DecisionTreeClassifier(),
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)
scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.8011111111111111

# Build a classification model using various methods

In [22]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto',max_iter=-1),
        'params' : {
            'C': [1,10,20],
            'kernel': ['linear', 'rbf',],
            #'shrinking':['True', 'False']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10,100],
            'max_depth': [1,2,3,4,5,6],
            'min_samples_leaf': [100,200,300,500,1000]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(multi_class='auto'),
        'params': {
            'C': [1,5,10],
            'solver':['newton-cg', 'liblinear']
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'kneighbours': {
        'model': KNeighborsClassifier(n_jobs=-1),
        "params": {
            
            'n_neighbors' : [5, 10 , 100],
        }
    },
    'Xgboost': {
        'model': XGBClassifier(n_jobs=-1),
        "params": {
            'n_estimators': [1,5,10,100],
            'max_depth': [1,2,3,4,5,6],
           
        }
    },
    'LightGBM': {
        'model': LGBMClassifier(n_jobs=-1),
        "params": {
            'n_estimators': [1,5,10,100],
            'max_depth': [1,2,3,4,5,6],
            'num_leaves': [100,200,300,500,1000]
        }
    },

}

In [None]:
from sklearn.model_selection import GridSearchCV
scores = []

for model_name, mp in model_params.items():
    print(model_name)
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(X_scaled, y)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_.round(4),
        'best_params': clf.best_params_
    })

In [24]:
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df.sort_values('best_score', ascending=False)

Unnamed: 0,model,best_score,best_params
4,kneighbours,0.8311,{'n_neighbors': 100}
3,naive_bayes_gaussian,0.8278,{}
6,LightGBM,0.8278,"{'max_depth': 2, 'n_estimators': 100, 'num_lea..."
5,Xgboost,0.8256,"{'max_depth': 3, 'n_estimators': 10}"
1,random_forest,0.82,"{'max_depth': 2, 'min_samples_leaf': 100, 'n_e..."
0,svm,0.8156,"{'C': 1, 'kernel': 'rbf'}"
2,logistic_regression,0.8067,"{'C': 1, 'solver': 'newton-cg'}"


## Best Model

In [34]:
model = KNeighborsClassifier(n_jobs=-1, n_neighbors=100)
scores = cross_val_score(model, X, y, cv=5)
scores.mean()

0.6699999999999999

## Bagging model

In [35]:
from sklearn.ensemble import BaggingClassifier

bag_model = BaggingClassifier(
    base_estimator=KNeighborsClassifier(n_jobs=-1, n_neighbors=100), 
    n_estimators=100, 
    max_samples=0.8, 
    oob_score=True,
    random_state=0
)

scores = cross_val_score(bag_model, X, y, cv=5)
scores.mean()

0.6688888888888889

# Comparing performance of svm and decision tree classifier figure out where it makes most sense to use bagging and why.

Use internet to figure out in what conditions bagging works the best.

From: https://analyticsindiamag.com/primer-ensemble-learning-bagging-boosting/

Bagging is used when the goal is to reduce the variance of a decision tree classifier. Here the objective is to create several subsets of data from training sample chosen randomly with replacement. Each collection of subset data is used to train their decision trees. 