<a href="https://colab.research.google.com/github/swethag04/ml-projects/blob/main/ensemble/ensemble_learning_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Ensemble Learning
Ensemble learning combines the predictions of multiple models to improve predictive performance. Bagging and Boosting are two popular ensemble techniques.

**Bagging** involves fitting a number of models (typically decision trees) on different bootstrap samples of the same dataset and combining their predictions by averaging or voting.

**Boosting** involves adding ensemble members sequentially to correct the predictions made by previous models and producing a weighted average of predictions.

### Basic aggregation of models

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

The dataset is from Kaggle and contains measurements from fetal cardiotocogram (CTG) exams, each of which was classified into one of three categories:


*   Normal
*   Suspect
* Pathological




In [3]:
# Load the fetal cardiotocogram data from the zipped CSV into a DataFrame.
fetal_zip_path = 'sample_data/fetal.zip'
df = pd.read_csv(fetal_zip_path, compression='zip')

In [4]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency,fetal_health
0,120.0,0.0,0.0,0.0,0.0,0.0,0.0,73.0,0.5,43.0,...,62.0,126.0,2.0,0.0,120.0,137.0,121.0,73.0,1.0,2.0
1,132.0,0.006,0.0,0.006,0.003,0.0,0.0,17.0,2.1,0.0,...,68.0,198.0,6.0,1.0,141.0,136.0,140.0,12.0,0.0,1.0
2,133.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.1,0.0,...,68.0,198.0,5.0,1.0,141.0,135.0,138.0,13.0,0.0,1.0
3,134.0,0.003,0.0,0.008,0.003,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,11.0,0.0,137.0,134.0,137.0,13.0,1.0,1.0
4,132.0,0.007,0.0,0.008,0.0,0.0,0.0,16.0,2.4,0.0,...,53.0,170.0,9.0,0.0,137.0,136.0,138.0,11.0,1.0,1.0


In [5]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2126 entries, 0 to 2125
Data columns (total 22 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   baseline value                                          2126 non-null   float64
 1   accelerations                                           2126 non-null   float64
 2   fetal_movement                                          2126 non-null   float64
 3   uterine_contractions                                    2126 non-null   float64
 4   light_decelerations                                     2126 non-null   float64
 5   severe_decelerations                                    2126 non-null   float64
 6   prolongued_decelerations                                2126 non-null   float64
 7   abnormal_short_term_variability                         2126 non-null   float64
 8   mean_value_of_short_term_variability  

In [6]:
# Count the rows in each target class to see how balanced the labels are.
target_column = 'fetal_health'
df[target_column].value_counts()

1.0    1655
2.0     295
3.0     176
Name: fetal_health, dtype: int64

In [7]:
# Separate the label column (y) from the predictor columns (X).
y = df['fetal_health']
X = df.drop(columns='fetal_health')

In [9]:
# Standardise the features with a StandardScaler fit on all rows.
# Note that X is rebound to the scaled result, replacing the original frame;
# the cells below rely on X holding the scaled values.
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [10]:
# Model predictions
# Fit each base classifier on the full (scaled) dataset and store its
# predictions under a readable name. Each name sits next to its estimator in a
# single mapping, so labels cannot drift out of sync with the models if the
# list is reordered or extended.
# NOTE: predictions are made on the same rows the models were fit on, so any
# score derived from `results` is a training score, not a held-out estimate.
named_models = {
    'logistic': LogisticRegression(),
    'knn': KNeighborsClassifier(),
    'svc': SVC(),
}
models = list(named_models.values())  # kept so `models` is still available as a list

# scikit-learn's fit() returns the fitted estimator, so fit and predict chain.
results = {
    name: model.fit(X, y).predict(X)
    for name, model in named_models.items()
}
results


{'logistic': array([2., 1., 1., ..., 2., 2., 1.]),
 'knn': array([2., 1., 1., ..., 2., 2., 1.]),
 'svc': array([2., 1., 1., ..., 2., 2., 1.])}

In [11]:
# Majority vote
# Put the three models' predictions side by side, then take the row-wise mode
# as the ensemble's label. If several labels tie in a row, `mode` returns more
# than one column and only the first is used.
base_predictions = pd.DataFrame(results)
row_modes = base_predictions.mode(axis=1)
prediction_df = base_predictions.assign(ensemble_prediction=row_modes.iloc[:, 0])
prediction_df.head()

Unnamed: 0,logistic,knn,svc,ensemble_prediction
0,2.0,2.0,2.0,2.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0


In [12]:
# Accuracy
# Score every column of prediction_df against the true labels. The list follows
# the column order: logistic, knn, svc, ensemble_prediction.
# NOTE: the predictions were made on the rows the models were trained on, so
# these are training accuracies and will overstate performance on unseen data.
# (`accuracy_score` is imported with the other dependencies at the top.)
accuracies = [accuracy_score(y, prediction_df[col]) for col in prediction_df.columns]
accuracies

[0.9045155221072436, 0.9374412041392286, 0.929444967074318, 0.9270931326434619]

In [15]:
# Using the voting classifier
# Same three base models as above, combined by VotingClassifier with hard
# (label-based) voting, then scored on the data it was fit on.
hard_estimators = [
    ('svc', SVC()),
    ('lgr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
]
voter = VotingClassifier(estimators=hard_estimators, voting='hard')
vote_accuracy = voter.fit(X, y).score(X, y)
vote_accuracy

0.9270931326434619

In [18]:
# Voting based on probabilities
# Soft voting combines the models' predicted class probabilities, so SVC is
# built with probability=True to make probability estimates available.
# Those estimates come from an internal cross-validated calibration that draws
# random numbers; pinning random_state keeps this cell's score reproducible
# across kernel restarts (the value may differ slightly from unseeded runs).
SOFT_VOTER_SEED = 42
soft_voter = VotingClassifier(
    estimators = [('svc', SVC(probability=True, random_state=SOFT_VOTER_SEED)),
                  ('lgr', LogisticRegression()),
                   ('knn', KNeighborsClassifier())],
    voting = 'soft')
soft_voter.fit(X,y)
# Scored on the same rows used for fitting, i.e. a training accuracy.
soft_accuracy = soft_voter.score(X,y)
soft_accuracy

0.9379115710253998

In [19]:
# Using different weights
# Weighted voting over the same three models. No `voting` argument was given
# here before, which selects VotingClassifier's default of 'hard'; it is now
# spelled out so the reader does not mistake this for weighted soft voting.
# NOTE(review): with these weights logistic regression alone (0.5) can tie
# SVC + KNN together (0.25 + 0.25) - confirm the tie-breaking is acceptable.
vote_weights = [0.25, 0.5, 0.25]  # same order as the estimators: svc, lgr, knn
weighted_estimators = [
    ('svc', SVC()),
    ('lgr', LogisticRegression()),
    ('knn', KNeighborsClassifier()),
]
weighted_voter = VotingClassifier(
    estimators=weighted_estimators,
    voting='hard',
    weights=vote_weights,
)
weighted_score = weighted_voter.fit(X, y).score(X, y)
weighted_score

0.9214487300094073