In [1]:
# Ensembles - These are meta-algorithms that combine several machine learning techniques to improve
#            prediction accuracy by decreasing variance (bagging), decreasing bias (boosting) or improving predictions (stacking)

# Variance - variability of the predictions
# Bias - Refers to how far the predictions are from the actual values

# Good Algorithm should have low bias and low variance.

# Sequential Ensembles - Where the base learners are generated sequentially (each base learner depends on the previous ones)
#                      - Here the weights associated with the mislabeled data are changed from one base learner to the next.
#             Ex: AdaBoost etc.

# Parallel Ensembles - Where the base learners are independent of each other and the error is reduced by averaging.
#             Ex: Bagged trees, RandomForest etc

# Bagging Ensembles: These are also called Bootstrap Aggregating ensembles. Here the variance is reduced by calculating
#                    the average of the estimates. It uses averaging for regression and voting for classification.
#             Ex: RandomForestClassifier, bagged DecisionTreeClassifier, ExtraTreesClassifier

# Boosting Ensembles: These are a collection of algorithms which convert weak learners into strong learners. This is done by
#                     increasing the weight given to the mislabeled data in each round and then combining the estimates
#                     by weighted averaging in case of regression and weighted voting in case of classification.

# Stacking Ensembles: Stacking combines multiple classifiers or regressors via a meta-classifier or meta-regressor; the
#                    outputs of the base models are fed as features into this meta-classifier or meta-regressor.

# Voting Ensembles: Builds multiple models, and simple statistics (such as the mean of their outputs, or a majority vote) are used to produce the final prediction


In [30]:
# Bagging Ensembles:
#       1. Bagging DecisionTrees
#       2. RandomForestClassifier
#       3. ExtraTreesClassifier

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn 
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# Location of the breast-cancer CSV.
# NOTE(review): absolute, machine-specific path -- point DATA_PATH at your own copy before running.
DATA_PATH = 'C:\\Users\\vardh\\Vardhan\\ED\\breast_cancer_data.csv'
df = pd.read_csv(DATA_PATH)

# Drop the first and last columns by label (presumably an id column and an empty
# trailing column -- TODO confirm against the CSV). Passing explicit labels replaces
# the original, which handed a whole DataFrame to `columns=`.
df = df.drop(columns=df.columns[[0, -1]])

# After the drop, column 0 holds the class label and the remaining columns are features.
X = df.iloc[:, 1:]
Y = df.iloc[:, 0]

# One-row frame (positional row 1) used as a prediction smoke test. It is taken from
# the training data, so it is not an out-of-sample check.
new_data = X.iloc[1:2, :]
print(new_data)

# 10 contiguous, unshuffled folds. `random_state` only has an effect together with
# shuffle=True, and newer scikit-learn releases raise a ValueError when it is set
# without shuffling, so it is omitted here; the fold assignment is unchanged.
kf = KFold(n_splits=10)

# 100 bagged decision trees. The base estimator is passed positionally because the
# keyword was renamed from `base_estimator` to `estimator` across scikit-learn
# releases; the positional form works under both names.
estimator = DecisionTreeClassifier()
model = BaggingClassifier(estimator, n_estimators=100, random_state=7)

# Fit on the full data set and classify the sample row.
fit = model.fit(X, Y)
predict = model.predict(new_data)
print(predict)

# Mean cross-validated accuracy over the folds in `kf`, as a percentage.
accuracy = cross_val_score(model, X, Y, cv=kf)
print(accuracy.mean()*100)

   radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
1        20.57         17.77           132.9     1326.0          0.08474   

   compactness_mean  concavity_mean  concave points_mean  symmetry_mean  \
1           0.07864          0.0869              0.07017         0.1812   

   fractal_dimension_mean  ...  radius_worst  texture_worst  perimeter_worst  \
1                 0.05667  ...         24.99          23.41            158.8   

   area_worst  smoothness_worst  compactness_worst  concavity_worst  \
1      1956.0            0.1238             0.1866           0.2416   

   concave points_worst  symmetry_worst  fractal_dimension_worst  
1                 0.186           0.275                  0.08902  

[1 rows x 30 columns]
['M']
96.49122807017542


In [51]:
#Random Forest & Extra Trees Classifier - These are extensions of bagged ensembles

from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
# Both forests use the same size (100 trees) and the same seed (7).
model1 = ExtraTreesClassifier(random_state=7, n_estimators=100)
model2 = RandomForestClassifier(random_state=7, n_estimators=100)

# Score each forest on the folds in `kf` (ExtraTrees first, then RandomForest)
# and report the mean accuracy as a percentage.
acc1, acc2 = (cross_val_score(clf, X, Y, cv=kf) for clf in (model1, model2))
print('Accuracy ETC: ', 100 * acc1.mean(), 'Accuracy RFC: ', 100 * acc2.mean())

Accuracy ETC:  96.8389724310777 Accuracy RFC:  95.96177944862156


In [65]:
#Boosting - An ensemble method for reducing bias - These are sequential algorithms
#     Ex - AdaBoost, Stochastic Gradient Boosting
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
# NOTE: XGBoost is not part of sklearn.ensemble; it comes from the separate xgboost package (e.g. `from xgboost import XGBClassifier`).
# AdaBoost runs 150 boosting rounds, gradient boosting 100; both are seeded with 7.
model3 = AdaBoostClassifier(random_state=7, n_estimators=150)
model4 = GradientBoostingClassifier(random_state=7, n_estimators=100)

# Score each booster on the folds in `kf` (AdaBoost first, then gradient boosting)
# and report the mean accuracy as a percentage.
acc3, acc4 = (cross_val_score(clf, X, Y, cv=kf) for clf in (model3, model4))
print('Accuracy Ada: ', 100 * acc3.mean(), 'Accuracy SGB: ', 100 * acc4.mean())

Accuracy Ada:  97.36528822055138 Accuracy SGB:  96.49122807017545


In [78]:
# Voting Ensemble - It generates the prediction by combining (voting on / averaging) the predictions of several different algorithms

import warnings
# NOTE(review): this suppresses every warning from here on (e.g. any convergence
# warnings raised while fitting the models below); narrow the filter if those matter.
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier

# Three different base classifiers whose predictions are combined by the ensemble.
lr = LogisticRegression()
# Seeded with 7 like the other models in this notebook: an unseeded tree can differ
# between runs, and VotingClassifier has no random_state of its own to pass down.
dtc = DecisionTreeClassifier(random_state=7)
nb = GaussianNB()

# (name, estimator) pairs in the form VotingClassifier expects.
estimators = [('lr', lr), ('dtc', dtc), ('nb', nb)]

# No `voting` argument is given, so the library default (hard / majority voting) applies.
model5 = VotingClassifier(estimators)

# Mean cross-validated accuracy over the folds in `kf`, as a percentage.
accuracy = cross_val_score(model5, X, Y, cv=kf)
print(accuracy.mean()*100)

95.43233082706767
