### Trying out AdaBoost classifier
https://www.youtube.com/watch?v=ix6IvwbVpw0

- When random subsets of the dataset are drawn as random subsets of the samples, then this algorithm is known as Pasting.
- When samples are drawn with replacement, then the method is known as Bagging
- When random subsets of the dataset are drawn as random subsets of the features, then the method is known as Random Subspaces
- Finally, when base estimators are built on subsets of both samples and features, then the method is known as Random Patches

In [1]:
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
from sklearn.ensemble import AdaBoostClassifier

iris = load_iris()
clf = AdaBoostClassifier(n_estimators=100)
scores = cross_val_score(clf, iris.data, iris.target)
scores.mean()

0.95996732026143794

### Random forest

In [2]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
scores = cross_val_score(clf, iris.data, iris.target)
scores.mean()

0.95343137254901966

### Voting

In [32]:
from sklearn import datasets
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

iris = datasets.load_iris()
X, y = iris.data[:, 1:3], iris.target

clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()

eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='hard')

for clf, label in zip([clf1, clf2, clf3, eclf], ['Logistic Regression', 'Random Forest', 'naive Bayes', 'Ensemble']):
    scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

Accuracy: 0.90 (+/- 0.05) [Logistic Regression]
Accuracy: 0.93 (+/- 0.05) [Random Forest]
Accuracy: 0.91 (+/- 0.04) [naive Bayes]
Accuracy: 0.95 (+/- 0.05) [Ensemble]


### Testing the Breast cancer dataset

In [22]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB

dataset = pd.read_excel('./BreastTissue.xls','Data')
dataset.drop(dataset.columns[0],inplace=True,axis=1)

data = dataset.iloc[:, 1:]
target = dataset.iloc[:, :1]
normalized_data = (data-data.min())/(data.max()-data.min())
from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(normalized_data, target, test_size=0.05, random_state=42)

clf = tree.DecisionTreeClassifier()
clf.fit(x_train,y_train)

from sklearn.metrics import accuracy_score, classification_report
y_predicted = clf.predict(x_test)
print "Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100)
print "Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=dataset.Class.unique()))

Accuracy = 83.3333333333 %
Classification Report 
              precision    recall  f1-score   support

        car       1.00      1.00      1.00         2
        fad       0.00      0.00      0.00         0
        mas       0.00      0.00      0.00         0
        gla       0.00      0.00      0.00         1
        con       1.00      1.00      1.00         1
        adi       1.00      1.00      1.00         2

avg / total       0.83      0.83      0.83         6



In [23]:
import numpy as np
import pandas as pd
from sklearn import svm
from sklearn import tree
from sklearn.naive_bayes import MultinomialNB

dataset = pd.read_excel('./BreastTissue.xls','Data')
dataset.drop(dataset.columns[0],inplace=True,axis=1)

data = dataset.iloc[:, 1:]
target = dataset.iloc[:, :1]
normalized_data = (data-data.min())/(data.max()-data.min())

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(normalized_data, target, test_size=0.05, random_state=42)

clf = MultinomialNB()
clf.fit(x_train,y_train)

from sklearn.metrics import accuracy_score, classification_report
y_predicted = clf.predict(x_test)
print "Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100)
print "Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=dataset.Class.unique()))

Accuracy = 66.6666666667 %
Classification Report 
              precision    recall  f1-score   support

        car       1.00      1.00      1.00         2
        fad       0.00      0.00      0.00         0
        mas       0.00      0.00      0.00         0
        gla       0.00      0.00      0.00         1
        con       0.00      0.00      0.00         1
        adi       0.50      1.00      0.67         2

avg / total       0.50      0.67      0.56         6



In [24]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

dataset = pd.read_excel('./BreastTissue.xls','Data')
dataset.drop(dataset.columns[0],inplace=True,axis=1)

data = dataset.iloc[:, 1:]
target = dataset.iloc[:, :1]
normalized_data = (data-data.min())/(data.max()-data.min())

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(normalized_data, target, test_size=0.05, random_state=42)

clf = RandomForestClassifier(n_estimators=100)
clf.fit(x_train,y_train)

from sklearn.metrics import accuracy_score, classification_report
y_predicted = clf.predict(x_test)
print "Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100)
print "Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=dataset.Class.unique()))

  app.launch_new_instance()


Accuracy = 83.3333333333 %
Classification Report 
              precision    recall  f1-score   support

        car       1.00      1.00      1.00         2
        fad       0.00      0.00      0.00         0
        mas       0.00      0.00      0.00         0
        gla       0.00      0.00      0.00         1
        con       1.00      1.00      1.00         1
        adi       1.00      1.00      1.00         2

avg / total       0.83      0.83      0.83         6



In [31]:
import numpy as np
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier

dataset = pd.read_excel('./BreastTissue.xls','Data')
dataset.drop(dataset.columns[0],inplace=True,axis=1)

data = dataset.iloc[:, 1:]
target = dataset.iloc[:, :1]
normalized_data = (data-data.min())/(data.max()-data.min())

from sklearn.cross_validation import train_test_split
x_train, x_test, y_train, y_test = train_test_split(normalized_data, target, test_size=0.05, random_state=42)

clf = AdaBoostClassifier(n_estimators=100)
clf.fit(x_train,y_train)

from sklearn.metrics import accuracy_score, classification_report
y_predicted = clf.predict(x_test)
print "Accuracy = {} %".format(accuracy_score(y_test, y_predicted)*100)
print "Classification Report \n {}".format(classification_report(y_test, y_predicted, labels=dataset.Class.unique()))

Accuracy = 100.0 %
Classification Report 
              precision    recall  f1-score   support

        car       1.00      1.00      1.00         2
        fad       0.00      0.00      0.00         0
        mas       0.00      0.00      0.00         0
        gla       1.00      1.00      1.00         1
        con       1.00      1.00      1.00         1
        adi       1.00      1.00      1.00         2

avg / total       1.00      1.00      1.00         6

