In [1]:
import pandas as pd
from sklearn import datasets
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Iris

In [3]:
dataset = datasets.load_iris()
features = dataset.data
targets = dataset.target
feature_train, feature_test, target_train, target_test = train_test_split(features, targets, test_size=0.2)

In [4]:
model = AdaBoostClassifier(n_estimators=100, learning_rate=1, random_state=123)
model.fit(feature_train, target_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1,
          n_estimators=100, random_state=123)

In [5]:
predictions = model.predict(feature_test)
print(confusion_matrix(target_test, predictions))
print(accuracy_score(target_test, predictions))

[[12  0  0]
 [ 0  9  2]
 [ 0  1  6]]
0.9


In [6]:
# wine dataset

In [7]:
data = pd.read_csv("data/wine.csv", sep=";")

print(data['quality'].describe())
print('')
print(data['quality'].value_counts())

count    4898.000000
mean        5.877909
std         0.885639
min         3.000000
25%         5.000000
50%         6.000000
75%         6.000000
max         9.000000
Name: quality, dtype: float64

6    2198
5    1457
7     880
8     175
4     163
3      20
9       5
Name: quality, dtype: int64


In [8]:
data.columns

Index(['fixed acidity', 'volatile acidity', 'citric acid', 'residual sugar',
       'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality'],
      dtype='object')

In [9]:
X = data.iloc[:,:-1]
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [10]:
# scaling
X = MinMaxScaler().fit_transform(X)

In [11]:
def isTasty(quality):
    if quality >= 7:
        return 1
    else:
        return 0

data['tasty'] = data["quality"].apply(isTasty)
data['tasty'].value_counts()

0    3838
1    1060
Name: tasty, dtype: int64

In [12]:
y = data['tasty']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [14]:
param_dist = {
 'n_estimators': [50, 100, 200],
 'learning_rate' : [0.001, 0.01, 0.1, 0.5, 1.0],
}

grid_search = GridSearchCV(estimator=AdaBoostClassifier(), param_grid=param_dist, cv=10)
grid_search.fit(X_train, y_train)

print("Best parameter with Grid Search: ", grid_search.best_params_)

Best parameter with Grid Search:  {'learning_rate': 0.5, 'n_estimators': 200}


In [15]:
predictions = grid_search.predict(X_test)

print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[709  45]
 [132  94]]
0.819387755102
