In [None]:
#Beyond Random Forests: More Ensemble Models

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Let's hide warnings

import warnings
warnings.filterwarnings('ignore')

In [3]:
from sklearn.datasets import fetch_openml

elec_data = fetch_openml(name='electricity', version=1)

In [6]:
#Exploratory Data Analysis

In [7]:
# Getting the whole dataframe

elec_df = elec_data.frame

In [8]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(elec_df , test_size=0.25,random_state=20)

print('The size of training data is: {} \nThe size of testing data is: {}'.format(len(train_data), len(test_data)))

The size of training data is: 33984 
The size of testing data is: 11328


In [9]:
train_data.head(10)

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer,class
27325,0.469846,4,0.276596,0.164705,0.519637,0.011417,0.657949,0.265789,DOWN
28731,0.474227,5,0.574468,0.024919,0.191907,0.001656,0.090886,0.819737,DOWN
8450,0.023141,3,0.042553,0.06527,0.250074,0.003467,0.422915,0.414912,DOWN
36659,0.889385,2,0.744681,0.148193,0.670039,0.009981,0.533402,0.563596,UP
781,0.000708,4,0.276596,0.124204,0.475454,0.003467,0.422915,0.414912,UP
13013,0.428963,7,0.106383,0.055242,0.084647,0.003467,0.422915,0.414912,DOWN
3330,0.009203,1,0.382979,0.045635,0.741892,0.003467,0.422915,0.414912,DOWN
18851,0.446662,2,0.744681,0.183409,0.785034,0.012154,0.757639,0.517105,UP
14838,0.43383,3,0.12766,0.047886,0.141476,0.003467,0.422915,0.414912,DOWN
30462,0.868236,6,0.638298,0.030833,0.702023,0.001963,0.538322,0.674123,UP


In [15]:
X_train = train_data.drop('class', axis=1)
y_train = train_data['class']

In [16]:
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()
y_train_prepared = label_enc.fit_transform(y_train)

In [17]:
#Training Ensemble Classifiers

In [18]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score

log_classifier =  LogisticRegression()
sv_classifier = SVC()
sgd_classifier = SGDClassifier()


def classifiers(clf1, clf2, clf3, X_train, y_train):
    
    """
    A function that takes 5 inputs: 3 classifiers, training data & labels
    And return the list of accuracies on all classifiers
    
    """
    
    # A list of all classifiers
    clfs = [clf1, clf2, clf3]
    
    # An empty list to comprehend 
    all_clfs_acc = []
    
    # Train each classifier, evaluate it on the training set 
    # And append the accuracy to 'all_clfs_acc' 
    
    for clf in clfs:
        
        clf.fit(X_train, y_train)
        preds = clf.predict(X_train)
        acc = accuracy_score(y_train,preds)
        acc = acc.tolist()
        all_clfs_acc.append(acc)
        
    return all_clfs_acc

In [19]:
classifiers(log_classifier,sv_classifier, sgd_classifier, X_train, y_train_prepared)

[0.751088747645951, 0.7368761770244822, 0.7454390301318268]

In [20]:
# As you can see, the function returned 4 accuracies on the training set. The first accuracy correspond to Logistic Regression, the second is Support Vector Classifier, and the third is SGD(Stockastic Gradient Descent,

# Now, let us use Voting Classifier to aggregate the results of all of those 3 classifiers.

In [21]:
from sklearn.ensemble import VotingClassifier

vot_classifier = VotingClassifier(
    
    estimators=[('log_reg', log_classifier),
                ('svc', sv_classifier),
                ('sgd', sgd_classifier)], 
    voting='hard')

vot_classifier.fit(X_train, y_train_prepared)

VotingClassifier(estimators=[('log_reg', LogisticRegression()), ('svc', SVC()),
                             ('sgd', SGDClassifier())])

In [22]:
from sklearn.metrics import accuracy_score

def accuracy(model, data, labels):
    
    predictions = model.predict(data)
    acc = accuracy_score(labels, predictions)
    
    return acc

In [23]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_classifier = BaggingClassifier(
      DecisionTreeClassifier(class_weight='balanced'),
    max_samples=0.5, max_features=0.5, bootstrap=False
)

bag_classifier.fit(X_train, y_train_prepared)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced'),
                  bootstrap=False, max_features=0.5, max_samples=0.5)

In [24]:
accuracy(bag_classifier, X_train, y_train_prepared)

0.9798140301318268

In [25]:
#Gradient Boosting Classifier

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

grad_boost_clf = GradientBoostingClassifier(
                        n_estimators=500, 
                        learning_rate=0.8, 
                        random_state=42,
                        max_depth=2)

grad_boost_clf.fit(X_train, y_train_prepared)

GradientBoostingClassifier(learning_rate=0.8, max_depth=2, n_estimators=500,
                           random_state=42)

In [27]:
accuracy(grad_boost_clf, X_train, y_train_prepared)

0.9269362052730696

In [28]:
 #AdaBoost Classifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

adaboost_clf = AdaBoostClassifier(
    base_estimator=DecisionTreeClassifier(max_depth=3, class_weight='balanced'), 
    #base estimator is decision trees by default
    n_estimators=300,
    learning_rate=0.5

)

adaboost_clf.fit(X_train, y_train_prepared)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced',
                                                         max_depth=3),
                   learning_rate=0.5, n_estimators=300)

In [30]:
accuracy(adaboost_clf, X_train, y_train_prepared)

0.93717631826742

In [31]:
#Stacking Classifier

In [32]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


base_estimators = [
    ('rand', RandomForestClassifier(random_state=42)),
    ('svc', SVC(random_state=42))]

final_estimator = LogisticRegression()

stack_clf = StackingClassifier(estimators = base_estimators, 
                               final_estimator = final_estimator)

stack_clf.fit(X_train, y_train_prepared)

StackingClassifier(estimators=[('rand',
                                RandomForestClassifier(random_state=42)),
                               ('svc', SVC(random_state=42))],
                   final_estimator=LogisticRegression())

In [33]:
accuracy(stack_clf, X_train, y_train_prepared)

0.9996468926553672

In [34]:
#XGBoost

In [45]:
import xgboost as xgb

xgb_clf = xgb.XGBClassifier()
xgb_clf.fit(X_train, y_train_prepared)
# print(xgb_clf)
# # model = xgboost.XGBClassifier()
# model.fit(X_train, y_train)
# print(model)

ValueError: DataFrame.dtypes for data must be int, float, bool or categorical.  When
                categorical type is supplied, DMatrix parameter
                `enable_categorical` must be set to `True`.day

In [36]:
# Evaluating the Ensemble Model on the Test Set

In [37]:
X_test = test_data.drop('class', axis=1)
y_test = test_data['class']

y_test_prepared = label_enc.transform(y_test)

In [38]:
accuracy(grad_boost_clf, X_test, y_test_prepared)

0.8968926553672316

In [39]:
#Let's also evaluate the stacking classifier. It was overly optimistic on the training data.

In [40]:
accuracy(stack_clf, X_test, y_test_prepared)

0.9021892655367232

In [41]:
#How about trying a bag classifier also? It had nearly 98% on the training data.

In [42]:
accuracy(bag_classifier, X_test, y_test_prepared)

0.875882768361582