In [20]:
# install required libraries
# ! pip install scikit-learn pandas numpy zipfile statistics xgboost

In [62]:
# import libraries
from statistics import mean, stdev
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder

In [22]:
# loading the dataset
digits = load_digits() # dataset containing handwritten digits
digits_df = pd.DataFrame(digits.data, columns=digits.feature_names)

In [23]:
# splitting the data into training and test data - 80:20 split
x_tr, x_test, y_tr, y_test = train_test_split(digits_df, digits.target, train_size = 0.8, shuffle = True)

# Extra Trees Classifier for Handwriting Data

In [24]:
# defining the model
model = ExtraTreesClassifier()

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of ET classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of ET classifier: 0.979 (0.012)


In [25]:
# making predictions on the model using the test data
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions on the test data
et_preds = model.predict(x_test)

# accuracy
print("test accuracy for ET: %.3f" % metrics.accuracy_score(y_test, et_preds))
# MSE
print("test MSE for ET: %.3f" % metrics.mean_squared_error(y_test, et_preds))

test accuracy for ET: 0.994
test MSE for ET: 0.056


# Random Forest Algorithm For Classification

In [26]:
# defining the model
model = RandomForestClassifier()

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of RF classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of RF classifier: 0.971 (0.014)


In [27]:
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
rf_preds = model.predict(x_test)

# accuracy
print("test accuracy for RF: %.3f" % metrics.accuracy_score(y_test, rf_preds))
# MSE
print("test MSE for RF: %.3f" % metrics.mean_squared_error(y_test, rf_preds))

test accuracy for RF: 0.983
test MSE for RF: 0.381


# XGBoost For Classification

In [29]:
# defining the model
model = XGBClassifier()

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of XGB classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of XGB classifier: 0.963 (0.016)


In [30]:
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
xgb_preds = model.predict(x_test)

# accuracy
print("test accuracy for XGB: %.3f" % metrics.accuracy_score(y_test, xgb_preds))
# MSE
print("test MSE for XGB: %.3f" % metrics.mean_squared_error(y_test, xgb_preds))

test accuracy for XGB: 0.981
test MSE for XGB: 0.342


# Support Vector Classifier

In [31]:
# defining the model
model = SVC(kernel = "rbf")

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of support vector classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of support vector classifier: 0.984 (0.011)


In [32]:
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
svc_preds = model.predict(x_test)

# accuracy
print("test accuracy for SVC: %.3f" % metrics.accuracy_score(y_test, svc_preds))
# MSE
print("test MSE for SVC: %.3f" % metrics.mean_squared_error(y_test, svc_preds))

test accuracy for SVC: 0.997
test MSE for SVC: 0.011


# Linear Discriminant Analysis

In [33]:
# defining the model
model = LinearDiscriminantAnalysis()

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of support vector classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of support vector classifier: 0.953 (0.016)


In [34]:
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
lda_preds = model.predict(x_test)

# accuracy
print("test accuracy for LDA: %.3f" % metrics.accuracy_score(y_test, lda_preds))
# MSE
print("test MSE for LDA: %.3f" % metrics.mean_squared_error(y_test, lda_preds))

test accuracy for LDA: 0.953
test MSE for LDA: 1.319


## K-Nearest Neighbours

In [35]:
# defining the model
model = KNeighborsClassifier(n_neighbors = 3)

# 10-fold cross-validation
digits_cv = RepeatedStratifiedKFold(n_splits = 10)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)

# accuracy using the training data
print("accuracy of KNN classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of KNN classifier: 0.985 (0.011)


In [36]:
# fitting the model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
knn_preds = model.predict(x_test)

# accuracy
print("test accuracy for KNN Classifier: %.3f" % metrics.accuracy_score(y_test, knn_preds))
# MSE
print("test MSE for KNN Classifier: %.3f" % metrics.mean_squared_error(y_test, knn_preds))

test accuracy for KNN Classifier: 0.997
test MSE for KNN Classifier: 0.025


In [56]:
# creating a function to run all models
def run_models(x_tr, y_tr, x_test, y_test, res, dataset: str, n):

    ## EXTRA TREES CLASSIFIER
    # defining the model
    model = ExtraTreesClassifier()
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of ET classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # making predictions on the model using the test data
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions on the test data
    et_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for ET: %.3f" % metrics.accuracy_score(y_test, et_preds))
    test_acc = round(metrics.accuracy_score(y_test, et_preds), 3)
    # MSE
    # print("test MSE for ET: %.3f" % metrics.mean_squared_error(y_test, et_preds))
    test_mse = round(metrics.mean_squared_error(y_test, et_preds), 3)
    # add results to the dataframe
    res.loc['a'] = [dataset, 'ExtraTreesClassifier', mean_trscore, std_trscore, test_acc, test_mse]

    ## RANDOM FOREST
    # defining the model
    model = RandomForestClassifier()
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of RF classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions using the fitted model on the test data
    rf_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for RF: %.3f" % metrics.accuracy_score(y_test, rf_preds))
    test_acc = round(metrics.accuracy_score(y_test, rf_preds), 3)
    # MSE
    # print("test MSE for RF: %.3f" % metrics.mean_squared_error(y_test, rf_preds))
    test_mse = round(metrics.mean_squared_error(y_test, rf_preds), 3)
    # add results to the dataframe
    res.loc['b'] = [dataset, 'RandomForestClassifier', mean_trscore, std_trscore, test_acc, test_mse]
    
    ## XGBOOST
    # defining the model
    model = XGBClassifier()
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of XGB classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions using the fitted model on the test data
    xgb_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for XGB: %.3f" % metrics.accuracy_score(y_test, xgb_preds))
    test_acc = round(metrics.accuracy_score(y_test, xgb_preds), 3)
    # MSE
    # print("test MSE for XGB: %.3f" % metrics.mean_squared_error(y_test, xgb_preds))
    test_mse = round(metrics.mean_squared_error(y_test, xgb_preds), 3)
    # add results to the dataframe
    res.loc['c'] = [dataset, 'XGBoostClassifier', mean_trscore, std_trscore, test_acc, test_mse]

    ## SVC
    # defining the model
    model = SVC(kernel = "rbf")
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of support vector classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions using the fitted model on the test data
    svc_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for SVC: %.3f" % metrics.accuracy_score(y_test, svc_preds))
    test_acc = round(metrics.accuracy_score(y_test, svc_preds), 3)
    # MSE
    # print("test MSE for SVC: %.3f" % metrics.mean_squared_error(y_test, svc_preds))
    test_mse = round(metrics.mean_squared_error(y_test, svc_preds), 3)
    # add results to the dataframe
    res.loc['d'] = [dataset, 'SupportVectorClassifier', mean_trscore, std_trscore, test_acc, test_mse]

    ## LDA
    # defining the model
    model = LinearDiscriminantAnalysis()
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of support vector classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions using the fitted model on the test data
    lda_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for LDA: %.3f" % metrics.accuracy_score(y_test, lda_preds))
    test_acc = round(metrics.accuracy_score(y_test, lda_preds), 3)
    # MSE
    # print("test MSE for LDA: %.3f" % metrics.mean_squared_error(y_test, lda_preds))
    test_mse = round(metrics.mean_squared_error(y_test, lda_preds), 3)
    # add results to the dataframe
    res.loc['e'] = [dataset, 'LDAClassifier', mean_trscore, std_trscore, test_acc, test_mse]

    ## KNN
    # defining the model
    model = KNeighborsClassifier(n_neighbors = 3)
    # n-fold cross-validation
    digits_cv = RepeatedStratifiedKFold(n_splits = n)
    cv_scores = cross_val_score(model, x_tr, y_tr, scoring = "accuracy", cv = digits_cv)
    # accuracy using the training data
    # print("accuracy of KNN classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))
    mean_trscore = round(mean(cv_scores), 3)
    std_trscore = round(stdev(cv_scores), 3)
    # fitting the model on the training data
    model.fit(x_tr, y_tr)
    # making predictions using the fitted model on the test data
    knn_preds = model.predict(x_test)
    # accuracy
    # print("test accuracy for KNN Classifier: %.3f" % metrics.accuracy_score(y_test, knn_preds))
    test_acc = round(metrics.accuracy_score(y_test, knn_preds), 3)
    # MSE
    # print("test MSE for KNN Classifier: %.3f" % metrics.mean_squared_error(y_test, knn_preds))
    test_mse = round(metrics.mean_squared_error(y_test, knn_preds), 3)
    # add results to the dataframe
    res.loc['f'] = [dataset, 'KNNClassifier', mean_trscore, std_trscore, test_acc, test_mse]

    return res
    

## Digits Dataset

In [66]:
# loading the dataset
data = load_digits() # dataset containing handwritten digits
df = pd.DataFrame(data.data, columns=data.feature_names)

# splitting the data into training and test data - 80:20 split
x_tr, x_test, y_tr, y_test = train_test_split(df, data.target, train_size = 0.8, shuffle = True)
le = LabelEncoder()
y_tr = le.fit_transform(y_tr)

res_digits = run_models(x_tr, y_tr, x_test, y_test, pd.DataFrame(columns = ['dataset', 'model', 'train_score', 'train_score_std', 'test_accuracy', 'test_mse']), "digits", 4)

In [67]:
res_digits

Unnamed: 0,dataset,model,train_score,train_score_std,test_accuracy,test_mse
a,digits,ExtraTreesClassifier,0.98,0.008,0.978,0.492
b,digits,RandomForestClassifier,0.971,0.008,0.969,0.442
c,digits,XGBoostClassifier,0.958,0.011,0.978,0.422
d,digits,SupportVectorClassifier,0.988,0.005,0.983,0.397
e,digits,LDAClassifier,0.949,0.011,0.961,1.058
f,digits,KNNClassifier,0.988,0.006,0.978,0.692


## Wine Quality Dataset - Red Wine

In [68]:
# loading the dataset
df = pd.read_csv('winequality-red.csv', sep = ';')
y = df['quality']

# splitting the data into training and test data - 80:20 split
x_tr, x_test, y_tr, y_test = train_test_split(df, y, train_size = 0.8, shuffle = True, stratify = y)
le = LabelEncoder()
y_tr = le.fit_transform(y_tr)

res_wqred = run_models(x_tr, y_tr, x_test, y_test, pd.DataFrame(columns = ['dataset', 'model', 'train_score', 'train_score_std', 'test_accuracy', 'test_mse']), "wineQualityRed", 4)

In [69]:
res_wqred

Unnamed: 0,dataset,model,train_score,train_score_std,test_accuracy,test_mse
a,digits,ExtraTreesClassifier,0.997,0.003,0.0,8.984
b,digits,RandomForestClassifier,0.981,0.004,0.0,8.981
c,digits,XGBoostClassifier,1.0,0.0,0.0,9.0
d,digits,SupportVectorClassifier,0.517,0.027,0.003,9.362
e,digits,LDAClassifier,0.588,0.025,0.006,9.638
f,digits,KNNClassifier,0.576,0.023,0.0,10.003


## Wine Quality Dataset - White Wine

In [None]:
# loading the dataset
df = pd.read_csv('winequality-white.csv', sep = ';')
y = df['quality']

# splitting the data into training and test data - 80:20 split
x_tr, x_test, y_tr, y_test = train_test_split(df, y, train_size = 0.8, shuffle = True)
le = LabelEncoder()
y_tr = le.fit_transform(y_tr)

res_wqwhite = run_models(x_tr, y_tr, x_test, y_test, pd.DataFrame(columns = ['dataset', 'model', 'train_score', 'train_score_std', 'test_accuracy', 'test_mse']), "wineQualityWhite", 10)

In [84]:
res_wqwhite

Unnamed: 0,dataset,model,train_score,train_score_std,test_accuracy,test_mse
a,digits,ExtraTreesClassifier,0.999,0.001,0.0,9.0
b,digits,RandomForestClassifier,0.995,0.002,0.0,8.984
c,digits,XGBoostClassifier,0.999,0.0,0.0,9.007
d,digits,SupportVectorClassifier,0.454,0.005,0.003,9.433
e,digits,LDAClassifier,0.528,0.015,0.004,9.988
f,digits,KNNClassifier,0.505,0.014,0.002,10.019


## Energy Efficiency - Heating Load

In [None]:
# Energy Efficiency - Heating Load
df = pd.read_excel('ENB2012_data.xlsx')
df.dropna(axis=0, inplace=True)
df = df.drop('Y2', axis = 1)
df.Y1 = df.Y1.round()
df = df.astype({'Y1':'int'})
y = df['Y1']


# splitting the data into training and test data - 80:20 split
x_tr, x_test, y_tr, y_test = train_test_split(df, y, train_size = 0.8, shuffle = True)
le = LabelEncoder()
y_tr = le.fit_transform(y_tr)

res_energy_heating = run_models(x_tr, y_tr, x_test, y_test, pd.DataFrame(columns = ['dataset', 'model', 'train_score', 'train_score_std', 'test_accuracy', 'test_mse']), "energyEfficiencyHeating", 4)