In [5]:
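# install required third-party libraries (uncomment to run)
# `zipfile` and `statistics` are part of the Python standard library, so they are not installed with pip
# %pip install scikit-learn pandas numpy xgboost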

In [6]:
# import libraries
from statistics import mean, stdev
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, cross_val_score
from sklearn import metrics
from xgboost import XGBClassifier

In [7]:
# Load the handwritten-digits dataset through scikit-learn's load_digits and
# wrap its feature matrix in a DataFrame labelled with the dataset's feature names.
digits = load_digits()
digits_df = pd.DataFrame(data=digits["data"], columns=digits["feature_names"])

In [8]:
# Hold out 20% of the samples as a test set (80:20 split).
# random_state pins the shuffle so the split, and every score computed from it,
# is reproducible after a kernel restart; stratify keeps the per-digit class
# proportions the same in the training and test partitions, matching the
# stratified cross-validation used later in the notebook.
x_tr, x_test, y_tr, y_test = train_test_split(
    digits_df,
    digits.target,
    train_size=0.8,
    shuffle=True,
    random_state=42,
    stratify=digits.target,
)

# Extra Trees Classifier for Handwriting Data

In [9]:
# Extra Trees model; random_state fixes the randomised tree construction so the
# scores below are reproducible from run to run.
model = ExtraTreesClassifier(random_state=42)

# Repeated stratified 10-fold cross-validation. n_repeats=10 is the scikit-learn
# default (spelled out here for clarity), so the model is fitted and scored
# 10 x 10 = 100 times, not 10. random_state makes the fold assignment repeatable.
digits_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring="accuracy", cv=digits_cv)

# mean (sample standard deviation) of the cross-validated accuracy on the training split
print("accuracy of ET classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of ET classifier: 0.980 (0.013)


In [10]:
# Train the Extra Trees model on the training split, then predict labels for
# the held-out test split (fit returns the fitted estimator, so the calls chain).
et_preds = model.fit(x_tr, y_tr).predict(x_test)

# Test-set accuracy, rounded to three decimals and shown as the cell output.
round(metrics.accuracy_score(y_true=y_test, y_pred=et_preds), 3)

0.978

# Random Forest Algorithm For Classification

In [11]:
# Random Forest model; random_state fixes the bootstrap sampling and feature
# selection so the scores below are reproducible from run to run.
model = RandomForestClassifier(random_state=42)

# Repeated stratified 10-fold cross-validation. n_repeats=10 is the scikit-learn
# default (spelled out here for clarity), so the model is fitted and scored
# 10 x 10 = 100 times, not 10. random_state makes the fold assignment repeatable.
digits_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring="accuracy", cv=digits_cv)

# mean (sample standard deviation) of the cross-validated accuracy on the training split
print("accuracy of RF classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of RF classifier: 0.974 (0.014)


In [12]:
# Train the Random Forest on the training split, then predict labels for the
# held-out test split (fit returns the fitted estimator, so the calls chain).
rf_preds = model.fit(x_tr, y_tr).predict(x_test)

# Fraction of test samples whose predicted label matches the true label.
print("test accuracy for RF: %.3f" % (metrics.accuracy_score(y_true=y_test, y_pred=rf_preds),))
# Mean squared difference between true and predicted labels.
# NOTE(review): this treats the digit labels as numeric values - confirm MSE is
# the intended metric for a classifier.
print("test MSE for RF: %.3f" % (metrics.mean_squared_error(y_true=y_test, y_pred=rf_preds),))

test accuracy for RF: 0.969
test MSE for RF: 0.614


# XGBoost For Classification

In [13]:
# XGBoost model; random_state pins the library's random number seed so the
# scores below are reproducible from run to run.
model = XGBClassifier(random_state=42)

# Repeated stratified 10-fold cross-validation. n_repeats=10 is the scikit-learn
# default (spelled out here for clarity), so the model is fitted and scored
# 10 x 10 = 100 times, not 10. random_state makes the fold assignment repeatable.
digits_cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=10, random_state=42)
cv_scores = cross_val_score(model, x_tr, y_tr, scoring="accuracy", cv=digits_cv)

# mean (sample standard deviation) of the cross-validated accuracy on the training split
print("accuracy of XGB classifier: %.3f (%.3f)" % (mean(cv_scores), stdev(cv_scores)))

accuracy of XGB classifier: 0.965 (0.016)


In [14]:
# fitting the XGBoost model on the training data
model.fit(x_tr, y_tr)

# making predictions using the fitted model on the test data
xgb_preds = model.predict(x_test)

# accuracy (label fixed: these are XGBoost results; the message previously said "RF")
print("test accuracy for XGB: %.3f" % metrics.accuracy_score(y_test, xgb_preds))
# MSE between true and predicted labels
# NOTE(review): this treats the digit labels as numeric values - confirm MSE is
# the intended metric for a classifier.
print("test MSE for XGB: %.3f" % metrics.mean_squared_error(y_test, xgb_preds))

test accuracy for RF: 0.958
test MSE for RF: 0.956
