In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score, r2_score
from sklearn.tree import DecisionTreeClassifier as skDTC
from sklearn.tree import DecisionTreeRegressor as skDTR
from sklearn.ensemble import RandomForestClassifier as skRFC
from sklearn.ensemble import RandomForestRegressor as skRFR
from sklearn.model_selection import train_test_split

from decision_trees.tree import DecisionTreeClassifier, DecisionTreeRegressor
from random_forest.forest import RandomForestRegressor, RandomForestClassifier
from decision_trees.metrics import MeanSquaredError

In [2]:
data_class = np.loadtxt('wifi_localization.txt')
data_reg = pd.read_csv('BostonHousing.csv')
X_reg, y_reg = data_reg.values[:,:-1], data_reg.values[:,-1]
X_class, y_class = data_class[:,:-1], data_class[:,-1]

In [3]:
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.25, random_state=42)
X_train_class, X_test_class, y_train_class, y_test_class = train_test_split(X_class, y_class, test_size=0.25, random_state=42)

# Decision trees
## Classification

In [4]:
dtc = DecisionTreeClassifier(max_depth=2)
skdtc = skDTC(max_depth=2)

In [5]:
dtc.fit(X_train_class, y_train_class)
skdtc.fit(X_train_class, y_train_class)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [6]:
dtc_predictions_train = dtc.predict(X_train_class)
skdtc_predictions_train = skdtc.predict(X_train_class)

dtc_predictions_test = dtc.predict(X_test_class)
skdtc_predictions_test = skdtc.predict(X_test_class)

In [7]:
dtc_acc_train= accuracy_score(y_train_class, dtc_predictions_train)
skdtc_acc_train = accuracy_score(y_train_class, skdtc_predictions_train)
dtc_acc_test= accuracy_score(y_test_class, dtc_predictions_test)
skdtc_acc_test = accuracy_score(y_test_class, skdtc_predictions_test)

print(f'Accuracy:\nTRAIN: own = {dtc_acc_train:.3f}, sklearn {skdtc_acc_train:.3f}\nTEST own = {dtc_acc_test}, sklearn = {skdtc_acc_test}')

Accuracy:
TRAIN: own = 0.959, sklearn 0.959
TEST own = 0.948, sklearn = 0.948


In [8]:
dtc.draw_tree('viz/')

## Visualization of own decision tree classifier

![title](decision_tree_classification.png)

## Regression

In [9]:
dtr = DecisionTreeRegressor(max_depth=2)
skdtr = skDTR(max_depth=2)

In [10]:
dtr.fit(X_train_reg, y_train_reg)
skdtr.fit(X_train_reg, y_train_reg)

DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [11]:
dtr_predictions_train = dtr.predict(X_train_reg)
skdtr_predictions_train = skdtr.predict(X_train_reg)

dtr_predictions_test = dtr.predict(X_test_reg)
skdtr_predictions_test = skdtr.predict(X_test_reg)

In [12]:
dtr_r2_train= r2_score(y_train_reg, dtr_predictions_train)
skdtr_r2_train = r2_score(y_train_reg, skdtr_predictions_train)
dtr_r2_test= r2_score(y_test_reg, dtr_predictions_test)
skdtr_r2_test = r2_score(y_test_reg, skdtr_predictions_test)

print(f'r2_score:\nTRAIN: own = {dtr_r2_train:.3f}, sklearn {skdtr_r2_train:.3f}\nTEST own = {dtr_r2_test}, sklearn = {skdtr_r2_test}')

r2_score:
TRAIN: own = 0.709, sklearn 0.709
TEST own = 0.6067947547484775, sklearn = 0.6370379660938668


In [13]:
dtr.draw_tree('viz/')

## Visualization of decision tree regressor

![title](decision_tree_regression.png)

# Random Forests

## Classification

In [14]:
rfc = RandomForestClassifier(X_train_class, y_train_class, n_trees=20, max_depth=2, rows_percentage=1)
skrfc = skRFC(max_depth=2,n_estimators=20)

In [15]:
rfc.fit()
skrfc.fit(X_train_class, y_train_class)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=20,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [16]:
rfc_predictions_train = rfc.predict(X_train_class)
skrfc_predictions_train = skrfc.predict(X_train_class)

rfc_predictions_test = rfc.predict(X_test_class)
skrfc_predictions_test = skrfc.predict(X_test_class)

In [17]:
rfc_acc_train= accuracy_score(y_train_class, rfc_predictions_train)
skrfc_acc_train = accuracy_score(y_train_class, skrfc_predictions_train)
rfc_acc_test= accuracy_score(y_test_class, rfc_predictions_test)
skrfc_acc_test = accuracy_score(y_test_class, skrfc_predictions_test)

print(f'Accuracy:\nTRAIN: own = {rfc_acc_train:.3f}, sklearn {skrfc_acc_train:.3f}\nTEST own = {rfc_acc_test}, sklearn = {skrfc_acc_test}')

Accuracy:
TRAIN: own = 0.798, sklearn 0.961
TEST own = 0.766, sklearn = 0.96


## Regression

In [5]:
rfr = RandomForestRegressor(X_train_reg, y_train_reg, n_trees=20, max_depth=2, rows_percentage=1, scorer=MeanSquaredError)
skrfr = skRFR(max_depth=2,n_estimators=20)

In [6]:
rfr.fit()
skrfr.fit(X_train_reg, y_train_reg)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=20,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [7]:
rfr_predictions_train = rfr.predict(X_train_reg)
skrfr_predictions_train = skrfr.predict(X_train_reg)

rfr_predictions_test = rfr.predict(X_test_reg)
skrfr_predictions_test = skrfr.predict(X_test_reg)

In [8]:
rfr_r2_train= r2_score(y_train_reg, rfr_predictions_train)
skrfr_r2_train = r2_score(y_train_reg, skrfr_predictions_train)
rfr_r2_test= r2_score(y_test_reg, rfr_predictions_test)
skrfr_r2_test = r2_score(y_test_reg, skrfr_predictions_test)

print(f'Accuracy:\nTRAIN: own = {rfr_r2_train:.3f}, sklearn {skrfr_r2_train:.3f}\nTEST own = {rfr_r2_test}, sklearn = {skrfr_r2_test}')

Accuracy:
TRAIN: own = 0.230, sklearn 0.770
TEST own = 0.272609660372924, sklearn = 0.6919451264916823
