In [1]:
# Pandas is used for data manipulation
import pandas as pd
# Use numpy to convert to arrays
import numpy as np
# Use scikit-learn to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# To show a figure in notebook
from IPython.display import Image

# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Use datetime for creating date objects for plotting
import datetime

from tabulate import tabulate as tb
from anytree import Node, RenderTree
from anytree import search as anys
from anytree.exporter import DotExporter
from IPython.display import Image

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
# Load the temperature records from disk and preview the first 5 rows
features = pd.read_csv('temps.csv')
features.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [3]:
# Report (rows, columns) of the raw table
print('The shape of our features is: {}'.format(features.shape))

The shape of our features is: (348, 12)


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each numeric column
features.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


In [5]:
# One-hot encode the categorical 'week' column using pandas get_dummies.
# The dtype is pinned to uint8 (the historical default): pandas >= 2.0 emits
# bool dummies by default, and mixing bool with numeric columns would turn the
# later np.array(features) conversion into an object array.
features = pd.get_dummies(features, dtype=np.uint8)
# Display the first 5 rows of every column from position 5 onward
features.iloc[:,5:].head(5)

Unnamed: 0,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [6]:
# Preview the full encoded table (head() shows 5 rows by default)
features.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [7]:
# Target vector: the 'actual' column is what the model has to predict
labels = np.array(features['actual'])
# Every remaining column is a predictor, so take the target out of the frame
features = features.drop(columns=['actual'])
# Remember the column order before the frame becomes a bare array
feature_list = features.columns.tolist()
# The tree code below indexes plain numpy arrays, not DataFrames
features = np.array(features)

In [8]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable
train_features, test_features, train_labels, test_labels = train_test_split(
    features, labels, test_size = 0.25, random_state = 42)

# Persist the four arrays as comma-separated text files
for _csv_path, _array in (
    ("train_features.csv", train_features),
    ("test_features.csv", test_features),
    ("train_labels.csv", train_labels),
    ("test_labels.csv", test_labels),
):
    np.savetxt(_csv_path, _array, delimiter=",")

In [9]:
# Sanity check: features and labels of each split must have matching row counts
for _caption, _array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(_caption, _array.shape)

Training Features Shape: (261, 17)
Training Labels Shape: (261,)
Testing Features Shape: (87, 17)
Testing Labels Shape: (87,)


In [13]:
# Split score of one feature, using the feature mean as threshold.
# Return score, threshold, total samples <=, total samples >

def mse(num_feat,train_features,train_labels):
    """Score the split of the labels at the mean of one feature column.

    Despite the name, the score is not a mean squared error: it is the sum of
    the label standard deviations on the two sides of the threshold (lower is
    better).

    The two sides are disjoint: samples equal to the threshold belong to the
    "<=" side only, which is also where grow_tree and use_tree route them.

    Parameters
    ----------
    num_feat : int
        Column index of the candidate feature in ``train_features``.
    train_features : 2-D numpy array
        One row per sample.
    train_labels : 1-D numpy array
        Targets aligned with the rows of ``train_features``.

    Returns
    -------
    tuple
        ``(score, threshold, n_less_equal, n_greater)``. ``score`` is
        ``np.inf`` when one side is empty (constant column or no samples),
        because such a threshold does not separate anything; it therefore
        never beats a real split in best_split.
    """
    column = train_features[:,num_feat]
    if column.size == 0:
        # No samples at all: there is no mean to split on.
        return np.inf, np.nan, 0, 0

    mean_featu = column.mean()
    at_or_below = column <= mean_featu
    above = column > mean_featu
    n_at_or_below = int(at_or_below.sum())
    n_above = int(above.sum())

    if n_at_or_below == 0 or n_above == 0:
        # Degenerate split; std() of an empty side would be NaN.
        return np.inf, mean_featu, n_at_or_below, n_above

    mse_out = train_labels[at_or_below].std() + train_labels[above].std()
    return mse_out, mean_featu, n_at_or_below, n_above


# Function to pick the best feature to split on (i.e. to create a node).
# Return score, threshold, best feature (number), then the two sample counts
# from mse() in reverse order (upper-side count first, "<=" count last).

def best_split(train_features,train_labels):
    """Scan the feature columns and keep the one with the lowest mse() score.

    The scan starts at column 1; column 0 is never evaluated. A later column
    replaces the current best only when its score is strictly lower, so ties
    keep the earliest column.

    Returns
    -------
    tuple
        ``(score, threshold, feature_index, n_upper, n_lower)`` where
        ``n_lower`` is the third value returned by mse() (samples ``<=`` the
        threshold) and ``n_upper`` is the fourth. Note that the two counts
        are swapped relative to the order in which mse() reports them.
    """
    best_feat = 1
    best_score, best_thr, best_lower, best_upper = mse(best_feat,train_features,train_labels)

    for feat in range(best_feat+1,train_features.shape[1]):
        score, thr, n_lower, n_upper = mse(feat,train_features,train_labels)
        if not (best_score > score):
            continue
        best_feat = feat
        best_score, best_thr = score, thr
        best_lower, best_upper = n_lower, n_upper

    return best_score, best_thr, best_feat, best_upper, best_lower


# Function to create the root of the tree.

def plant_the_tree(train_features,train_labels):
    """Return a new tree dict whose only entry, 'node', is the best split of the given data."""
    return {'node': best_split(train_features,train_labels)}


# Function to calculate the precision weight of a leaf. For now it is always 1.

def func_prec(arg_vec):
    """Return the weight given to a leaf's prediction.

    ``arg_vec`` (the labels that fell into the leaf) is accepted but not used
    yet: every leaf gets the constant weight 1.
    """
    constant_weight = 1
    return constant_weight


# Function to make the splits and create the branches.

def _grow_side(tree,side,feats,labels,parent_labels,brachs,min_mse,min_min_samples_leaf):
    """Build ``tree[side]`` ('greater' or 'less') from the samples routed to that side.

    The child becomes a leaf (keys 'leaf', 'greater', 'less') when a stopping
    rule fires, otherwise an inner node (key 'node') that is grown recursively
    with one branch less.
    """
    child = tree[side] = {}

    if len(labels) == 0:
        # Nothing was routed to this side (e.g. the parent feature is constant).
        # Fitting on no data would only give NaN targets, so reuse the parent
        # rule and answer with the parent's label mean on both outcomes.
        child['leaf'] = tree['node']
        child['greater'] = [parent_labels.mean(), func_prec(parent_labels)]
        child['less'] = [parent_labels.mean(), func_prec(parent_labels)]
        return

    bs_temp = best_split(feats,labels)
    mean_feat_loc = bs_temp[1]
    n_feat_loc = bs_temp[2]

    # Branches available
    out_of_branches = brachs < 1
    # Min. samples remaining (last count reported by best_split)
    too_few_samples = bs_temp[4] <= min_min_samples_leaf
    # Min. score value
    good_enough = bs_temp[0] <= min_mse

    if (out_of_branches or too_few_samples or good_enough):
        # Create the leaf (final branch).
        child['leaf'] = bs_temp
        column = feats[:,n_feat_loc]
        # use_tree sends x > threshold to 'greater' and everything else to
        # 'less', so the targets are averaged over exactly those samples.
        above = column > mean_feat_loc
        if not above.any():
            # Constant column: no sample lies strictly above its own mean.
            # Keep the tie samples so the 'greater' target stays defined.
            above = column >= mean_feat_loc
        below = column <= mean_feat_loc
        child['greater'] = [labels[above].mean(), func_prec(labels[above])]
        child['less'] = [labels[below].mean(), func_prec(labels[below])]
    else:
        # Create a new node (one more branch). The stopping thresholds are
        # forwarded so they apply at every depth, not only at the first level.
        child['node'] = bs_temp
        grow_tree(child,feats,labels,(brachs-1),min_mse,min_min_samples_leaf)


def grow_tree(tree,train_features,train_labels,brachs,min_mse=0,min_min_samples_leaf=1):
    """Grow both children of ``tree`` in place and return ``tree``.

    Parameters
    ----------
    tree : dict
        Must hold the split to apply under 'node', as produced by
        plant_the_tree: index 1 is the threshold, index 2 the feature column.
    train_features : 2-D numpy array
        Samples reaching this node, one row each.
    train_labels : 1-D numpy array
        Targets aligned with ``train_features``.
    brachs : int
        Remaining levels of inner nodes allowed below this node; a child
        becomes a leaf once this is below 1.
    min_mse : float, default 0
        A child becomes a leaf when its best split score is <= this value.
    min_min_samples_leaf : int, default 1
        A child becomes a leaf when the last count reported by best_split is
        <= this value.

    Returns
    -------
    dict
        The same ``tree`` object, now with 'greater' and 'less' sub-dicts.
    """
    # node info
    mean_feat = tree['node'][1]
    n_feat = tree['node'][2]

    # Samples strictly above the threshold go to 'greater', the rest to 'less'.
    goes_up = train_features[:,n_feat] > mean_feat
    goes_down = train_features[:,n_feat] <= mean_feat

    _grow_side(tree,'greater',train_features[goes_up,:],train_labels[goes_up],
               train_labels,brachs,min_mse,min_min_samples_leaf)
    _grow_side(tree,'less',train_features[goes_down,:],train_labels[goes_down],
               train_labels,brachs,min_mse,min_min_samples_leaf)

    # Important to ensure the recursion
    return tree


# Function to use a tree (model) to make a prediction for a given sample (x).

def use_tree(tree,x):
    """Route sample ``x`` down ``tree`` and return the entry stored at the leaf it reaches.

    At every level the rule tuple (under 'node' for inner levels, 'leaf' for
    the last one) gives the threshold at index 1 and the feature column at
    index 2. ``x[feature] > threshold`` follows 'greater'; anything else,
    including equality, follows 'less'.

    Returns whatever is stored under the chosen side of the leaf level
    (grow_tree stores ``[target, weight]`` there).
    """
    current = tree
    # Descend through inner nodes until a level without 'node' is reached.
    while 'node' in current:
        rule = current['node']
        branch = 'greater' if x[rule[2]] > rule[1] else 'less'
        current = current[branch]

    rule = current['leaf']
    branch = 'greater' if x[rule[2]] > rule[1] else 'less'
    return current[branch]

        
# Function to generate the forest, using a different batch to build each tree.

def gen_forest(NTree,train_features,train_labels,frac,branchs,random_state=None):
    """Build ``NTree`` trees, each fitted on its own random batch of the training data.

    Parameters
    ----------
    NTree : int
        Number of trees to build.
    train_features, train_labels : numpy arrays
        Full training set the batches are drawn from.
    frac : float
        Fraction of the rows kept for each batch (``1 - frac`` is discarded
        through train_test_split's ``test_size``).
    branchs : int
        Branch budget handed to grow_tree for every tree.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) keeps the previous behaviour: batches come from
        numpy's global random state. An int seed or a RandomState instance
        makes the whole forest repeatable; one shared generator is used so
        that the trees still receive different batches.

    Returns
    -------
    dict
        Maps the tree number ``0 .. NTree-1`` to its grown tree.
    """
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    forest = {}
    for i in range(0,NTree):
        train_features_mini, _, train_labels_mini, _ = train_test_split(
            train_features, train_labels, test_size = (1-frac), shuffle=True, random_state=rng)
        forest[i] = plant_the_tree(train_features_mini,train_labels_mini)
        forest[i] = grow_tree(forest[i],train_features_mini,train_labels_mini,branchs)
    return forest


# Function to use a forest (ensemble model) to generate the vector of
# absolute prediction errors for all samples of a given dataset.

def use_forest(forest,test_features,test_labels):
    """Predict every row with the weighted average of all trees and return the absolute errors.

    ``forest`` is indexed as ``forest[0] .. forest[len(forest)-1]``. Each
    tree contributes the ``(prediction, weight)`` pair returned by use_tree.

    Returns a ``(len(test_labels), 1)`` array; plotting it is left to the caller.
    """
    n_samples = len(test_labels)
    n_trees = len(forest)
    e = np.zeros((n_samples,1))

    for row in range(n_samples):
        x = test_features[row,:]
        votes = [use_tree(forest[k],x) for k in range(n_trees)]
        total_weight = sum(w for _, w in votes)
        weighted_sum = sum(y_hat*w for y_hat, w in votes)
        e[row] = abs(weighted_sum/total_weight - test_labels[row])

    return e


# Function to run a grid of experiments varying the number of trees, the
# branch limit and the fraction of the training set used for each batch.

def run_exps(fran_vec,NTree_vec,branchs_vec):
    """Train and evaluate one forest per (fraction, tree count, branch limit) combination.

    Reads ``train_features``, ``train_labels``, ``test_features`` and
    ``test_labels`` from the notebook's global namespace.

    Returns an array of shape ``(4, len(fran_vec), len(NTree_vec),
    len(branchs_vec))``; along the first axis: 0 = mean, 1 = standard
    deviation, 2 = minimum and 3 = maximum of the absolute test error.
    """
    K, N, T = len(fran_vec), len(NTree_vec), len(branchs_vec)
    results = np.zeros((4,K,N,T))
    print('Start!')

    for t, branchs in enumerate(branchs_vec):
        for n, NTree in enumerate(NTree_vec):
            for k, frac in enumerate(fran_vec):
                # Progress counter, rewritten in place thanks to '\r'
                print('{}/{} <> {}/{} <> {}/{}'.format(k+1, K, n+1, N, t+1, T), end='\r')
                forest = gen_forest(NTree,train_features,train_labels,frac,branchs)
                e = use_forest(forest,test_features,test_labels)
                results[:,k,n,t] = (np.mean(e), np.std(e), np.min(e), np.max(e))

    return results


# Function to run the test step used when pruning.

def tree_test_step(tree,test_features,test_labels):
    """Evaluate a single tree: return (mean, standard deviation) of its absolute errors.

    Only the prediction (element 0 of use_tree's result) is used; the leaf
    weight is ignored.
    """
    e = np.zeros((len(test_labels),1))
    for row, target in enumerate(test_labels):
        prediction = use_tree(tree,test_features[row,:])[0]
        e[row] = abs(prediction-target)
    return np.mean(e), np.std(e)


# The same function as run_exps, with a pruning (tree rejection) step.

def run_exps_prune(fran_vec,NTree_vec,branchs_vec,min_pruned_samples):
    """Grid of experiments where every tree must pass a hold-out error test.

    For each tree, batches are redrawn until the tree's mean absolute error on
    the held-out part of the batch split is below ``min_erro``. Once
    ``min_pruned_samples`` candidates have been rejected for the current tree,
    every further attempt raises ``min_erro`` by 0.01. ``min_erro`` starts at
    0 for each grid point and is shared by all its trees.

    Reads ``train_features``, ``train_labels``, ``test_features`` and
    ``test_labels`` from the notebook's global namespace.

    Returns an array of shape ``(5, len(fran_vec), len(NTree_vec),
    len(branchs_vec))``; along the first axis: 0 = mean, 1 = standard
    deviation, 2 = minimum, 3 = maximum of the absolute test error and
    4 = the final ``min_erro`` threshold.
    """
    N = len(NTree_vec)
    K = len(fran_vec)
    T = len(branchs_vec)
    results = np.zeros((5,K,N,T))
    print('Start!')
    for t in range(0,T):
        branchs = branchs_vec[t]
        for n in range(0,N):
            NTree = NTree_vec[n]
            for k in range(0,K):
                forest = {}
                frac = fran_vec[k]
                min_erro = 0
                progress = str(k+1)+'/'+str(K)+' <> '+str(n+1)+'/'+str(N)+' <> '+str(t+1)+'/'+str(T)
                print(progress, end='\r')
                for i in range(0,NTree):
                    cont_prunes = 0
                    # Infinite start value: the loop always runs at least once,
                    # whatever min_erro has grown to, so forest[i] always exists.
                    temp_error = np.inf
                    while (temp_error >= min_erro):
                        train_features_mini, test_features_mini, train_labels_mini, test_labels_mini, = train_test_split(train_features, train_labels, test_size = (1-frac), shuffle=True)
                        forest[i] = plant_the_tree(train_features_mini,train_labels_mini)
                        try:
                            forest[i] = grow_tree(forest[i],train_features_mini,train_labels_mini,branchs)
                        except Exception:
                            # Fall back to the previous tree when growing fails.
                            # The first tree has no predecessor, so surface the
                            # real error instead of a misleading KeyError(-1).
                            if i == 0:
                                raise
                            forest[i] = forest[i-1]
                        temp_error, _ = tree_test_step(forest[i],test_features_mini,test_labels_mini)
                        if (temp_error > min_erro):
                            cont_prunes += 1
                        if (cont_prunes >= min_pruned_samples):
                            min_erro += 0.01
                            print(progress+'. Th prune lower: '+str(min_erro)+'.', end='\r')

                # Same weighted-average evaluation as in run_exps.
                e = use_forest(forest,test_features,test_labels)

                results[0,k,n,t] = np.mean(e)
                results[1,k,n,t] = np.std(e)
                results[2,k,n,t] = np.min(e)
                results[3,k,n,t] = np.max(e)
                results[4,k,n,t] = min_erro
    return results


# Function to print the best result together with its model parameters.

def best_results(results,fran_vec,NTree_vec,branchs_vec,idx):
    """Print a one-row table for the grid point that minimises ``results[idx]``.

    The "Method" cell shows fraction / tree count / branch limit of that grid
    point; the other cell shows rows 0-3 of ``results`` there, formatted as
    ``mean +-std (min / max)``.
    """
    metric = results[idx,:,:,:]
    k, n, t = np.unravel_index(np.argmin(metric, axis=None), metric.shape)

    method = "RFR {:.2f} / {:.2f} / ".format(fran_vec[k], NTree_vec[n]) + str(branchs_vec[t])
    mean_e, std_e, min_e, max_e = (results[row,k,n,t] for row in range(4))
    error = "{:.2f} +-{:.2f} ({:.2f} / {:.2f})".format(mean_e, std_e, min_e, max_e)

    print(tb([[method, error]], headers=["Method", "Erro (F°)"], tablefmt='orgtbl'))
    print(' ')
    

In [14]:
# Fit one tree with a branch budget of 2 and check it on the first test sample
tree = grow_tree(plant_the_tree(train_features,train_labels),train_features,train_labels,2)
x, y = test_features[0,:], test_labels[0]
yh = use_tree(tree,x)[0]
for _template, _value in (
    ("Target: {:.2f}°", y),
    ("Prediction: {:.2f}°", yh),
    ("Error: {:.2f}°", yh-y),
):
    print(_template.format(_value))

Target: 66.00°
Prediction: 70.43°
Error: 4.43°


In [15]:
# Grid: batch fraction, number of trees, branch limit
fran_vec = np.arange(0.1, 0.9, 0.1)
NTree_vec = np.arange(1, 20, 2)
branchs_vec = np.arange(2, 5)

results1 = run_exps(fran_vec,NTree_vec,branchs_vec)
# Best grid point by mean error (row 0), then by error spread (row 1)
for _row in (0, 1):
    best_results(results1,fran_vec,NTree_vec,branchs_vec,_row)

Start!
8/8 <> 10/10 <> 3/3

| Method              | Erro (F°)                  |
|---------------------+----------------------------|
| RFR 0.40 / 5.00 / 4 | 3.63 +-3.33 (0.05 / 21.56) |
 
| Method              | Erro (F°)                  |
|---------------------+----------------------------|
| RFR 0.60 / 7.00 / 3 | 4.25 +-3.07 (0.03 / 17.55) |
 


In [None]:
# Finer grid: every tree count from 1 to 19 and branch limits starting at 0
fran_vec = np.arange(0.10, 0.8, 0.1)
NTree_vec = np.arange(1, 20)
branchs_vec = np.arange(0, 5)

results2 = run_exps(fran_vec,NTree_vec,branchs_vec)
# Best grid point by mean error (row 0), then by error spread (row 1)
for _row in (0, 1):
    best_results(results2,fran_vec,NTree_vec,branchs_vec,_row)

In [None]:
# Pruned experiment: trees are rejected until they pass the hold-out test
fran_vec = np.arange(0.10, 0.8, 0.1)
NTree_vec  = np.arange(4, 8, 1)
branchs_vec = np.arange(0, 4, 1)
max_prunes = 10

# run_exps takes only three arguments; the pruning threshold belongs to
# run_exps_prune, which accepts it as its fourth parameter.
results3 = run_exps_prune(fran_vec,NTree_vec,branchs_vec,max_prunes)
best_results(results3,fran_vec,NTree_vec,branchs_vec,0)
best_results(results3,fran_vec,NTree_vec,branchs_vec,1)

In [None]:
# Repeat of the results2 grid, to compare two independent runs side by side
fran_vec = np.arange(0.10, 0.8, 0.1)
NTree_vec = np.arange(1, 20)
branchs_vec = np.arange(0, 5)

results2a = run_exps(fran_vec,NTree_vec,branchs_vec)
# Earlier run (results2, from a previous cell) first, then the fresh one
for _run in (results2, results2a):
    for _row in (0, 1):
        best_results(_run,fran_vec,NTree_vec,branchs_vec,_row)

In [None]:
# Larger forests (10 to 90 trees) and branch limits from 0 to 9
fran_vec = np.arange(0.10, 0.6, 0.1)
NTree_vec = np.arange(10, 100, 10)
branchs_vec = np.arange(0, 10)


results4 = run_exps(fran_vec,NTree_vec,branchs_vec)
# Best grid point by mean error (row 0), then by error spread (row 1)
for _row in (0, 1):
    best_results(results4,fran_vec,NTree_vec,branchs_vec,_row)

In [None]:
# Second run over the same grid as results4
fran_vec = np.arange(0.10, 0.6, 0.1)
NTree_vec = np.arange(10, 100, 10)
branchs_vec = np.arange(0, 10)


results4a = run_exps(fran_vec,NTree_vec,branchs_vec)
# Best grid point by mean error (row 0), then by error spread (row 1)
for _row in (0, 1):
    best_results(results4a,fran_vec,NTree_vec,branchs_vec,_row)