In [261]:
# Pandas is used for data manipulation
import pandas as pd
# Use numpy to convert to arrays
import numpy as np
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# To show a figure in notebook
from IPython.display import Image

# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Use datetime for creating date objects for plotting
import datetime

from tabulate import tabulate as tb
from anytree import Node, RenderTree
from anytree import search as anys
from anytree.exporter import DotExporter
from IPython.display import Image

In [2]:
# Read in data and display first 5 rows
features = pd.read_csv('temps.csv')
features.head(5)

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
0,2016,1,1,Fri,45,45,45.6,45,43,50,44,29
1,2016,1,2,Sat,44,45,45.7,44,41,50,44,61
2,2016,1,3,Sun,45,44,45.8,41,43,46,47,56
3,2016,1,4,Mon,44,41,45.9,40,44,48,46,53
4,2016,1,5,Tues,41,40,46.0,44,46,46,46,41


In [3]:
print('The shape of our features is:', features.shape)

The shape of our features is: (348, 12)


In [4]:
# Descriptive statistics for each column
features.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2016.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,57.238506,62.373563,59.772989,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,10.605746,10.549381,10.705256,15.626179
min,2016.0,1.0,1.0,35.0,35.0,45.1,35.0,41.0,46.0,44.0,28.0
25%,2016.0,3.0,8.0,54.0,54.0,49.975,54.0,48.0,53.0,50.0,47.75
50%,2016.0,6.0,15.0,62.5,62.5,58.2,62.5,56.0,61.0,58.0,60.0
75%,2016.0,10.0,23.0,71.0,71.0,69.025,71.0,66.0,72.0,69.0,71.0
max,2016.0,12.0,31.0,117.0,117.0,77.4,92.0,77.0,82.0,79.0,95.0


In [5]:
# One-hot encode the data using pandas get_dummies
features = pd.get_dummies(features)
# Display the first 5 rows of the last 12 columns
features.iloc[:,5:].head(5)

Unnamed: 0,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [6]:
features.head(5)

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,forecast_noaa,forecast_acc,forecast_under,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2016,1,1,45,45,45.6,45,43,50,44,29,1,0,0,0,0,0,0
1,2016,1,2,44,45,45.7,44,41,50,44,61,0,0,1,0,0,0,0
2,2016,1,3,45,44,45.8,41,43,46,47,56,0,0,0,1,0,0,0
3,2016,1,4,44,41,45.9,40,44,48,46,53,0,1,0,0,0,0,0
4,2016,1,5,41,40,46.0,44,46,46,46,41,0,0,0,0,0,1,0


In [7]:
# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

In [8]:
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [9]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (261, 17)
Training Labels Shape: (261,)
Testing Features Shape: (87, 17)
Testing Labels Shape: (87,)


In [252]:
def mse(num_feat,train_features,train_labels):
    mean_featu = train_features[:,num_feat].mean()
    templeq = train_features[:,num_feat] <= mean_featu
    tempg = train_features[:,num_feat] >= mean_featu
    mse_out = train_labels[templeq].std()+train_labels[tempg].std()
    return mse_out, mean_featu, (templeq*1).sum(), (tempg*1).sum()


In [70]:
mse(0,train_features,train_labels)

  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


(nan, 2016.0, 261, 0)

In [253]:
def best_split(train_features,train_labels):
    n = 1
    mse_out, mean_featu, numg, numl = mse(n,train_features,train_labels)
    for i in range(n+1,train_features.shape[1]):
        mse_temp, mean_temp, tnumg, tnuml = mse(i,train_features,train_labels)
        if (mse_out > mse_temp):
            mse_out = mse_temp
            n = i
            mean_featu = mean_temp
            numg = tnumg
            numl = tnuml
    return mse_out, mean_featu, n, numl, numg


def func_leaf(arg):
    return arg[1], (1-arg[0]/arg[1])

In [80]:
best_split(train_features,train_labels)

(14.94189129669283, 62.888888888888886, 4, 134, 127)

In [81]:
tree = {}
tree['node'] = best_split(train_features,train_labels)
tree['greater'] = {}
tree['less'] = {}

In [82]:
tree['node']

(14.94189129669283, 62.888888888888886, 4, 134, 127)

In [244]:
min_mse_rel = 0.5
mean_feat = tree['node'][1]
n_feat = tree['node'][2]

tempg = train_features[train_features[:,n_feat]>=mean_feat,:]
labelg = train_labels[train_features[:,n_feat]>=mean_feat]

nodeg = {}
bs_temp = best_split(tempg,labelg)
if (bs_temp[0]/bs_temp[1] < min_mse_rel):
    nodeg['leaf'] = func_leaf(bs_temp)
    tree['greater'] = nodeg
else:
    nodeg['node'] = bs_temp
    nodeg['greater'] = {}
    nodeg['less'] = {}
    tree['greater'] = nodeg

templ = train_features[train_features[:,n_feat]<=mean_feat,:]
labell = train_labels[train_features[:,n_feat]<=mean_feat]

nodel = {}
bs_temp = best_split(templ,labell)
if (bs_temp[0]/bs_temp[1] < min_mse_rel):
    nodel['leaf'] = func_leaf(bs_temp)
    tree['less'] = nodel
else:
    nodel['node'] = bs_temp
    nodel['greater'] = {}
    nodel['less'] = {}
    tree['less'] = nodel


In [245]:
tree

{'greater': {'leaf': (72.70149253731343, 0.8232647248879077)},
 'less': {'leaf': (51.12834645669291, 0.7880437431075704)},
 'node': (14.94189129669283, 62.888888888888886, 4, 134, 127)}

In [254]:

def plant_the_tree(train_features,train_labels):
    tree = {}
    tree['node'] = best_split(train_features,train_labels)
    return tree

def grow_tree(tree,train_features,train_labels,min_mse=0,min_min_samples_leaf=1):

    mean_feat = tree['node'][1]
    n_feat = tree['node'][2]

    tempg = train_features[train_features[:,n_feat]>mean_feat,:]
    labelg = train_labels[train_features[:,n_feat]>mean_feat]
    
    tree['greater'] = {}
    bs_temp = best_split(tempg,labelg)
    mean_feat_loc = bs_temp[1]
    n_feat_loc = bs_temp[2]
    bool1 = bs_temp[3] <= min_min_samples_leaf
    bool2 = bs_temp[4] <= min_min_samples_leaf
    bool3 = bs_temp[0] <= min_mse
    if (bool1 or bool2 or bool3):
        tree['greater']['leaf']  = bs_temp
        target = labelg[tempg[:,n_feat_loc]>=mean_feat_loc].mean()
        prec = 1-labelg[tempg[:,n_feat_loc]>=mean_feat_loc].std()/target        
        tree['greater']['greater'] = [target, prec]
        target = labelg[tempg[:,n_feat_loc]<=mean_feat_loc].mean()
        prec = 1-labelg[tempg[:,n_feat_loc]<=mean_feat_loc].std()/target        
        tree['greater']['less'] = [target, prec]
    else:
        print(bs_temp)
        tree['greater']['node'] = bs_temp
        grow_tree(tree['greater'],tempg,labelg)

    tempg = train_features[train_features[:,n_feat]<=mean_feat,:]
    labelg = train_labels[train_features[:,n_feat]<=mean_feat]

    tree['less'] = {}
    bs_temp = best_split(tempg,labelg)
    mean_feat_loc = bs_temp[1]
    n_feat_loc = bs_temp[2]
    bool1 = bs_temp[3] <= min_min_samples_leaf
    bool2 = bs_temp[4] <= min_min_samples_leaf
    bool3 = bs_temp[0] <= min_mse
    if (bool1 or bool2 or bool3):
        tree['less']['leaf']  = bs_temp
        target = labelg[tempg[:,n_feat_loc]>=mean_feat_loc].mean()
        prec = 1-labelg[tempg[:,n_feat_loc]>=mean_feat_loc].std()/target        
        tree['less']['greater'] = [target, prec]
        target = labelg[tempg[:,n_feat_loc]<=mean_feat_loc].mean()
        prec = 1-labelg[tempg[:,n_feat_loc]<=mean_feat_loc].std()/target        
        tree['less']['less'] = [target, prec]
    else:
        tree['less']['node'] = bs_temp
        grow_tree(tree['less'],tempg,labelg)
        
    return tree

In [255]:
tree = plant_the_tree(train_features,train_labels)

In [256]:
tree = grow_tree(tree,train_features,train_labels)

(12.848918284641812, 72.70149253731343, 4, 61, 73)
(10.88152505847371, 0.18032786885245902, 15, 11, 50)
(5.13352789392958, 78.9090909090909, 4, 4, 7)
(9.0863900365551, 72.4, 5, 5, 4)
(9.617650104885035, 11.375, 2, 4, 4)
(6.0, 73.75, 9, 2, 2)
(8.716923809014466, 71.53846153846153, 9, 5, 8)
(8.759252917297887, 71.0, 7, 2, 4)
(8.015135510675146, 0.15151515151515152, 10, 5, 28)
(1.9714045207910318, 69.68, 5, 3, 2)
(1.8564659966250536, 64.85714285714286, 3, 5, 2)
(6.83976357667035, 0.125, 16, 6, 42)
(1.7593056225097894, 56.666666666666664, 3, 3, 3)
(2.414213562373095, 57.4, 4, 3, 2)
(4.502790697118382, 0.3, 11, 6, 14)
(1.247219128924647, 56.666666666666664, 6, 3, 3)
(3.0436023260597347, 6.5, 1, 3, 5)
(5.6853339852216696, 0.1282051282051282, 13, 5, 34)
(7.587969505439089, 13.615384615384615, 2, 7, 6)
(5.768525043108829, 0.3333333333333333, 11, 3, 6)


In [257]:
print(tree)

{'node': (14.94189129669283, 62.888888888888886, 4, 134, 127), 'greater': {'node': (12.848918284641812, 72.70149253731343, 4, 61, 73), 'greater': {'node': (10.88152505847371, 0.18032786885245902, 15, 11, 50), 'greater': {'node': (5.13352789392958, 78.9090909090909, 4, 4, 7), 'greater': {'leaf': (0.9428090415820634, 17.75, 2, 3, 1), 'greater': [79.66666666666667, 0.9881655768839072], 'less': [74.0, 1.0]}, 'less': {'node': (3.6199596081647467, 16.285714285714285, 2, 3, 4), 'greater': {'leaf': (0.5, 7.666666666666667, 1, 2, 1), 'greater': [69.5, 0.9928057553956835], 'less': [73.0, 1.0]}, 'less': {'node': (1.5, 74.25, 6, 2, 2), 'greater': {'leaf': (0.0, 5.5, 2, 1, 1), 'greater': [72.0, 1.0], 'less': [73.0, 1.0]}, 'less': {'leaf': (0.0, 8.0, 1, 1, 1), 'greater': [75.0, 1.0], 'less': [77.0, 1.0]}}}}, 'less': {'node': (12.915359341571051, 0.18, 14, 9, 41), 'greater': {'node': (9.0863900365551, 72.4, 5, 5, 4), 'greater': {'leaf': (4.5, 76.6, 4, 1, 4), 'greater': [85.0, 1.0], 'less': [74.5, 0.9

In [251]:
a= np.array((0.2, 0.3))

In [157]:
a[1]

0.3

In [165]:
[0.2, 0.3][0]

0.2

In [267]:
pre = RenderTree(tree)

In [268]:
pre

RenderTree({'node': (14.94189129669283, 62.888888888888886, 4, 134, 127), 'greater': {'node': (12.848918284641812, 72.70149253731343, 4, 61, 73), 'greater': {'node': (10.88152505847371, 0.18032786885245902, 15, 11, 50), 'greater': {'node': (5.13352789392958, 78.9090909090909, 4, 4, 7), 'greater': {'leaf': (0.9428090415820634, 17.75, 2, 3, 1), 'greater': [79.66666666666667, 0.9881655768839072], 'less': [74.0, 1.0]}, 'less': {'node': (3.6199596081647467, 16.285714285714285, 2, 3, 4), 'greater': {'leaf': (0.5, 7.666666666666667, 1, 2, 1), 'greater': [69.5, 0.9928057553956835], 'less': [73.0, 1.0]}, 'less': {'node': (1.5, 74.25, 6, 2, 2), 'greater': {'leaf': (0.0, 5.5, 2, 1, 1), 'greater': [72.0, 1.0], 'less': [73.0, 1.0]}, 'less': {'leaf': (0.0, 8.0, 1, 1, 1), 'greater': [75.0, 1.0], 'less': [77.0, 1.0]}}}}, 'less': {'node': (12.915359341571051, 0.18, 14, 9, 41), 'greater': {'node': (9.0863900365551, 72.4, 5, 5, 4), 'greater': {'leaf': (4.5, 76.6, 4, 1, 4), 'greater': [85.0, 1.0], 'less':