# Random Forest Regression

In [1]:
# pandas provides the DataFrame used throughout this notebook
import pandas as pd
# Load the temperature dataset from the working directory
df = pd.read_csv('temperature.csv')
# Preview the first five rows (head() defaults to n=5)
df.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41


In [2]:
# df.shape is (rows, columns). Note the frame still contains the 'actual'
# target column at this point, so not every column counted here is a feature.
print('The shape of our features is:', df.shape)

The shape of our features is: (348, 9)


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2019.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,15.626179
min,2019.0,1.0,1.0,35.0,35.0,45.1,35.0,28.0
25%,2019.0,3.0,8.0,54.0,54.0,49.975,54.0,47.75
50%,2019.0,6.0,15.0,62.5,62.5,58.2,62.5,60.0
75%,2019.0,10.0,23.0,71.0,71.0,69.025,71.0,71.0
max,2019.0,12.0,31.0,117.0,117.0,77.4,92.0,95.0


Preprocessing

In [4]:
# One-hot encode the data using pandas get_dummies.
# The string-valued 'week' column becomes one indicator column per weekday
# (week_Fri, week_Mon, ...); the numeric columns pass through unchanged.
# NOTE: this rebinds `df`, so the frame as originally loaded is no longer available.
df = pd.get_dummies(df)
df

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,1,0,0,0,0,0,0
1,2019,1,2,44,45,45.7,44,61,0,0,1,0,0,0,0
2,2019,1,3,45,44,45.8,41,56,0,0,0,1,0,0,0
3,2019,1,4,44,41,45.9,40,53,0,1,0,0,0,0,0
4,2019,1,5,41,40,46.0,44,41,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2019,12,27,42,42,45.2,47,47,0,0,0,0,0,1,0
344,2019,12,28,42,47,45.3,48,58,0,0,0,0,0,0,1
345,2019,12,29,47,48,45.3,48,65,0,0,0,0,1,0,0
346,2019,12,30,48,48,45.4,57,42,1,0,0,0,0,0,0


Splitting Dataset

In [5]:
# NumPy is imported here for the error metrics computed in later cells
import numpy as np
# Target: the 'actual' column is the value the model learns to predict
y = df['actual']
# Features: every remaining column of the encoded frame
X = df.drop(columns='actual')

In [6]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing; the seed fixes which rows are chosen
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [7]:
# Report the dimensions of each split as a sanity check
for caption, split in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(caption, split.shape)

Training Features Shape: (243, 14)
Training Labels Shape: (243,)
Testing Features Shape: (105, 14)
Testing Labels Shape: (105,)


Creating Model

In [8]:
# Random forest regressor from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestRegressor
# Build a forest of 1000 trees; the fixed seed keeps the fit repeatable
rf = RandomForestRegressor(random_state=42, n_estimators=1000)
# Fit on the training split only
rf.fit(X_train, y_train)

Prediction

In [9]:
# Predict the target for every held-out test row; the bare name on the last
# line makes the notebook display the resulting array
predictions = rf.predict(X_test)
predictions

array([68.356, 60.747, 51.19 , 60.809, 66.684, 69.911, 79.644, 79.129,
       62.514, 73.02 , 63.688, 73.133, 40.02 , 62.559, 71.685, 55.804,
       61.073, 56.396, 57.319, 75.758, 64.835, 54.402, 65.266, 62.458,
       57.976, 52.442, 67.418, 46.782, 62.266, 77.306, 74.213, 63.724,
       55.204, 78.906, 73.59 , 61.843, 53.786, 50.87 , 68.767, 43.673,
       70.062, 57.392, 76.143, 42.963, 61.063, 72.735, 53.566, 77.491,
       54.186, 42.471, 46.098, 42.965, 65.09 , 65.75 , 74.894, 61.342,
       55.314, 59.741, 53.49 , 59.734, 66.145, 51.035, 60.137, 69.653,
       59.891, 58.781, 71.878, 69.447, 74.564, 40.493, 76.352, 55.677,
       60.52 , 49.721, 54.207, 65.303, 43.684, 77.392, 47.921, 53.008,
       53.225, 69.545, 71.94 , 73.071, 63.179, 58.748, 46.579, 68.998,
       60.224, 83.972, 64.701, 49.838, 52.451, 53.6  , 76.074, 39.04 ,
       41.88 , 44.996, 74.12 , 74.437, 39.857, 75.428, 70.774, 52.279,
       75.19 ])

Mean Absolute Error

In [10]:
# Use the forest's predict method on the test data (recomputed here so this
# cell does not depend on the display cell above having been run)
predictions = rf.predict(X_test)
# Absolute error of each prediction; kept as `errors` for the accuracy cell below
errors = abs(predictions - y_test)
# Mean absolute error, computed once. The original cell also evaluated
# round(np.mean(errors), 2) as a bare mid-cell expression whose value was
# discarded; that dead statement is removed.
mae = np.mean(errors)
print('Mean Absolute Error:', round(mae, 2), 'degrees.')

Mean Absolute Error: 3.87 degrees.


Accuracy (100 − MAPE)

In [11]:
# Per-sample absolute percentage error. The denominator is |actual| so that a
# negative actual value cannot produce a negative percentage and inflate the
# score; for the all-positive values in this dataset the result is unchanged.
# NOTE: an actual value of exactly 0 would make this term infinite.
mape = 100 * (errors / np.abs(y_test))
# "Accuracy" here means 100 minus the mean absolute percentage error (MAPE)
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 93.96 %.
