Importing dataset

In [None]:
# pandas provides the DataFrame used throughout the notebook
import pandas as pd

# Load the temperature records from disk, then preview the top rows
# (DataFrame.head() shows 5 rows by default)
features = pd.read_csv('temps.csv')
features.head()

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41


In [None]:
# Report the dataset dimensions as a (rows, columns) tuple
n_rows, n_cols = features.shape
print('The shape of our features is:', (n_rows, n_cols))

The shape of our features is: (348, 9)


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns;
# the string-valued 'week' column is left out of the default describe() output
features.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2019.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,15.626179
min,2019.0,1.0,1.0,35.0,35.0,45.1,35.0,28.0
25%,2019.0,3.0,8.0,54.0,54.0,49.975,54.0,47.75
50%,2019.0,6.0,15.0,62.5,62.5,58.2,62.5,60.0
75%,2019.0,10.0,23.0,71.0,71.0,69.025,71.0,71.0
max,2019.0,12.0,31.0,117.0,117.0,77.4,92.0,95.0


Preprocessing

In [None]:
# One-hot encode the categorical 'week' column using pandas get_dummies.
# dtype=int is requested explicitly: pandas >= 2.0 returns boolean dummy
# columns by default, which would display True/False instead of 1/0 and,
# mixed with the numeric columns, would make a later NumPy conversion fall
# back to an object-dtype array.
features = pd.get_dummies(features, dtype=int)
features

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,1,0,0,0,0,0,0
1,2019,1,2,44,45,45.7,44,61,0,0,1,0,0,0,0
2,2019,1,3,45,44,45.8,41,56,0,0,0,1,0,0,0
3,2019,1,4,44,41,45.9,40,53,0,1,0,0,0,0,0
4,2019,1,5,41,40,46.0,44,41,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2019,12,27,42,42,45.2,47,47,0,0,0,0,0,1,0
344,2019,12,28,42,47,45.3,48,58,0,0,0,0,0,0,1
345,2019,12,29,47,48,45.3,48,65,0,0,0,0,1,0,0
346,2019,12,30,48,48,45.4,57,42,1,0,0,0,0,0,0


Splitting Dataset

In [None]:
# numpy supplies the array type used for the model inputs
import numpy as np

# The 'actual' column is the prediction target; copy it into a numpy array
target_column = features['actual']
labels = np.array(target_column)

In [None]:
# Take the target out of the feature table so the model cannot see it;
# drop(columns=...) is the column-wise equivalent of drop(..., axis=1)
features = features.drop(columns='actual')

In [None]:
# Keep the column names: after conversion the array only has positional
# columns, and later cells look features up by name via feature_list.index(...)
feature_list = list(features.columns)
# Convert to a numpy array with an explicit float dtype. Without it, a frame
# that mixes boolean dummy columns (the pandas >= 2.0 get_dummies default)
# with numeric columns converts to an object-dtype array; with older pandas
# the result was already float64, so this is backward-compatible.
features = np.array(features, dtype=float)

In [None]:
# Scikit-learn provides the train/test splitting helper
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs
split_arrays = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=42,
)
train_features, test_features, train_labels, test_labels = split_arrays

In [None]:
# Show the shape of each array produced by the split, in a fixed order
for caption, array in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(caption, array.shape)

Training Features Shape: (261, 14)
Training Labels Shape: (261,)
Testing Features Shape: (87, 14)
Testing Labels Shape: (87,)


Baseline error

In [None]:
# Baseline: predict each test day's temperature with its 'average' column value
average_idx = feature_list.index('average')
baseline_preds = test_features[:, average_idx]
# Absolute deviation of the baseline from the true labels, then its mean
baseline_errors = np.abs(baseline_preds - test_labels)
mean_baseline_error = np.mean(baseline_errors)
print('Average baseline error: ', round(mean_baseline_error, 2))

Average baseline error:  5.06


Creating Model

In [None]:
# Random forest regressor from scikit-learn's ensemble module
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 trees; random_state is fixed so repeated runs build the same forest
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=42,
)
# Fit on the training split (trailing semicolon suppresses the estimator repr)
rf.fit(train_features, train_labels);

Prediction

In [None]:
# Predict a temperature for every row of the held-out test features
predictions = rf.predict(test_features)
# Bare last expression: display the full array of predictions
predictions

array([68.546, 60.784, 51.793, 60.927, 65.881, 70.528, 80.896, 78.619,
       62.719, 73.128, 63.602, 72.64 , 38.659, 62.429, 71.211, 56.202,
       60.822, 56.842, 57.348, 76.862, 64.103, 54.39 , 65.619, 62.349,
       58.085, 52.536, 67.117, 47.089, 61.673, 78.307, 73.671, 64.337,
       55.865, 80.073, 73.967, 61.824, 53.672, 50.83 , 68.275, 43.457,
       70.446, 57.512, 76.381, 42.691, 61.097, 73.52 , 52.555, 79.064,
       54.275, 42.529, 46.701, 42.727, 64.663, 65.748, 74.633, 61.44 ,
       55.451, 59.685, 53.377, 59.33 , 66.457, 50.433, 60.308, 70.522,
       59.998, 58.837, 71.399, 69.518, 75.912, 40.739, 78.394, 56.005,
       60.448, 49.8  , 54.326, 63.446, 43.876, 76.366, 48.199, 52.312,
       53.308, 68.748, 73.033, 73.27 , 63.059, 58.739, 46.929])

Mean Absolute Error

In [None]:
# Run the trained forest over the test features
predictions = rf.predict(test_features)
# Per-sample absolute difference between prediction and true label
errors = np.abs(predictions - test_labels)
# Average those differences to get the mean absolute error (MAE) and report it
mean_abs_error = np.mean(errors)
print('Mean Absolute Error:', round(mean_abs_error, 2), 'degrees.')

Mean Absolute Error: 3.83 degrees.


Accuracy (100% minus the mean absolute percentage error)

In [None]:
# Per-sample absolute percentage error: each error relative to its true label.
# The division assumes no test label is zero.
mape = (errors / test_labels) * 100
# "Accuracy" here is defined as 100 minus the mean of those percentage errors
mean_percentage_error = np.mean(mape)
accuracy = 100 - mean_percentage_error
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 93.98 %.
