# Random Forest

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

# Temperature Dataset

In [38]:
# Read the temperature records from disk into a DataFrame.
# The bare name on the last line makes the notebook render the frame.
df = pd.read_csv(filepath_or_buffer='datasets/temperature.csv')
df

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41
...,...,...,...,...,...,...,...,...,...
343,2019,12,27,Tues,42,42,45.2,47,47
344,2019,12,28,Wed,42,47,45.3,48,58
345,2019,12,29,Thurs,47,48,45.3,48,65
346,2019,12,30,Fri,48,48,45.4,57,42


In [39]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(348, 9)

In [40]:
# Per-column dtypes; in the output below `week` is the only object-typed
# (non-numeric) column.
df.dtypes

year         int64
month        int64
day          int64
week        object
temp_2       int64
temp_1       int64
average    float64
actual       int64
friend       int64
dtype: object

In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; the object-typed `week` column does not appear in the result.
df.describe()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend
count,348.0,348.0,348.0,348.0,348.0,348.0,348.0,348.0
mean,2019.0,6.477011,15.514368,62.652299,62.701149,59.760632,62.543103,60.034483
std,0.0,3.49838,8.772982,12.165398,12.120542,10.527306,11.794146,15.626179
min,2019.0,1.0,1.0,35.0,35.0,45.1,35.0,28.0
25%,2019.0,3.0,8.0,54.0,54.0,49.975,54.0,47.75
50%,2019.0,6.0,15.0,62.5,62.5,58.2,62.5,60.0
75%,2019.0,10.0,23.0,71.0,71.0,69.025,71.0,71.0
max,2019.0,12.0,31.0,117.0,117.0,77.4,92.0,95.0


In [42]:
# Replace the categorical `week` column with one indicator column per weekday
# (week_Fri, week_Mon, ...); the numeric columns are carried over as they are.
df = pd.get_dummies(data=df)
df

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,1,0,0,0,0,0,0
1,2019,1,2,44,45,45.7,44,61,0,0,1,0,0,0,0
2,2019,1,3,45,44,45.8,41,56,0,0,0,1,0,0,0
3,2019,1,4,44,41,45.9,40,53,0,1,0,0,0,0,0
4,2019,1,5,41,40,46.0,44,41,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
343,2019,12,27,42,42,45.2,47,47,0,0,0,0,0,1,0
344,2019,12,28,42,47,45.3,48,58,0,0,0,0,0,0,1
345,2019,12,29,47,48,45.3,48,65,0,0,0,0,1,0,0
346,2019,12,30,48,48,45.4,57,42,1,0,0,0,0,0,0


In [43]:
# Target: the `actual` column is the value the model should predict.
y = df['actual']
# Features: every other column of the encoded frame.
x = df.drop(columns='actual')

In [44]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
# The feature frame is bound to lowercase `x` in the cell above, so it must be
# passed under that name (an uppercase `X` is never defined in this notebook
# and would raise NameError on a fresh kernel).
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [45]:
# Report the size of each split as a sanity check: features and labels of the
# same split must have the same number of rows.
for caption, split in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(caption, split.shape)

Training Features Shape: (243, 14)
Training Labels Shape: (243,)
Testing Features Shape: (105, 14)
Testing Labels Shape: (105,)


In [46]:
# Build a forest of 1000 trees with a fixed seed and train it on the training
# split in one step; fit() returns the estimator itself.
rf = RandomForestRegressor(random_state=42, n_estimators=1000).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
rf

RandomForestRegressor(n_estimators=1000, random_state=42)

In [47]:
# Predict the target for the held-out rows (one pass over the 1000 trees is
# enough; a repeated call returns the same array).
predictions = rf.predict(X_test)
# Absolute error of each prediction against the true label.
errors = abs(predictions - y_test)
# Mean absolute error, rounded to two decimals for display.
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')

Mean Absolute Error: 3.87 degrees.


In [48]:
# Absolute percentage error of each test row (error relative to the true value).
mape = (errors / y_test) * 100
# Averaging those gives the MAPE; "accuracy" is reported as 100 minus it.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 93.96 %.


# Petrol Dataset

In [49]:
# Read the petrol consumption table; this rebinds `df`, replacing the
# temperature frame used in the previous section.
df = pd.read_csv(filepath_or_buffer='datasets/petrol_consumption.csv')
df

Unnamed: 0,Petrol_tax,Average_income,Paved_Highways,Population_Driver_licence(%),Petrol_Consumption
0,9.0,3571,1976,0.525,541
1,9.0,4092,1250,0.572,524
2,9.0,3865,1586,0.58,561
3,7.5,4870,2351,0.529,414
4,8.0,4399,431,0.544,410
5,10.0,5342,1333,0.571,457
6,8.0,5319,11868,0.451,344
7,8.0,5126,2138,0.553,467
8,8.0,4447,8577,0.529,464
9,7.0,4512,8507,0.552,498


In [50]:
# Features: the first four columns, selected by position.
x = df.iloc[:, :4]
# Target: the last column (Petrol_Consumption in the table shown above).
y = df.iloc[:, -1]

In [51]:
# Hold out 30% of the rows for testing, with a fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.3,
)

In [52]:
# Report the size of each split as a sanity check: features and labels of the
# same split must have the same number of rows.
for caption, split in (
    ('Training Features Shape:', X_train),
    ('Training Labels Shape:', y_train),
    ('Testing Features Shape:', X_test),
    ('Testing Labels Shape:', y_test),
):
    print(caption, split.shape)

Training Features Shape: (33, 4)
Training Labels Shape: (33,)
Testing Features Shape: (15, 4)
Testing Labels Shape: (15,)


In [53]:
# Build a forest of 1000 trees with a fixed seed and train it on the training
# split in one step; fit() returns the estimator itself.
rf = RandomForestRegressor(random_state=42, n_estimators=1000).fit(X_train, y_train)
# Echo the fitted estimator as the cell output.
rf

RandomForestRegressor(n_estimators=1000, random_state=42)

In [54]:
# Predict the target for the held-out rows.
predictions = rf.predict(X_test)
# Absolute error of each prediction against the true label.
errors = np.abs(predictions - y_test)
# Mean absolute error, rounded to two decimals; as the last expression it is
# what the cell displays.
round(np.mean(errors), 2)

69.93

In [55]:
# Absolute percentage error of each test row (error relative to the true value).
mape = (errors / y_test) * 100
# Averaging those gives the MAPE; "accuracy" is reported as 100 minus it.
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 86.84 %.
