In [1]:
# Pandas is used for data manipulation
import pandas as pd

# Use numpy to convert to arrays
import numpy as np

# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load temps.csv into a DataFrame; the path is relative, so the file must sit
# in the notebook's working directory.
df = pd.read_csv('temps.csv')

In [3]:
# Preview the first five rows to sanity-check the load.
df.head(5)

Unnamed: 0,year,month,day,week,temp_2,temp_1,average,actual,friend
0,2019,1,1,Fri,45,45,45.6,45,29
1,2019,1,2,Sat,44,45,45.7,44,61
2,2019,1,3,Sun,45,44,45.8,41,56
3,2019,1,4,Mon,44,41,45.9,40,53
4,2019,1,5,Tues,41,40,46.0,44,41


In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(348, 9)

In [5]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

year       0
month      0
day        0
week       0
temp_2     0
temp_1     0
average    0
actual     0
friend     0
dtype: int64

In [6]:
# Column labels of the frame at this point, i.e. before the one-hot encoding
# cell that follows.
df.columns

Index(['year', 'month', 'day', 'week', 'temp_2', 'temp_1', 'average', 'actual',
       'friend'],
      dtype='object')

In [7]:
# Expand the categorical column(s) into one indicator column per category;
# the numeric columns are carried over unchanged.
df = pd.get_dummies(data=df)
df.head()

Unnamed: 0,year,month,day,temp_2,temp_1,average,actual,friend,week_Fri,week_Mon,week_Sat,week_Sun,week_Thurs,week_Tues,week_Wed
0,2019,1,1,45,45,45.6,45,29,True,False,False,False,False,False,False
1,2019,1,2,44,45,45.7,44,61,False,False,True,False,False,False,False
2,2019,1,3,45,44,45.8,41,56,False,False,False,True,False,False,False
3,2019,1,4,44,41,45.9,40,53,False,True,False,False,False,False,False
4,2019,1,5,41,40,46.0,44,41,False,False,False,False,False,True,False


In [8]:
# Report the frame's dimensions after encoding. In the cell order shown, the
# target column 'actual' has not been split off yet, so the column count
# reported here still includes it.
print('Shape of features after one-hot encoding:', df.shape)

Shape of features after one-hot encoding: (348, 15)


In [9]:
# Target: the column the model learns to predict
labels = df['actual']

# Features: every remaining column (the target is dropped).
# NOTE: this cell is not re-runnable as-is - once 'actual' has been dropped
# from df, running it a second time raises a KeyError on the lookup above.
df = df.drop(columns=['actual'])

# Keep the feature column names around for later use
feature_list = df.columns.tolist()

In [11]:
# Inspect the saved feature names (taken after the 'actual' column was dropped).
feature_list

['year',
 'month',
 'day',
 'temp_2',
 'temp_1',
 'average',
 'friend',
 'week_Fri',
 'week_Mon',
 'week_Sat',
 'week_Sun',
 'week_Thurs',
 'week_Tues',
 'week_Wed']

In [12]:
# Using Scikit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split identical from run to run.
train_features, test_features, train_labels, test_labels = train_test_split(
    df, labels, test_size=0.20, random_state=42
)

In [13]:
# Print the dimensions of each of the four split pieces, in a fixed order.
for label, part in (
    ('Training Features Shape:', train_features),
    ('Training Labels Shape:', train_labels),
    ('Testing Features Shape:', test_features),
    ('Testing Labels Shape:', test_labels),
):
    print(label, part.shape)

Training Features Shape: (278, 14)
Training Labels Shape: (278,)
Testing Features Shape: (70, 14)
Testing Labels Shape: (70,)


In [27]:
# Raw array copy of the test features; the columns have mixed types, so the
# resulting array has dtype=object.
x = test_features.to_numpy()

# Peek at the row at position 12
x[12]

array([2019, 12, 10, 41, 36, 45.9, 65, False, False, True, False, False,
       False, False], dtype=object)

In [33]:
# Spot-check a single prediction: the test-set row at position 13.
#
# NOTE: this cell needs the fitted `rf` from the model-training cell (In [14]),
# which appears further down in this notebook. Run that cell first (or move
# this cell below it); on a fresh "Restart & Run All" in the current order
# `rf` does not exist yet and this line raises a NameError.
#
# A one-row DataFrame slice is passed (the double brackets keep it 2-D) instead
# of a row taken from an object-dtype NumPy array, so the input carries the
# same column names the forest was fitted with and does not depend on the
# scratch variable `x`.
rf.predict(test_features.iloc[[13]])



array([63.096])

In [34]:
# Raw array copy of the test labels
y = test_labels.to_numpy()

# True value for the row at position 13 (compare with the spot-check prediction)
y[13]

65

In [14]:
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor

# Build a 1000-tree forest with a fixed seed and train it on the training
# split. fit() returns the fitted estimator itself, so construction and
# training can be chained into one assignment.
rf = RandomForestRegressor(n_estimators=1000, random_state=42).fit(
    train_features, train_labels
)

In [35]:
# Predict a value for every held-out row
predictions = rf.predict(test_features)

# Absolute difference between each prediction and its true value
# (the result is a Series indexed like test_labels)
errors = (predictions - test_labels).abs()

# Report the mean absolute error (MAE), rounded to two decimals
print('Mean Absolute Error:', round(errors.mean(), 2), 'degrees.')

Mean Absolute Error: 3.78 degrees.


In [16]:
# Display the forest's predicted values for the test set.
predictions

array([68.911, 61.161, 51.789, 61.251, 64.903, 72.525, 81.126, 78.543,
       62.885, 73.721, 64.176, 72.5  , 38.401, 63.096, 71.144, 56.737,
       61.52 , 56.843, 57.115, 76.013, 63.782, 54.808, 65.227, 62.127,
       58.241, 51.965, 66.88 , 47.113, 61.21 , 78.513, 73.1  , 64.059,
       55.673, 79.288, 73.804, 61.929, 53.852, 51.291, 68.417, 43.54 ,
       71.187, 57.225, 76.209, 42.681, 60.809, 74.822, 53.473, 78.87 ,
       54.505, 42.018, 45.627, 42.388, 65.052, 65.869, 75.278, 61.664,
       55.2  , 59.937, 53.554, 59.106, 66.218, 50.42 , 60.741, 70.153,
       60.392, 58.92 , 72.52 , 69.566, 76.215, 40.674])

In [18]:
# Display the true 'actual' values of the held-out rows.
test_labels

255    66
114    61
314    52
268    66
167    70
       ..
93     68
180    76
183    69
197    78
325    36
Name: actual, Length: 70, dtype: int64

In [17]:
# Display the per-row absolute error between prediction and true value.
errors

255    2.911
114    0.161
314    0.211
268    4.749
167    5.097
       ...  
93     9.080
180    3.480
183    0.566
197    1.785
325    4.674
Name: actual, Length: 70, dtype: float64

In [19]:
# Absolute percentage error of each prediction, relative to its true value.
# (Despite the name, `mape` holds the per-row values; the mean is taken below.)
mape = errors.div(test_labels).mul(100)

# "Accuracy" here is defined as 100 minus the mean absolute percentage error
accuracy = 100 - mape.mean()
print('Accuracy:', round(accuracy, 2), '%.')

Accuracy: 94.02 %.


In [20]:
# Take a single estimator (index 5) out of the fitted forest
tree = rf.estimators_[5]

# Dump that tree to a Graphviz dot file, labelling splits with the feature names
export_graphviz(
    tree,
    out_file='tree.dot',
    feature_names=feature_list,
    rounded=True,
    precision=1,
)

# Parse the dot file back in; the unpacking insists on exactly one graph
[graph] = pydot.graph_from_dot_file('tree.dot')

# Render the graph to a png file
graph.write_png('tree.png');

In [21]:
# Report the depth of the single tree pulled out of the forest above, read
# from the fitted tree's underlying tree_ structure.
print('The depth of this tree is:', tree.tree_.max_depth)

The depth of this tree is: 13
