In [None]:
# I shall be predicting the number of vehicles that will be
# passing along the San Francisco road.
# The data is in the format: Tuesday,00:00,San Francisco,no,3
# i.e. day of the week, time of the day, opponent team, binary value
# indicating whether or not a baseball game is currently going on (yes/no),
# number of vehicles passing by

In [2]:
# importing packages
from time import sleep
import numpy as np
from sklearn import metrics 
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import model_selection, preprocessing
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the raw traffic records: one comma-separated record per line.
input_file = 'traffic_data.txt'

# Read the file lazily, line by line, and collect the split fields.
data = []
with open(input_file, 'r') as f:
    for line in f:
        # Strip only the line terminator. Slicing with line[:-1] would chop a
        # real character off the final record whenever the file does not end
        # with a newline.
        line = line.rstrip('\r\n')
        if not line.strip():
            # Skip blank lines so every row keeps the same number of fields
            # and np.array() below yields a regular 2-D array.
            continue
        data.append(line.split(','))
data = np.array(data)

In [4]:
# Encode the non-numerical features as integers while copying columns that
# are already numeric straight through.
# Every encoded column gets its own LabelEncoder. The encoders are stored in
# `label_encoder` in column order because they are needed again later to
# encode unknown data points in exactly the same way.
# NOTE: the first row alone decides whether a column is treated as numeric.
print('Converting string data to numerical data...')
label_encoder = []
print(data.dtype)
X_encoded = np.empty(data.shape)  # float buffer with the same shape as the raw data
for i, item in enumerate(data[0]):
    if item.isdigit():
        # Already numeric: the digit strings are converted on assignment.
        X_encoded[:, i] = data[:, i]
    else:
        # Textual column: fit a dedicated encoder and keep it for reuse.
        encoder = preprocessing.LabelEncoder()
        X_encoded[:, i] = encoder.fit_transform(data[:, i])
        label_encoder.append(encoder)
print('Conversion completed')

# Split the encoded matrix into features (all but the last column) and the
# target (last column), both cast to integers.
X = X_encoded[:, :-1].astype(int)
Y = X_encoded[:, -1].astype(int)

Converting string data to numerical data...
<U13
Conversion completed


In [5]:
# Hold out 25% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
split_parts = model_selection.train_test_split(X, Y, test_size=0.25, random_state=5)
X_train, X_test, Y_train, Y_test = split_parts
print(X_test)

[[  4  24   4   0]
 [  2 217   4   0]
 [  1 121   4   0]
 ...
 [  0 252   6   1]
 [  3 241  11   0]
 [  1 260  11   1]]


In [6]:
# Hyper-parameters for the extremely randomized trees regressor
params = dict(n_estimators=100, max_depth=4, random_state=0)

# Build the regressor from those settings
regressor = ExtraTreesRegressor(**params)

# Fit on the training split; leaving the call as the last expression lets
# the cell display the fitted estimator
regressor.fit(X_train, Y_train)

ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.0, criterion='mse',
                    max_depth=4, max_features='auto', max_leaf_nodes=None,
                    max_samples=None, min_impurity_decrease=0.0,
                    min_impurity_split=None, min_samples_leaf=1,
                    min_samples_split=2, min_weight_fraction_leaf=0.0,
                    n_estimators=100, n_jobs=None, oob_score=False,
                    random_state=0, verbose=0, warm_start=False)

In [7]:
# Show the first held-out sample for reference
print(X_test[0])

# Evaluate the regressor on the held-out split: predict the targets for
# X_test so they can be compared with the true values in Y_test
Y_test_predict = regressor.predict(X_test)

# Mean absolute error between the true and the predicted targets
mae = metrics.mean_absolute_error(Y_test, Y_test_predict)
print('Mean absoulte error:', mae)

[ 4 24  4  0]
Mean absoulte error: 7.424465944802767


In [10]:
# Try the trained model on a few new data instances.
print('Computing results for new datasets')
datapoints = np.array([
    ['Saturday', '10:20', 'Atlanta', 'no'],
    ['Tuesday', '21:30', 'Arizona', 'yes'],
    ['Thursday', '01:00', 'Atlanta', 'no'],
])

# Encode the new data points with the encoders fitted on the training data.
# `count` walks through `label_encoder` in column order, so the n-th textual
# column is transformed by the n-th stored encoder.
datapoints_encoded = np.empty(datapoints.shape)  # float buffer, same shape as the input
count = 0
for i, item in enumerate(datapoints[0]):
    if item.isdigit():
        # Numeric column: copy through unchanged.
        datapoints_encoded[:, i] = datapoints[:, i]
    else:
        datapoints_encoded[:, i] = label_encoder[count].transform(datapoints[:, i])
        count = count + 1
datapoints_encoded = datapoints_encoded.astype(int)
print('datapoints encoded into -->\n', datapoints_encoded)

# Predict all data points with a single batched call instead of invoking
# predict() once per row, then report one rounded prediction per data point.
results = regressor.predict(datapoints_encoded)
for predicted in results:
    print('\nPredicted traffic for:', predicted.round(0))

Computing results for new datasets
datapoints encoded into -->
 [[  2 124   1   0]
 [  5 258   0   1]
 [  4  12   1   0]]

Predicted traffic for: 26.0

Predicted traffic for: 25.0

Predicted traffic for: 6.0
