In [5]:
# Created by Yash Vakilna
# Last updated: 17 August 2020

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_squared_error
import math

plt.style.use('fivethirtyeight') 

In [6]:
df = pd.read_csv('./Border_Crossing_Entry_Data.csv', 
                 index_col='Date', parse_dates=['Date'])
df.head()

Unnamed: 0_level_0,Port Name,State,Port Code,Border,Measure,Value
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-02-01,Alcan,AK,3104,US-Canada Border,Personal Vehicle Passengers,1414
2020-02-01,Alcan,AK,3104,US-Canada Border,Personal Vehicles,763
2020-02-01,Alcan,AK,3104,US-Canada Border,Truck Containers Empty,412
2020-02-01,Alcan,AK,3104,US-Canada Border,Truck Containers Full,122
2020-02-01,Alcan,AK,3104,US-Canada Border,Trucks,545


# Data Preprocessing
Deleting State, Port Name, and Border since these information are uniquely represented in "Port Code"

In [7]:
del df['State']    
del df['Port Name']
del df['Border']


In [8]:
# Getting daily sums and constructing useful measures
df.groupby(['Date','Port Code', 'Measure']).sum()
df.reset_index()
df = df.reset_index()

# Getting Month, Year, and Day-of-the-week as predictor
df['Month'] = pd.DatetimeIndex(df['Date']).month
df['Year'] = pd.DatetimeIndex(df['Date']).year
df['day'] = pd.DatetimeIndex(df['Date']).dayofweek


In [9]:
# Extracting X, and y
X = df.loc[:,['Port Code','Measure','Year','Month','day']].to_numpy()
y = df['Value'].to_numpy()
X


array([[3104, 'Personal Vehicle Passengers', 2020, 2, 5],
       [3104, 'Personal Vehicles', 2020, 2, 5],
       [3104, 'Truck Containers Empty', 2020, 2, 5],
       ...,
       [115, 'Trucks', 1996, 1, 0],
       [3421, 'Truck Containers Empty', 1996, 1, 0],
       [3103, 'Buses', 1996, 1, 0]], dtype=object)

Converting categorical variables (Measure, Port Code) into onehot encoder

In [10]:
# Encoding Measure (col 1) into one-hot encodded
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
labelencoder_X = LabelEncoder()
X[:, 1] = labelencoder_X.fit_transform(X[:, 1])
onehotencoder = OneHotEncoder(categorical_features = [0])
X = onehotencoder.fit_transform(X).toarray()

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Training Random forest model 

In [None]:
# Fitting Decision Tree Regression to the dataset
from sklearn.tree import DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state = 0)
regressor.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=0, splitter='best')

In [None]:
# Predicting a new result
y_pred = regressor.predict(X_test)

In [None]:
# Evaluating model using Root mean squared error
f"RMSE = {np.round(math.sqrt(mean_squared_error(y_test, y_pred)),2)}"