In [1]:
import pickle
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error

# Base directory for the input data and the saved model artifacts.
# File names are appended with plain string concatenation, so the value must
# end with a path separator.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another machine.
path = 'C:/Users/Administrator/practicumProject2/'

In [2]:
# load data
# NOTE: unpickling executes whatever the file tells it to; only load pickle
# files that come from a trusted source.
data_file = path + 'trafficWeatherData_onlyTaxi.pickle'
with open(data_file, 'rb') as pickle_in:
    allData = pickle.load(pickle_in)

# show the column types of the loaded frame
print(allData.dtypes)

zone_id            int32
timeWindow        object
numPu              int32
numDo              int32
taxi_all           int32
taxi_diff          int32
temperature      float64
precipitation    float64
weatherCode        int32
windSpeed        float64
weekday           object
holiday             bool
dtype: object


# Compute a busyness/slack measure 

Standardize the total taxi activity (`taxi_all`) and use the standardized value as the busyness measure.

In [3]:
# standardize the taxi columns (zero mean, unit variance)
scaler = StandardScaler()

# normalize taxi_all and use it as busyness measure
# (fit_transform returns an (n, 1) array; ravel() turns it into a plain column)
allData['busyness'] = scaler.fit_transform(allData[['taxi_all']]).ravel()

# normalize taxi_diff and use it as taxi slack measure
# The source column must be taxi_diff (not taxi_all) and the column name must
# be exactly 'slack' with no trailing space; otherwise slack is merely a copy
# of busyness stored under a name that is awkward to reference.
allData['slack'] = scaler.fit_transform(allData[['taxi_diff']]).ravel()
print(allData.dtypes)

zone_id            int32
timeWindow        object
numPu              int32
numDo              int32
taxi_all           int32
taxi_diff          int32
temperature      float64
precipitation    float64
weatherCode        int32
windSpeed        float64
weekday           object
holiday             bool
busyness         float64
slack            float64
dtype: object


# Prepare data for model training

In [4]:
# create the categorical feature list
categorialFeatures = ['zone_id', 'timeWindow', 'weekday', 'holiday', 'weatherCode']
# create the continuous feature list
continuousFeatures = ['temperature', 'precipitation', 'windSpeed']

# keep only the target and the model features
allFeatures = continuousFeatures + categorialFeatures

# Select the columns, cast the categorical ones to str and renumber the rows in
# one chain. Building a new frame this way avoids assigning columns on a slice
# of the original frame, which raises pandas' SettingWithCopyWarning.
allData = (
    allData[['busyness'] + allFeatures]
    .astype({column: str for column in categorialFeatures})
    .reset_index(drop=True)
)
print(allData.dtypes)

busyness         float64
temperature      float64
precipitation    float64
windSpeed        float64
zone_id           object
timeWindow        object
weekday           object
holiday           object
weatherCode       object
dtype: object


Now, split the data set into a training set and a test set.

In [5]:
# shuffle all rows once (fixed seed), then separate the target from the inputs
subData = allData.sample(frac=1, replace=False, random_state=44215)
y = subData['busyness']
X = subData.drop(columns=['busyness'])

# hold out 20% of the rows as the test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

Normalize the continuous columns

In [6]:
# The scaler is fitted on the training rows only and then reused, unchanged,
# for the test rows.
scaler = StandardScaler()

# continuous part of each split
X_train_cont = X_train[continuousFeatures]
X_test_cont = X_test[continuousFeatures]

# scale, then wrap the resulting arrays back into data frames that carry the
# original column names
X_train_cont_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_cont),
    columns=continuousFeatures,
)
X_test_cont_scaled = pd.DataFrame(
    scaler.transform(X_test_cont),
    columns=continuousFeatures,
)

Encode the categorical columns

In [7]:
# categorical part of each split
X_train_cat, X_test_cat = X_train[categorialFeatures], X_test[categorialFeatures]

# one-hot encode the training rows; drop_first removes the first level of every
# feature so that level becomes the all-zero baseline
X_train_cat_encoded = pd.get_dummies(X_train_cat, drop_first=True, dtype=int)

# remember the training dummy-column layout so other data can be aligned to it
X_train_cat_encoded_columns = X_train_cat_encoded.columns

# define an function to make the test data set be encoded consistently
def uniformEncoding(X_train_cat_encoded_columns, X_new_cat):
    """One-hot encode ``X_new_cat`` using exactly the training dummy columns.

    Parameters
    ----------
    X_train_cat_encoded_columns : sequence of str
        Column labels of the encoded training data, in training order.
    X_new_cat : pandas.DataFrame
        Categorical columns of the data to encode.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``X_new_cat`` with the columns
        ``X_train_cat_encoded_columns`` in that order. Dummy columns that the
        new data does not produce are filled with 0; dummy columns that the
        training data does not have are discarded.
    """
    # Encode every level that occurs in the new data. drop_first must NOT be
    # used here: it would drop the first level of the *new* data, which is not
    # necessarily the baseline level dropped from the training data (for
    # example when the new data is a single row), and that level would then be
    # silently encoded as all zeros.
    X_new_cat_encoded = pd.get_dummies(X_new_cat, dtype=int)
    # Align to the training layout: keep the training columns in training
    # order, add the missing ones as 0, drop the rest (including the training
    # baseline levels).
    return X_new_cat_encoded.reindex(columns=X_train_cat_encoded_columns, fill_value=0)

# encode the test rows with the same dummy columns as the training rows
X_test_cat_encoded = uniformEncoding(X_train_cat_encoded_columns, X_test_cat)

# give both encoded frames a fresh 0..n-1 row index
X_train_cat_encoded = X_train_cat_encoded.reset_index(drop=True)
X_test_cat_encoded = X_test_cat_encoded.reset_index(drop=True)

Merge the normalized continuous columns and encoded categorical columns for training and testing

In [8]:
# NOTE: X_train / X_test are overwritten here with the final model inputs, so
# the scaling and encoding cells cannot simply be re-run afterwards; re-run
# from the train/test split cell instead.

# build the complete training inputs: scaled continuous + encoded categorical
train_parts = [X_train_cont_scaled, X_train_cat_encoded]
X_train = pd.concat(train_parts, axis=1)
print('The shape of training data frame:', X_train.shape)
print('The columns of training data frame:')
print(X_train.dtypes, '\n')


# build the complete test inputs the same way
test_parts = [X_test_cont_scaled, X_test_cat_encoded]
X_test = pd.concat(test_parts, axis=1)
print('The shape of test data frame:', X_test.shape)
print('The columns of test data frame:')
print(X_test.dtypes, '\n')

The shape of training data frame: (614592, 109)
The columns of training data frame:
temperature       float64
precipitation     float64
windSpeed         float64
zone_id_107         int32
zone_id_113         int32
                   ...   
weatherCode_61      int32
weatherCode_63      int32
weatherCode_71      int32
weatherCode_73      int32
weatherCode_75      int32
Length: 109, dtype: object 

The shape of test data frame: (153648, 109)
The columns of test data frame:
temperature       float64
precipitation     float64
windSpeed         float64
zone_id_107         int32
zone_id_113         int32
                   ...   
weatherCode_61      int32
weatherCode_63      int32
weatherCode_71      int32
weatherCode_73      int32
weatherCode_75      int32
Length: 109, dtype: object 



Train a random forest regression model

In [9]:
# hyper-parameters of the forest
n_estimators = 40
max_depth = 40

# build the regressor with a fixed seed so the run is repeatable
rf_regressor = RandomForestRegressor(
    n_estimators=n_estimators,
    max_depth=max_depth,
    random_state=4452,
    oob_score=False,
)

# fit on the training split
rf_regressor.fit(X_train, y_train)

# evaluate on the held-out split with the mean squared error
y_pred = rf_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred)


print('n_estimators = ', n_estimators)
print('max_depth = ', max_depth)
print('Mean Squared Error:', mse)

n_estimators =  40
max_depth =  40
Mean Squared Error: 0.39907375177202004


Save the trained model on disk

In [11]:
# bundle everything that is needed to encode new data and predict with the
# trained model
trainedModel = dict(
    model=rf_regressor,
    continuousFeatures=continuousFeatures,
    categorialFeatures=categorialFeatures,
    X_train_cat_encoded_columns=X_train_cat_encoded_columns,
    scaler=scaler,
    busyness=y.to_list(),
)

# write the bundle to disk
model_file = path+'trainedModel_taxiBusy_40_40.pickle'
with open(model_file, 'wb') as pickle_out:
    pickle.dump(trainedModel, pickle_out)

# Record the experiment result


n_estimators =  40
max_depth =  40
Mean Squared Error: 0.39907375177202004
Size = 470M