# RTA Freeway Machine learning


In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

#### Eliminating the entire column if empty cells are more than 10000.

In [2]:
def remove_nan_rows(dataframe):
    for column in dataframe.columns:
        if dataframe[column].isnull().sum() > 10000:
            del dataframe[column]
    return dataframe

##### Splitting the data into different feature set. Year, Month, Day, Hour, Minute

In [3]:
def spilt_date(list_of_date_string,date_separator='-',time_separator=':',format='yyyy-mm-dd hh:mm'):
    month_list = list([])
    day_list = list([])
    year_list = list([])
    hour_list = list([])
    minute_list = list([])
    for date_string in list_of_date_string:
        timestamp_list = date_string.strip().split(' ')
        date_list = timestamp_list[0].strip().split(date_separator)
        month_list.append(int(date_list[1]))
        day_list.append(int(date_list[2]))
        year_list.append(int(date_list[0]))
        time_list = timestamp_list[1].strip().split(time_separator)
        hour_list.append(int(time_list[0]))
        minute_list.append(int(time_list[1]))
    return month_list,day_list,year_list,hour_list,minute_list

##### Checking the column has numeric

In [4]:
def isfloat(num):
    try:
        num = float(num)
        return True
    except ValueError:
        return False

##### Printing the accuracy and prediction

In [5]:
def print_evaluation_metrics(trained_model,trained_model_name,X_test,y_test):
    print('--------- For Model : ', trained_model_name ,' ---------\n')
    predicted_values = trained_model.predict(X_test)
    print("Predicted Value : ", predicted_values)
 
    print("Mean Absolute Error : ", metrics.mean_absolute_error(y_test,predicted_values))
    print("Accuracy : ", metrics.accuracy_score(y_test.astype(int),predicted_values.astype(int)))
    print("---------------------------------------\n")

##### Reading the file, removing the non empty rows and columns. Splitting the datetime and putting into seperate list.
##### After spliting the datetime into seperate feature remove the column from dataset.

In [6]:
#Read file
data_file = 'RTAHistorical.csv'
dataset = pd.read_csv(data_file)
dataset = remove_nan_rows(dataset)
dataset.dropna(inplace=True)

#print(dataset.isnull().sum())
#Split DateTime column
dataset['month'],dataset['day'],dataset['year'],\
dataset['hour'],dataset['minute'] = spilt_date(list(dataset['Unnamed: 0'].values))
#print(dataset)

#spliting the Datetime column into day, month, year, hr, min
del dataset['Unnamed: 0']   
#print(dataset)



###### Reading the data. iteration_target_frame contains column values means full segement values. iteration_feature_frame contain row values
#####   means all segment of each row. 
###### Data is split with 80% train data and 20% test data
###### Random Forest Regressor is used to predict the time taken to travel segment.

In [None]:
columns = dataset.columns
column_list = list([])

#print(columns)

for c in columns:
    if isfloat(c):
        column_list.append(c)

abc = list([])
abc.append('month')
abc.append('day')
abc.append('year')
abc.append('hour')
abc.append('minute')

for col in column_list:
    print(col)
    iteration_target_frame = dataset[col].values
    iteration_feature_frame = dataset[abc].values
    X_train,X_test,y_train,y_test = train_test_split(iteration_feature_frame,iteration_target_frame,test_size=0.2,random_state=42)
    rf = RandomForestRegressor(n_estimators=51,min_samples_leaf=5,min_samples_split=3,random_state=42)
    rf.fit(X_train,y_train)
    print_evaluation_metrics(rf, "Random Forest", X_test,y_test)

40010
--------- For Model :  Random Forest  ---------

Predicted Value :  [826.18313011 850.67815883 826.63764897 ... 804.14644696 811.75835056
 824.16669915]
Mean Absolute Error :  49.53764483685065
Accuracy :  0.031187427947333292
---------------------------------------

40015
--------- For Model :  Random Forest  ---------

Predicted Value :  [209.86602134 221.36701183 208.12304276 ... 206.99353893 209.36462598
 208.45530581]
Mean Absolute Error :  16.9110669970084
Accuracy :  0.09256113099933257
---------------------------------------

40020
--------- For Model :  Random Forest  ---------

Predicted Value :  [898.92624534 887.32267269 797.17037912 ... 786.59337068 799.8777151
 787.30915269]
Mean Absolute Error :  62.33166027916452
Accuracy :  0.03413021054547661
---------------------------------------

40025
--------- For Model :  Random Forest  ---------

Predicted Value :  [321.1631116  291.93535293 242.21779807 ... 234.84359403 236.27241231
 240.54415188]
Mean Absolute Error :  