In [None]:
# RF_sub_26 - 99.97876

In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import datetime as datetime
import bamboolib

  from numpy.core.umath_tests import inner1d


In [2]:
# Read the training and test CSVs from the working directory.
DF_train, DF_test = (pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv'))


In [3]:
# Drop training rows that look like outliers compared to test:
# keep only rows with rain_p_h below 11 and a non-zero temperature.
print(DF_train.shape)
is_plausible = (DF_train['rain_p_h'] < 11) & (DF_train['temperature'] != 0)
DF_train = DF_train.loc[is_plausible]
print(DF_train.shape)

# Stack train on top of test so features are engineered once for both.
# Test rows get a NaN target so the two parts can be told apart later.
DF_test['traffic_volume'] = np.nan
print(DF_train.shape, DF_test.shape)
DF_prod = pd.concat([DF_train, DF_test])
print(DF_prod.shape)

(33750, 15)
(33664, 15)
(33664, 15) (14454, 15)
(48118, 15)


In [4]:
# Dropping the outliers
print(DF_prod.shape)
# DF_prod holds train AND test rows. Test rows (NaN traffic_volume, set when
# the frames were combined) must all survive, otherwise the submission would
# be missing predictions. So only labelled rows are eligible for removal when
# their rain_p_h equals the overall maximum.
is_unlabelled = DF_prod.traffic_volume.isnull()
is_extreme_rain = DF_prod.rain_p_h == DF_prod.rain_p_h.max()
DF_prod = DF_prod.loc[is_unlabelled | ~is_extreme_rain]
print(DF_prod.shape)

#Correcting the date format
DF_prod['date_time'] = pd.to_datetime(DF_prod.date_time)


(48118, 15)
(48115, 15)


In [5]:
# Extending the holiday to the entire day
# Build a (date, holiday name) lookup from the rows that carry a holiday label.
holiday_mask = DF_prod.is_holiday != 'None'
temp = DF_prod.loc[holiday_mask, ['date_time', 'is_holiday']]
temp['date_time'] = temp['date_time'].dt.date
temp = temp.drop_duplicates()
print(temp.shape)

print(DF_prod.shape)
# Left-join the lookup on the calendar date of every row. The drop/rename keeps
# the original timestamp (date_time_x) and the date-level label (is_holiday_y)
# and discards the join key plus the row-level originals.
DF_prod = (
    DF_prod
    .merge(temp, how='left',
           left_on=[DF_prod['date_time'].dt.date],
           right_on=['date_time'])
    .drop(['date_time', 'date_time_y', 'is_holiday_x'], axis=1)
    .rename(columns={'date_time_x': 'date_time',
                     'is_holiday_y': 'is_holiday'})
)
print(DF_prod.shape)

# Dates without a match in the lookup are ordinary days.
DF_prod['is_holiday'] = DF_prod['is_holiday'].fillna('None')
DF_prod.is_holiday.value_counts()

(53, 2)
(48115, 15)
(48115, 15)


None                         46706
Labor Day                      157
Washingtons Birthday           136
Thanksgiving Day               135
Memorial Day                   134
New Years Day                  131
Christmas Day                  131
Independence Day               126
Veterans Day                   120
State Fair                     120
Columbus Day                   112
Martin Luther King Jr Day      107
Name: is_holiday, dtype: int64

# Feature Engineering

In [301]:
# Derive calendar, holiday, wind-sector and season features on a working copy
# of the combined train+test frame.
DF = DF_prod.copy()
DF['day'] = DF.date_time.dt.day
DF['month'] = DF.date_time.dt.month
DF['hour'] = DF.date_time.dt.hour
DF['weekday_number'] = DF.date_time.dt.dayofweek
# ISO week of the year. Series.dt.week was deprecated in pandas 1.1 and removed
# in pandas 2.0, so the week is read from each Timestamp's isocalendar(), which
# works on every pandas version and yields the same ISO week numbers.
DF['week_number'] = DF.date_time.map(lambda ts: ts.isocalendar()[1], na_action='ignore')
DF['weekday'] = np.where(DF.date_time.dt.dayofweek < 5, 1, 0)
DF['holiday'] = np.where(DF.is_holiday == 'None', 0, 1)

# 1 only when the row is both a weekday and a holiday.
DF['weekday_plus_holiday'] = np.where((DF['weekday'] + DF['holiday']) == 2, 1, 0)

# Bucket the wind direction into 90-degree sectors. Anything outside [0, 270),
# including missing values, falls through to 'North_West'.
DF['wind_direction_NSEW'] = np.where(
    (DF.wind_direction >= 0) & (DF.wind_direction < 90), 'North_East',
    np.where(
        (DF.wind_direction >= 90) & (DF.wind_direction < 180), 'South_East',
        np.where(
            (DF.wind_direction >= 180) & (DF.wind_direction < 270), 'South_West',
            'North_West')))

# Seasons by month: Mar-May, Jun-Aug, Sep-Nov, everything else is Winter.
DF['seasons'] = np.where(
    DF.date_time.dt.month.isin([3, 4, 5]), 'Spring',
    np.where(
        DF.date_time.dt.month.isin([6, 7, 8]), 'Summer',
        np.where(
            DF.date_time.dt.month.isin([9, 10, 11]), 'Fall',
            'Winter')))

In [None]:
# Rush-hour flag: hours 6-8 and 15-17, both ends inclusive.
DF['peak_hour'] = np.where(DF.hour.between(6, 8) | DF.hour.between(15, 17), 1, 0)

# Clock-time boundaries used by the buckets below.
day_time_start = datetime.time(6, 0, 0)
afternoon_time_start = datetime.time(11, 0, 0)
evening_start_time = datetime.time(15, 0, 0)
night_time_start = datetime.time(20, 0, 0)
day_light_start = datetime.time(6, 0, 0)

clock = DF['date_time'].dt.time

# part_of_day buckets exclude their start and include their end:
#   0: (06:00, 11:00]   1: (11:00, 15:00]   2: (15:00, 20:00]   3: everything else
# so a row stamped exactly 06:00:00 lands in bucket 3.
DF['part_of_day'] = np.select(
    [
        (clock > day_time_start) & (clock <= afternoon_time_start),
        (clock > afternoon_time_start) & (clock <= evening_start_time),
        (clock > evening_start_time) & (clock <= night_time_start),
    ],
    [0, 1, 2],
    default=3,
)

# Day_light includes both ends: 06:00:00 through 20:00:00.
DF['Day_light'] = np.where((clock >= day_light_start) & (clock <= night_time_start), 1, 0)

# Temperature of the preceding row in the frame's current order.
# NOTE(review): this is the previous observation only if rows are in
# chronological order, and it carries across the train/test boundary - confirm.
DF['temperature_lag_1'] = DF.temperature.shift(1)

In [306]:
#Next day and previous day holiday
# Work on an explicit copy: assigning into a column slice of DF is what raised
# SettingWithCopyWarning, and it leaves it ambiguous whether DF is modified.
temp = DF[['date_time', 'holiday']].copy()
temp['date_time'] = temp['date_time'].dt.date
# One row per (date, holiday flag) pair, in the order the dates first appear.
temp = temp.drop_duplicates()
# shift() looks at the neighbouring rows of temp, i.e. the previous / next date
# present in the data. The first and last dates get NaN.
# NOTE(review): assumes dates appear in chronological order - confirm.
temp['previous_day_holiday'] = temp['holiday'].shift(1)
temp['next_day_holiday'] = temp['holiday'].shift(-1)

# Attach both flags to every row by calendar date. The drop/rename keeps the
# original timestamp column (date_time_x) and discards the join key and the
# date-only copy.
DF = DF.merge(temp[['date_time', 'previous_day_holiday', 'next_day_holiday']],
              how = 'left', left_on = [DF['date_time'].dt.date],
              right_on = ['date_time']).drop(['date_time', 'date_time_y'], axis = 1).rename(columns = {'date_time_x' : 'date_time'})


(48115, 2)
(1860, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [308]:
# Split the columns by dtype: object columns are treated as categorical, and
# everything else (including date_time and the target) counts as "numeric".
cat_columns = DF.select_dtypes(include='object').columns
num_columns = DF.columns.difference(cat_columns)

for label, cols in (("Cat columns", cat_columns), ("Num columns", num_columns)):
    print(label)
    print(cols)

Cat columns
Index(['weather_type', 'weather_description', 'is_holiday',
       'wind_direction_NSEW', 'seasons'],
      dtype='object')
Num columns
Index(['Day_light', 'air_pollution_index', 'clouds_all', 'date_time', 'day',
       'dew_point', 'holiday', 'hour', 'humidity', 'month', 'next_day_holiday',
       'part_of_day', 'peak_hour', 'previous_day_holiday', 'rain_p_h',
       'snow_p_h', 'temperature', 'temperature_lag_1', 'traffic_volume',
       'visibility_in_miles', 'week_number', 'weekday', 'weekday_number',
       'weekday_plus_holiday', 'wind_direction', 'wind_speed'],
      dtype='object')


# Filling the null values with -999

In [309]:
# Replace every missing value in the non-object columns with the sentinel -999.
# This also covers the target column: test rows end up with
# traffic_volume == -999.
filled_numeric = DF[num_columns].fillna(-999)
DF[num_columns] = filled_numeric

# Label Encoding the categorical features

In [310]:
# Integer-encode each categorical column. A single encoder is refit per column,
# so after the loop LE only remembers the classes of the last column.
LE = LabelEncoder()
print(DF[cat_columns].head())
for column in cat_columns:
    DF[column] = LE.fit_transform(DF[column])
print(DF[cat_columns].head())

  weather_type weather_description is_holiday wind_direction_NSEW seasons
0       Clouds    scattered clouds       None          North_West    Fall
1       Clouds       broken clouds       None          North_West    Fall
2       Clouds     overcast clouds       None          North_West    Fall
3       Clouds     overcast clouds       None          North_West    Fall
4       Clouds       broken clouds       None          North_West    Fall
   weather_type  weather_description  is_holiday  wind_direction_NSEW  seasons
0             1                   24           7                    1        0
1             1                    2           7                    1        0
2             1                   19           7                    1        0
3             1                   19           7                    1        0
4             1                    2           7                    1        0


# Drop columns, select columns for modelling

In [311]:

# Columns that never go into the model, either by choice or because they are
# the target itself.
ignore_columns = ['date_time', 'visibility_in_miles', 'snow_p_h']
dep = 'traffic_volume'
drop = np.array(ignore_columns + [dep])

# Everything else is a predictor (Index.difference returns them sorted).
indep = DF.columns.difference(drop)
print(indep)

Index(['Day_light', 'air_pollution_index', 'clouds_all', 'day', 'dew_point',
       'holiday', 'hour', 'humidity', 'is_holiday', 'month',
       'next_day_holiday', 'part_of_day', 'peak_hour', 'previous_day_holiday',
       'rain_p_h', 'seasons', 'temperature', 'temperature_lag_1',
       'weather_description', 'weather_type', 'week_number', 'weekday',
       'weekday_number', 'weekday_plus_holiday', 'wind_direction',
       'wind_direction_NSEW', 'wind_speed'],
      dtype='object')


# Train / validation split

In [312]:
#train_test_split
train_prod = DF.loc[DF.traffic_volume != -999,]
test_prod = DF.loc[DF.traffic_volume == -999,]
print('Train shape', train_prod.shape, 'Test shape', test_prod.shape)

# Validation period
train_local = train_prod.loc[train_prod.date_time.dt.year <= 2015,]
test_local  = train_prod.loc[train_prod.date_time.dt.year > 2015,]

print(train_local.shape, test_local.shape)
print("")
print(train_local.date_time.dt.year.value_counts()) 
print(test_local.date_time.dt.year.value_counts())
print(test_prod.date_time.dt.year.value_counts())

Train shape (33661, 31) Test shape (14454, 31)
(20276, 31) (13385, 31)

2013    8545
2014    4824
2015    4348
2012    2559
Name: date_time, dtype: int64
2016    9285
2017    4100
Name: date_time, dtype: int64
2018    7949
2017    6505
Name: date_time, dtype: int64


# Random Forest

#### Validation model

In [318]:
# Fit on the pre-2016 rows and score on the held-out later years.
# The forest has no random_state of its own, so it draws from NumPy's global
# generator seeded here.
np.random.seed(100)
rf_params = {'n_estimators': 120, 'max_depth': 15, 'min_samples_leaf': 4}
RF = RandomForestRegressor(**rf_params)
RF.fit(train_local[indep], train_local[dep])

RF_predict = RF.predict(test_local[indep])
validation_mse = mean_squared_error(RF_predict, test_local[dep])
print('RMSE', np.sqrt(validation_mse))

RMSE 459.80545516391885


In [295]:
# Feature importances of whichever forest is currently bound to RF,
# most important first.
RF_imp = pd.DataFrame({'Feature': indep, 'imp': RF.feature_importances_})
RF_imp = RF_imp.sort_values(by='imp', ascending=False)
RF_imp

Unnamed: 0,Feature,imp
0,Day_light,0.639173
6,hour,0.191295
23,weekday_number,0.05426
22,weekday,0.042205
13,peak_hour,0.02288
21,week_number,0.00773
24,weekday_plus_holiday,0.006844
5,holiday,0.006328
17,temperature,0.004566
3,day,0.003868


# Final model

In [319]:
# Refit the validated configuration on every labelled row, then predict the
# unlabelled test rows for submission.
np.random.seed(100)
RF = RandomForestRegressor(n_estimators = 120, max_depth= 15,
                           min_samples_leaf = 4)
RF.fit(train_prod[indep], train_prod[dep])

# test_local is a subset of train_prod, so the model has already seen these
# rows during fit. The figure below is an in-sample sanity check only and must
# not be compared with (or reported as) the validation RMSE.
RF_predict = RF.predict(test_local[indep])
print('In-sample RMSE (rows seen during fit)',
      np.sqrt(mean_squared_error(test_local[dep], RF_predict)))

RF_prod_predict = RF.predict(test_prod[indep])


RMSE 269.5930043762874


In [320]:
# Submission file: one prediction per test timestamp, written without the index.
RF_sub = test_prod[['date_time']].assign(traffic_volume=RF_prod_predict)
RF_sub.to_csv('RF_sub_26.csv', index=False)