In [14]:
import pandas as pd
import warnings 
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn.preprocessing import PolynomialFeatures
import xgboost

In [15]:
import os
# Enumerate every file under ./dataset (recursively) and print its path.
dataset_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('./dataset')
    for name in names
)
for path in dataset_paths:
    print(path)

./dataset/Test.csv
./dataset/Train.csv
./dataset/sample_submission.csv


In [16]:
# Suppress every warning category for the remainder of the session.
warnings.filterwarnings(action='ignore')
# Read the training and test CSVs into separate frames.
df, dftest = (
    pd.read_csv(csv_path)
    for csv_path in ('./dataset/Train.csv', './dataset/Test.csv')
)
# Show the first two training rows.
df.head(2)

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516


In [29]:
# Full-frame display. This cell carries execution count 29, so its output shows
# the frame after the later encoding cells ran (date/time/day columns present).
df

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume,date,time,day
0,2012-10-02 09:00:00,0,121,89,2,329,1,1,288.28,0.0,0.0,40,0,scattered clouds,5545,2012-10-02,9,0
1,2012-10-02 10:00:00,0,178,67,3,330,1,1,289.36,0.0,0.0,75,0,broken clouds,4516,2012-10-02,10,0
2,2012-10-02 11:00:00,0,113,66,3,329,2,2,289.58,0.0,0.0,90,0,overcast clouds,4767,2012-10-02,11,0
3,2012-10-02 12:00:00,0,20,66,3,329,5,5,290.13,0.0,0.0,90,0,overcast clouds,5026,2012-10-02,12,0
4,2012-10-02 13:00:00,0,281,65,3,329,7,7,291.14,0.0,0.0,75,0,broken clouds,4918,2012-10-02,13,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33745,2017-05-17 21:00:00,0,74,85,1,328,7,7,287.88,0.0,0.0,90,4,mist,2348,2017-05-17,21,1
33746,2017-05-17 22:00:00,0,81,70,1,24,2,2,286.95,0.0,0.0,90,2,heavy intensity rain,2194,2017-05-17,22,1
33747,2017-05-17 22:00:00,0,138,70,1,24,1,1,286.95,0.0,0.0,90,4,mist,2194,2017-05-17,22,1
33748,2017-05-17 23:00:00,0,229,66,1,42,6,6,285.75,0.0,0.0,90,2,heavy intensity rain,1328,2017-05-17,23,1


In [17]:
# Parse the timestamp column, then split it into a calendar date and a clock time.
df['date_time']=pd.to_datetime(df['date_time'])
df['date'],df['time']=df['date_time'].dt.date,df['date_time'].dt.time

In [19]:
# Preview the first rows, now including the derived date and time columns.
df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume,date,time
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545,2012-10-02,09:00:00
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516,2012-10-02,10:00:00
2,2012-10-02 11:00:00,,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767,2012-10-02,11:00:00
3,2012-10-02 12:00:00,,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026,2012-10-02,12:00:00
4,2012-10-02 13:00:00,,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918,2012-10-02,13:00:00


In [20]:
# Count rows that are identical once the two weather label columns are ignored.
dupdf=df.drop(columns=['weather_description','weather_type'])
duplicate_count=int(dupdf.duplicated().sum())
print("Length of duplicated rows= ",duplicate_count)

Length of duplicated rows=  0


In [21]:
# Look at the first 20 rows: which descriptions occur under each weather_type?
# Weather description shouldn't be used in the model fit: it is a more detailed
# variant of weather_type, so it adds classes while contributing little extra value.
list(df['weather_description'].iloc[:20].groupby(df['weather_type']))

[('Clear',
  5     sky is clear
  6     sky is clear
  7     sky is clear
  11    sky is clear
  12    sky is clear
  13    sky is clear
  14    sky is clear
  15    sky is clear
  16    sky is clear
  17    sky is clear
  18    sky is clear
  19    sky is clear
  Name: weather_description, dtype: object),
 ('Clouds',
  0     scattered clouds
  1        broken clouds
  2      overcast clouds
  3      overcast clouds
  4        broken clouds
  8           few clouds
  9           few clouds
  10          few clouds
  Name: weather_description, dtype: object)]

In [22]:
def apply_encoding(train_uniquelist,col):
    """Label-encode ``col`` using positions in ``train_uniquelist``.

    Parameters
    ----------
    train_uniquelist : array-like
        Known category values. A value's code is the index of its first
        occurrence in this sequence.
    col : pandas.Series
        Values to encode.

    Returns
    -------
    pandas.Series
        Integer codes carrying the same index as ``col``.

    Raises
    ------
    IndexError
        If ``col`` contains a value that is not in ``train_uniquelist``.
        The message lists the offending values.
    """
    # Build the value -> position table once instead of scanning the whole
    # vocabulary for every element. setdefault keeps the FIRST position when
    # the vocabulary contains repeats.
    code_of={}
    for position,value in enumerate(train_uniquelist):
        code_of.setdefault(value,position)

    codes=col.map(code_of)

    # Values missing from the table come back as NaN; report them explicitly.
    unseen=col[codes.isna()].unique()
    if len(unseen):
        raise IndexError(
            "value(s) %r not found in train_uniquelist" % (list(unseen),)
        )
    return codes.astype(np.intp)
# Category vocabulary taken from the training frame, in order of first appearance.
train_uniquelist=np.array(df['weather_type'].unique())
# Swap each weather_type label for its position in that vocabulary.
df['weather_type']=apply_encoding(train_uniquelist,df['weather_type'])

In [23]:
# Categorising holiday as 1 (for True) and 0 (for false)
# A non-holiday row can show up in three forms: the literal string 'None',
# a missing value (newer pandas read_csv parses 'None' as NA by default), or
# an already-encoded 0 when this cell is re-run. Everything else is a holiday.
# Assigning the column back avoids the chained `replace(..., inplace=True)`,
# which does not reach `df` when pandas Copy-on-Write is active.
holiday_raw=df['is_holiday']
not_holiday=holiday_raw.isna() | holiday_raw.eq('None') | holiday_raw.eq(0)
df['is_holiday']=(~not_holiday).astype('int64')

In [24]:
# Reduce the clock time to its integer hour (the text before the first ':').
df['time']=df['time'].map(lambda clock: int(str(clock).partition(':')[0]))
# Weekday name per row, then label-encoded in order of first appearance.
df['day']=pd.to_datetime(df['date']).dt.day_name()
# train_uniquelist is rebound here and now holds the weekday names.
train_uniquelist=np.array(df['day'].unique())
df['day']=apply_encoding(train_uniquelist,df['day'])

In [25]:
# Preview the first rows after encoding is_holiday, weather_type, time and day.
df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume,date,time,day
0,2012-10-02 09:00:00,0,121,89,2,329,1,1,288.28,0.0,0.0,40,0,scattered clouds,5545,2012-10-02,9,0
1,2012-10-02 10:00:00,0,178,67,3,330,1,1,289.36,0.0,0.0,75,0,broken clouds,4516,2012-10-02,10,0
2,2012-10-02 11:00:00,0,113,66,3,329,2,2,289.58,0.0,0.0,90,0,overcast clouds,4767,2012-10-02,11,0
3,2012-10-02 12:00:00,0,20,66,3,329,5,5,290.13,0.0,0.0,90,0,overcast clouds,5026,2012-10-02,12,0
4,2012-10-02 13:00:00,0,281,65,3,329,7,7,291.14,0.0,0.0,75,0,broken clouds,4918,2012-10-02,13,0


In [26]:
## Normalizing data
def normalize(da,cols):
    """Max-scale selected columns of a DataFrame.

    Every column named in ``cols`` whose maximum is greater than 1 is divided
    by that maximum. Columns whose maximum is at most 1 (or NaN) are left
    unchanged, as are columns not listed in ``cols``.

    Parameters
    ----------
    da : pandas.DataFrame
        Frame holding the columns to scale. It is not modified.
    cols : iterable
        Labels of the columns to scale. Any label type is accepted.

    Returns
    -------
    pandas.DataFrame
        A copy of ``da`` with the selected columns scaled.
    """
    # Work on a copy so the caller's frame (often a slice of a bigger frame)
    # is never written to.
    scaled=da.copy()
    for name in cols:
        peak=scaled[name].max()
        # One pass per column is enough: after dividing, the maximum is 1.
        if peak>1:
            scaled[name]=scaled[name]/peak
    return scaled

In [27]:
# Columns fed to the model, in the order the estimator will see them.
FEATURE_COLS=['day','time','is_holiday','air_pollution_index','wind_speed','visibility_in_miles','temperature','rain_p_h','snow_p_h','clouds_all','weather_type']
# Measurement columns to max-scale. day, time, is_holiday and weather_type keep their integer codes.
SCALED_COLS=['air_pollution_index','wind_speed','visibility_in_miles','temperature','rain_p_h','snow_p_h','clouds_all']

# .copy() hands normalize() an independent frame, so nothing is written to a
# selection derived from df. Only the columns that are kept get scaled.
data=normalize(df[FEATURE_COLS].copy(),SCALED_COLS)
target=df[['traffic_volume']]
X=data
y=target

# NOTE(review): the scaling above uses maxima from the full frame, i.e. it
# happens before the train/test split below.
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [28]:
# Display the training features produced by the split above (test_size=0.3).
X_train


Unnamed: 0,day,time,is_holiday,air_pollution_index,wind_speed,visibility_in_miles,temperature,rain_p_h,snow_p_h,clouds_all,weather_type
2818,3,14,0,0.692308,0.2500,0.666667,0.898294,0.0,0.0,0.90,0
10214,3,11,0,0.528428,0.0625,0.222222,0.870750,0.0,0.0,0.64,0
27782,1,22,0,0.655518,0.1875,0.111111,0.914742,0.0,0.0,0.75,0
27918,0,8,0,0.926421,0.1875,0.555556,0.904814,0.0,0.0,0.01,1
8866,0,2,0,0.454849,0.0625,0.111111,0.959350,0.0,0.0,0.08,1
...,...,...,...,...,...,...,...,...,...,...,...
20757,2,16,0,0.090301,0.1875,0.666667,0.866922,0.0,0.0,0.90,4
32103,5,15,0,0.688963,0.3125,0.222222,0.903257,0.0,0.0,0.90,0
30403,6,0,0,0.053512,0.0625,0.666667,0.890475,0.0,0.0,0.90,3
21243,3,15,0,0.615385,0.1250,0.111111,0.845056,0.0,0.0,0.01,1
