In [21]:
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import PolynomialFeatures
from tensorflow.keras.layers import Dense, Dropout, LSTM
from tensorflow.keras.models import Sequential

In [22]:
import os

# Walk the dataset directory tree and print the relative path of every file in it.
for root, _subdirs, names in os.walk('./dataset'):
    for name in names:
        file_path = os.path.join(root, name)
        print(file_path)

./dataset/Test.csv
./dataset/Train.csv
./dataset/sample_submission.csv


In [23]:
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Read the training and test CSV files into separate frames.
TRAIN_CSV = './dataset/Train.csv'
TEST_CSV = './dataset/Test.csv'
df = pd.read_csv(TRAIN_CSV)
dftest = pd.read_csv(TEST_CSV)

# Peek at the first two training rows.
df.head(2)

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516


In [24]:
# Display the training frame; the rendered output elides the middle rows.
df

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516
2,2012-10-02 11:00:00,,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767
3,2012-10-02 12:00:00,,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026
4,2012-10-02 13:00:00,,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33745,2017-05-17 21:00:00,,74,85,1,328,7,7,287.88,0.0,0.0,90,Mist,mist,2348
33746,2017-05-17 22:00:00,,81,70,1,24,2,2,286.95,0.0,0.0,90,Rain,heavy intensity rain,2194
33747,2017-05-17 22:00:00,,138,70,1,24,1,1,286.95,0.0,0.0,90,Mist,mist,2194
33748,2017-05-17 23:00:00,,229,66,1,42,6,6,285.75,0.0,0.0,90,Rain,heavy intensity rain,1328


In [25]:
# Parse the timestamp strings into datetimes, then derive two helper columns:
# 'date' (calendar date) and 'time' (clock time) from the parsed values.
df['date_time'] = pd.to_datetime(df['date_time'])
timestamps = df['date_time'].dt
df['date'] = timestamps.date
df['time'] = timestamps.time

In [26]:
# Preview the first rows to confirm the new 'date' and 'time' columns were added.
df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume,date,time
0,2012-10-02 09:00:00,,121,89,2,329,1,1,288.28,0.0,0.0,40,Clouds,scattered clouds,5545,2012-10-02,09:00:00
1,2012-10-02 10:00:00,,178,67,3,330,1,1,289.36,0.0,0.0,75,Clouds,broken clouds,4516,2012-10-02,10:00:00
2,2012-10-02 11:00:00,,113,66,3,329,2,2,289.58,0.0,0.0,90,Clouds,overcast clouds,4767,2012-10-02,11:00:00
3,2012-10-02 12:00:00,,20,66,3,329,5,5,290.13,0.0,0.0,90,Clouds,overcast clouds,5026,2012-10-02,12:00:00
4,2012-10-02 13:00:00,,281,65,3,329,7,7,291.14,0.0,0.0,75,Clouds,broken clouds,4918,2012-10-02,13:00:00


In [27]:
# Count rows that repeat an earlier row once the two free-text weather columns are
# ignored. The 'Unnamed: 0' column is the running row id written into the CSV (0, 1,
# 2, ... in the previews above); it differs on every row, so it has to be left out of
# the comparison as well - otherwise no two rows can ever compare equal and the count
# is always 0. errors='ignore' keeps the cell working if that column is absent.
dupdf = df.drop(
    columns=['weather_description', 'weather_type', 'Unnamed: 0'],
    errors='ignore',
)
print("Length of duplicated rows = ", int(dupdf.duplicated().sum()))

Length of duplicated rows =  0


In [28]:
# Group the first 20 weather descriptions by their weather type to compare granularity.
# Conclusion drawn from the output: weather_description should not be used as a model
# feature - it is a finer-grained variant of weather_type that adds classes but little value.
first_descriptions = df['weather_description'].iloc[:20]
list(first_descriptions.groupby(df['weather_type']))

[('Clear',
  5     sky is clear
  6     sky is clear
  7     sky is clear
  11    sky is clear
  12    sky is clear
  13    sky is clear
  14    sky is clear
  15    sky is clear
  16    sky is clear
  17    sky is clear
  18    sky is clear
  19    sky is clear
  Name: weather_description, dtype: object),
 ('Clouds',
  0     scattered clouds
  1        broken clouds
  2      overcast clouds
  3      overcast clouds
  4        broken clouds
  8           few clouds
  9           few clouds
  10          few clouds
  Name: weather_description, dtype: object)]

In [29]:
def apply_encoding(train_uniquelist,col):
    """Replace each value of ``col`` with its position in ``train_uniquelist``.

    Parameters
    ----------
    train_uniquelist : numpy.ndarray
        One-dimensional array of known category values. A value's code is the
        index of its first match in this array.
    col : pandas.Series
        Values to encode.

    Returns
    -------
    pandas.Series
        One integer position per element of ``col``.

    Raises
    ------
    IndexError
        If a value of ``col`` does not occur in ``train_uniquelist``.
    """
    def position_of(value):
        # Indices where the category array equals this value; the first one is the code.
        matches = np.nonzero(train_uniquelist == value)[0]
        return matches[0]

    encoded = col.apply(position_of)
    return pd.Series(encoded)
# Integer-encode weather_type: each category is replaced by its rank in
# order of first appearance in the training frame.
train_uniquelist = np.array(df['weather_type'].unique())
df['weather_type'] = apply_encoding(train_uniquelist, df['weather_type'])

In [30]:
# Categorising holiday as 1 (for True) and 0 (for false)
# A row counts as a holiday only when it carries an actual holiday name. Every spelling
# of "no holiday" is mapped to 0:
#   * the literal string 'None',
#   * NaN - depending on the pandas version, read_csv may parse 'None' as a missing
#     value, and NaN != 0 would otherwise turn every such row into a holiday,
#   * integer 0 - so re-running this cell on already-encoded data is a no-op.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on a column selection, which is chained assignment and
# does not update df under pandas Copy-on-Write.
holiday_col = df['is_holiday']
not_holiday = holiday_col.isna() | holiday_col.isin(['None', 0])
df['is_holiday'] = (~not_holiday).astype(int)

In [31]:
def _hour_of(clock):
    """Return the leading 'HH' field of a clock value's string form as an int."""
    return int(str(clock).split(':')[0])

# Reduce the clock time to its hour component.
df['time'] = df['time'].apply(_hour_of)

# Derive the weekday name from the date, then integer-encode it by order of first
# appearance in the frame (so the codes are not calendar order).
df['day'] = pd.to_datetime(df['date']).dt.day_name()
train_uniquelist = np.array(df['day'].unique())
df['day'] = apply_encoding(train_uniquelist, df['day'])

In [32]:
# Preview the frame after encoding is_holiday, weather_type, time and day.
df.head()

Unnamed: 0,date_time,is_holiday,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,weather_type,weather_description,traffic_volume,date,time,day
0,2012-10-02 09:00:00,0,121,89,2,329,1,1,288.28,0.0,0.0,40,0,scattered clouds,5545,2012-10-02,9,0
1,2012-10-02 10:00:00,0,178,67,3,330,1,1,289.36,0.0,0.0,75,0,broken clouds,4516,2012-10-02,10,0
2,2012-10-02 11:00:00,0,113,66,3,329,2,2,289.58,0.0,0.0,90,0,overcast clouds,4767,2012-10-02,11,0
3,2012-10-02 12:00:00,0,20,66,3,329,5,5,290.13,0.0,0.0,90,0,overcast clouds,5026,2012-10-02,12,0
4,2012-10-02 13:00:00,0,281,65,3,329,7,7,291.14,0.0,0.0,75,0,broken clouds,4918,2012-10-02,13,0


In [33]:
## Normalizing data
def normalize(da,cols):
    """Scale selected columns by their maximum and return the result as a new frame.

    Each column named in ``cols`` whose maximum is greater than 1 is divided by
    that maximum, so its largest value becomes 1. Columns whose maximum is 1 or
    less are left unchanged. Only the maximum is used (no minimum shift), so
    negative values stay negative.

    Parameters
    ----------
    da : pandas.DataFrame
        Frame holding the columns to scale. It is not modified.
    cols : iterable of str
        Names of the columns to scale.

    Returns
    -------
    pandas.DataFrame
        A copy of ``da`` with the selected columns scaled.
    """
    # Work on a copy so a frame that is a slice of another frame is never
    # written through to (and no SettingWithCopyWarning is triggered).
    scaled = da.copy()
    for name in cols:
        # Compute the maximum once per column; dividing a second time would be a
        # no-op anyway because the maximum is exactly 1 after the first division.
        peak = scaled[name].max()
        if peak > 1:
            scaled[name] = scaled[name] / peak
    return scaled

In [34]:
# Candidate feature columns, in the order the positional slice below relies on.
candidate_columns = [
    'day', 'time', 'is_holiday',
    'air_pollution_index', 'humidity', 'wind_speed', 'wind_direction',
    'visibility_in_miles', 'dew_point', 'temperature', 'rain_p_h', 'snow_p_h',
    'clouds_all',
    'weather_type',
]
# Columns actually fed to the model (humidity, wind_direction and dew_point are dropped).
model_columns = [
    'day', 'time', 'is_holiday', 'air_pollution_index', 'wind_speed',
    'visibility_in_miles', 'temperature', 'rain_p_h', 'snow_p_h', 'clouds_all',
    'weather_type',
]

data = df[candidate_columns]
# Positions 3..12 are air_pollution_index through clouds_all; day, time, is_holiday
# and weather_type are passed through unscaled.
data = normalize(data, data.columns[3:13])
data = pd.DataFrame(data[model_columns])
target = df[['traffic_volume']]

X = data
y = target

# Hold out 30% of the rows; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [39]:
# Only the last expression of a cell is rendered, so this first result is not shown.
x_train.shape
# Shape of a one-row slice: (1, number of feature columns).
x_train[:1].shape

(1, 11)

In [None]:
# Number of feature columns per sample, taken from the training frame.
n_features = x_train.shape[1]

model = Sequential()

# LSTM layers need 3-D input (batch, timesteps, features), but the training data is a
# 2-D table. Each row is therefore reshaped into a sequence of length one, so the
# model can be fitted on the 2-D frame directly.
model.add(tf.keras.Input(shape=(n_features,)))
model.add(tf.keras.layers.Reshape((1, n_features)))

# NOTE: on a GPU, TensorFlow only uses its fused cuDNN LSTM kernel with the default
# tanh activation; activation='relu' falls back to the generic implementation.
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(Dropout(0.2))

model.add(LSTM(128, activation='relu'))
model.add(Dropout(0.1))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))

# Linear output unit: the target is an unbounded traffic count, which a sigmoid
# (restricted to the range 0..1) could never reach.
model.add(Dense(1))

# `learning_rate` replaces the deprecated `lr` argument. The former `decay=1e-6`
# argument is dropped because current Keras optimizers no longer accept it.
opt = tf.keras.optimizers.Adam(learning_rate=0.001)

# Compile model
# Mean absolute error is reported instead of accuracy, which is a classification
# metric and carries no meaning for a regression target.
model.compile(
    loss='mean_squared_error',
    optimizer=opt,
    metrics=['mae'],
)

NameError: name 'Sequential' is not defined

In [None]:
# Train for three epochs, evaluating on the held-out split after each epoch.
model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=3,
)