In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor


In [31]:
train= pd.read_csv('train.csv')

In [32]:
train.columns

Index(['id', 'traffic_volume', 'holiday', 'temp', 'rain_1h', 'snow_1h',
       'clouds_all', 'weather_main', 'weather_description', 'date_time'],
      dtype='object')

In [33]:
train.describe()

Unnamed: 0,id,traffic_volume,temp,rain_1h,snow_1h,clouds_all
count,28923.0,28923.0,28923.0,28923.0,28923.0,28923.0
mean,14461.0,3491.231887,325.889376,4.649362,6.501603,49.969747
std,8349.495254,2149.526346,15.869806,58.503935,4.065676,38.606295
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,7230.5,1251.062668,315.834646,1.516596,3.2625,1.0
50%,14461.0,3589.865304,327.221298,4.044256,6.525001,64.0
75%,21691.5,5314.72551,338.13254,7.077448,9.787501,90.0
max,28922.0,7826.553185,357.415637,9940.074027,13.050001,100.0


In [34]:
train.isna().sum()

id                         0
traffic_volume             0
holiday                28886
temp                       0
rain_1h                    0
snow_1h                    0
clouds_all                 0
weather_main               0
weather_description        0
date_time                  0
dtype: int64

In [35]:
train.drop('holiday',axis=1)

Unnamed: 0,id,traffic_volume,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time
0,0,5977.718652,334.271282,7.077448,2.175000,40,Clouds,scattered clouds,02-10-2012 09:00
1,1,4868.417932,335.523582,5.055320,7.612501,75,Clouds,broken clouds,02-10-2012 10:00
2,2,5139.005377,335.778680,8.088512,3.262500,90,Clouds,overcast clouds,02-10-2012 11:00
3,3,5418.217122,336.416425,6.066384,8.700001,90,Clouds,overcast clouds,02-10-2012 12:00
4,4,5301.789059,337.587557,5.055320,13.050001,75,Clouds,broken clouds,02-10-2012 13:00
...,...,...,...,...,...,...,...,...,...
28918,28918,1094.208193,307.520767,8.088512,4.350000,90,Haze,haze,08-12-2016 04:00
28919,28919,3056.236678,307.833842,2.022128,0.000000,75,Snow,light snow,08-12-2016 05:00
28920,28920,5961.548087,307.996177,6.066384,4.350000,1,Snow,light snow,08-12-2016 06:00
28921,28921,6594.356175,308.297656,3.033192,4.350000,1,Mist,mist,08-12-2016 07:00


In [36]:
label_encoder = LabelEncoder()
train['weather_main_encoded'] = label_encoder.fit_transform(train['weather_main'])

In [37]:
train['date_time'] = pd.to_datetime(train['date_time'], format='%d-%m-%Y %H:%M')

In [38]:
train['hour'] = train['date_time'].dt.hour
train

Unnamed: 0,id,traffic_volume,holiday,temp,rain_1h,snow_1h,clouds_all,weather_main,weather_description,date_time,weather_main_encoded,hour
0,0,5977.718652,,334.271282,7.077448,2.175000,40,Clouds,scattered clouds,2012-10-02 09:00:00,1,9
1,1,4868.417932,,335.523582,5.055320,7.612501,75,Clouds,broken clouds,2012-10-02 10:00:00,1,10
2,2,5139.005377,,335.778680,8.088512,3.262500,90,Clouds,overcast clouds,2012-10-02 11:00:00,1,11
3,3,5418.217122,,336.416425,6.066384,8.700001,90,Clouds,overcast clouds,2012-10-02 12:00:00,1,12
4,4,5301.789059,,337.587557,5.055320,13.050001,75,Clouds,broken clouds,2012-10-02 13:00:00,1,13
...,...,...,...,...,...,...,...,...,...,...,...,...
28918,28918,1094.208193,,307.520767,8.088512,4.350000,90,Haze,haze,2016-12-08 04:00:00,4,4
28919,28919,3056.236678,,307.833842,2.022128,0.000000,75,Snow,light snow,2016-12-08 05:00:00,8,5
28920,28920,5961.548087,,307.996177,6.066384,4.350000,1,Snow,light snow,2016-12-08 06:00:00,8,6
28921,28921,6594.356175,,308.297656,3.033192,4.350000,1,Mist,mist,2016-12-08 07:00:00,5,7


In [39]:
X=train[['temp','rain_1h','snow_1h','weather_main_encoded','hour']]
y=train['traffic_volume']

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
rf=RandomForestRegressor(n_estimators=2000,max_depth=5)
rf.fit(X_train,y_train)

In [41]:
y_pred=rf.predict(X_valid)
y_pred

array([ 551.10743237, 4897.26372859, 4897.93643176, ..., 2399.19900056,
       2802.50585843, 2297.76151154])

In [42]:
mse=mean_squared_error(y_valid, y_pred,squared=False)
print("Mean Squared Error:", mse)

Mean Squared Error: 1019.4055268601657




In [43]:
test=pd.read_csv('test.csv')

In [44]:
label_encoder=LabelEncoder()
test['weather_main_encoded']=label_encoder.fit_transform(test['weather_main'])

test['date_time']=pd.to_datetime(test['date_time'], format='%d-%m-%Y %H:%M')

test['hour']=test['date_time'].dt.hour


In [45]:
X_test= test[['temp','rain_1h','snow_1h','weather_main_encoded','hour']]
predictions= rf.predict(X_test)

In [46]:
predictions_df = pd.DataFrame({'id':test['id'],'traffic_volume': predictions})
predictions_df.to_csv('random_forest_predictions.csv', index=False)