In [1]:
import pandas as pd
import numpy as np

In [2]:
train_data = pd.read_csv("Train.csv", na_values=("NA", "", " "))
test_data = pd.read_csv("Test.csv", na_values=("NA", "", " "))

In [3]:
print(train_data.shape)
print(test_data.shape)

(33750, 15)
(14454, 14)


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33750 entries, 0 to 33749
Data columns (total 15 columns):
date_time              33750 non-null object
is_holiday             33750 non-null object
air_pollution_index    33750 non-null int64
humidity               33750 non-null int64
wind_speed             33750 non-null int64
wind_direction         33750 non-null int64
visibility_in_miles    33750 non-null int64
dew_point              33750 non-null int64
temperature            33750 non-null float64
rain_p_h               33750 non-null float64
snow_p_h               33750 non-null float64
clouds_all             33750 non-null int64
weather_type           33750 non-null object
weather_description    33750 non-null object
traffic_volume         33750 non-null int64
dtypes: float64(3), int64(8), object(4)
memory usage: 3.9+ MB


In [5]:
train_data.describe()

Unnamed: 0,air_pollution_index,humidity,wind_speed,wind_direction,visibility_in_miles,dew_point,temperature,rain_p_h,snow_p_h,clouds_all,traffic_volume
count,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0,33750.0
mean,154.841422,71.209007,3.378193,199.471852,4.989748,4.989748,280.069587,0.448739,0.000318,50.458785,3240.118163
std,83.735515,16.852248,2.055792,99.841088,2.570021,2.570021,13.415256,53.5265,0.00976,38.871734,1991.487289
min,10.0,13.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,83.0,60.0,2.0,130.0,3.0,3.0,271.72,0.0,0.0,1.0,1165.25
50%,155.0,72.0,3.0,200.0,5.0,5.0,280.15,0.0,0.0,64.0,3335.0
75%,228.0,85.0,5.0,290.0,7.0,7.0,290.62,0.0,0.0,90.0,4926.0
max,299.0,100.0,16.0,360.0,9.0,9.0,308.24,9831.3,0.51,100.0,7280.0


In [6]:
week = test_data["date_time"]

In [7]:
# max(train_data["rain_p_h"])  ## this is an outlier

In [8]:
# train_data.loc[train_data["rain_p_h"] == 9831.3]
# train_data.loc[train_data["rain_p_h"] == 9831.3].index

In [9]:
## Drop a row
# train_data = train_data.drop(train_data.index[24872])
# train_data.shape

In [10]:
train_data["date_time"] = pd.to_datetime(train_data["date_time"], format = "%Y%m%d %H:%M:%S")
train_data["date_time"] = train_data["date_time"].dt.day_name()

test_data["date_time"] = pd.to_datetime(test_data["date_time"], format = "%Y%m%d %H:%M:%S")
test_data["date_time"] = test_data["date_time"].dt.day_name()

In [11]:
train_data["date_time"] = train_data["date_time"].astype("category")
train_data["is_holiday"] = train_data["is_holiday"].astype("category")
train_data["weather_type"] = train_data["weather_type"].astype("category")
train_data["weather_description"] = train_data["weather_description"].astype("category")

test_data["date_time"] = test_data["date_time"].astype("category")
test_data["is_holiday"] = test_data["is_holiday"].astype("category")
test_data["weather_type"] = test_data["weather_type"].astype("category")
test_data["weather_description"] = test_data["weather_description"].astype("category")

In [12]:
train_data.isnull().sum()
test_data.isnull().sum()

date_time              0
is_holiday             0
air_pollution_index    0
humidity               0
wind_speed             0
wind_direction         0
visibility_in_miles    0
dew_point              0
temperature            0
rain_p_h               0
snow_p_h               0
clouds_all             0
weather_type           0
weather_description    0
dtype: int64

In [13]:
## Splitting numerical and categorical variables
train_data_num = train_data.iloc[:, 2:12]
test_data_num = test_data.iloc[:, 2:12]

In [14]:
## Encoding categorical data
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [15]:
cat_var = ["date_time", "is_holiday", "weather_type", "weather_description"]
train_data_cat = pd.DataFrame()
test_data_cat = pd.DataFrame()
for col in cat_var:
    train_data_cat[col] = le.fit_transform(train_data[col])
    test_data_cat[col] = le.transform(test_data[col])

In [16]:
print(train_data.shape, train_data_cat.shape)
print(test_data.shape, test_data_cat.shape)

(33750, 15) (33750, 4)
(14454, 14) (14454, 4)


In [17]:
for i in ["date_time", "is_holiday", "weather_type", "weather_description"]:
    print(np.unique(train_data[i]),"\n", np.unique(train_data_cat[i]))

['Friday' 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday' 'Wednesday'] 
 [0 1 2 3 4 5 6]
['Christmas Day' 'Columbus Day' 'Independence Day' 'Labor Day'
 'Martin Luther King Jr Day' 'Memorial Day' 'New Years Day' 'None'
 'State Fair' 'Thanksgiving Day' 'Veterans Day' 'Washingtons Birthday'] 
 [ 0  1  2  3  4  5  6  7  8  9 10 11]
['Clear' 'Clouds' 'Drizzle' 'Fog' 'Haze' 'Mist' 'Rain' 'Smoke' 'Snow'
 'Squall' 'Thunderstorm'] 
 [ 0  1  2  3  4  5  6  7  8  9 10]
['SQUALLS' 'Sky is Clear' 'broken clouds' 'drizzle' 'few clouds' 'fog'
 'freezing rain' 'haze' 'heavy intensity drizzle' 'heavy intensity rain'
 'heavy snow' 'light intensity drizzle' 'light intensity shower rain'
 'light rain' 'light rain and snow' 'light shower snow' 'light snow'
 'mist' 'moderate rain' 'overcast clouds' 'proximity shower rain'
 'proximity thunderstorm' 'proximity thunderstorm with drizzle'
 'proximity thunderstorm with rain' 'scattered clouds' 'shower drizzle'
 'shower snow' 'sky is clear' 'sleet' 'smoke' 'sn

In [18]:
for i in ["date_time", "is_holiday", "weather_type", "weather_description"]:
    print(np.unique(test_data[i]),"\n", np.unique(test_data_cat[i]))

['Friday' 'Monday' 'Saturday' 'Sunday' 'Thursday' 'Tuesday' 'Wednesday'] 
 [0 1 2 3 4 5 6]
['Christmas Day' 'Columbus Day' 'Independence Day' 'Labor Day'
 'Martin Luther King Jr Day' 'Memorial Day' 'New Years Day' 'None'
 'State Fair' 'Thanksgiving Day' 'Veterans Day' 'Washingtons Birthday'] 
 [ 0  1  2  3  4  5  6  7  8  9 10 11]
['Clear' 'Clouds' 'Drizzle' 'Fog' 'Haze' 'Mist' 'Rain' 'Smoke' 'Snow'
 'Thunderstorm'] 
 [ 0  1  2  3  4  5  6  7  8 10]
['Sky is Clear' 'broken clouds' 'drizzle' 'few clouds' 'fog' 'haze'
 'heavy intensity drizzle' 'heavy intensity rain' 'heavy snow'
 'light intensity drizzle' 'light intensity shower rain' 'light rain'
 'light shower snow' 'light snow' 'mist' 'moderate rain' 'overcast clouds'
 'proximity shower rain' 'proximity thunderstorm'
 'proximity thunderstorm with drizzle' 'proximity thunderstorm with rain'
 'scattered clouds' 'shower drizzle' 'sky is clear' 'sleet' 'smoke' 'snow'
 'thunderstorm' 'thunderstorm with heavy rain'
 'thunderstorm with ligh

In [19]:
print(train_data_num.shape)
print(train_data_cat.shape)

(33750, 10)
(33750, 4)


In [20]:
## Add numerical and categorical data
train = pd.concat([train_data_num, train_data_cat], axis = 1)
test = pd.concat([test_data_num, test_data_cat], axis = 1)

In [21]:
print(train.shape)
print(test.shape)

(33750, 14)
(14454, 14)


In [22]:
print(max(train["rain_p_h"]))
print(train.loc[train["rain_p_h"] == 9831.3].index)
# ## Drop that row in train data
train = train.drop(train.index[24872])


9831.3
Int64Index([24872], dtype='int64')


In [23]:
X = train
y = train_data["traffic_volume"]

In [24]:
# ## Drop the same index 24872 in target also
y = y.drop(y.index[24872])
y = y.values

In [25]:
print(X.shape)
print(y.shape)

(33749, 14)
(33749,)


In [26]:
## Fitting Random Forest regression to the dataset
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestRegressor(n_estimators= 1000, random_state = 0)
regressor.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [27]:
y_pred = regressor.predict(X)
# print(y_pred)    ## predicted salaries
# print(y)

In [28]:
predict = regressor.predict(test)
predict = predict.astype('int32')

In [29]:
final = pd.DataFrame({"date_time":week, 'traffic_volume':predict})

In [30]:
final.to_csv("submission_7.csv", index = False)