## A simple random forest without fastai's library

In [2]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

In [3]:
PATH = "input"

In [4]:
dataset = pd.read_csv(f'{PATH}/train_revised.csv', low_memory=False)

In [7]:
rides = dataset['ride_id'].unique()#Get all the ride_ids

In [8]:
#Melt the datatable i.e create a column having the total numbers of tickets sold to a ride id
rows = []
for ride in rides:
    df_temp = dataset[dataset['ride_id'] == ride].copy()
    df_temp['number_of_tickets'] = len(df_temp)
    rows.append(df_temp[:1])
df_train_set = pd.concat(rows)

In [9]:
#Removing cols that i dont feel will positively contribute to the outcome
del df_train_set['seat_number']
del df_train_set['payment_method']
del df_train_set['payment_receipt']
del df_train_set['ride_id']
del df_train_set['travel_to']

In [10]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,17/10/2017,07:15,Migori,Bus,49,1
1,19/11/2017,07:12,Migori,Bus,49,1
2,26/11/2017,07:05,Keroka,Bus,49,1
3,27/11/2017,07:10,Homa Bay,Bus,49,5
4,27/11/2017,07:12,Migori,Bus,49,31


In [11]:
store = df_train_set.copy()#Saving this here so i dont have to re-fetch the data everytime

In [12]:
df_train_set = store.copy()

In [13]:
df_train_set["travel_date"] = pd.to_datetime(df_train_set["travel_date"],infer_datetime_format=True)
df_train_set["travel_date"] = df_train_set["travel_date"].dt.dayofweek #change the full date to day of week

In [14]:
#Taking care of categorical data
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories #This would be used when doing that of the test data
df_train_set["car_type"] = df_train_set.car_type.cat.codes

df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories #This would be used when doing that of the test data
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

In [15]:
#express travel time in minutes
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [16]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,1,435,9,0,49,1
1,6,432,9,0,49,1
2,6,425,4,0,49,1
3,0,430,1,0,49,5
4,0,432,9,0,49,31


# Random forest model

In [17]:
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

In [18]:
model = RandomForestRegressor(n_estimators=100, criterion="mae", n_jobs=-1)

In [19]:
model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [20]:
preds_train_set = model.predict(X)

In [21]:
print (mean_absolute_error(preds_train_set,y)) #Result gets me to 80th position as of 19th Dec. 2018

3.1128164506321014


# Predictions for test set

In [22]:
df_test_set = pd.read_csv(f'{PATH}/test_questions.csv', low_memory=False)

Let's first format the data as we did for the training set.

In [23]:
df_test_set.drop(['travel_to'], axis=1, inplace=True)

In [24]:
df_test_set["travel_date"] = pd.to_datetime(df_test_set["travel_date"],infer_datetime_format=True)
df_test_set["travel_date"] = df_test_set["travel_date"].dt.dayofweek

In [25]:
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

In [26]:
df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

In [27]:
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(lambda x: int(x[0]) * 60 + int(x[1]))

In [28]:
df_test_set.head()

Unnamed: 0,ride_id,travel_date,travel_time,travel_from,car_type,max_capacity
0,247,0,426,7,0,49
1,256,6,668,7,1,11
2,275,4,300,7,1,11
3,285,4,550,7,1,11
4,286,4,560,7,1,11


In [29]:
X.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity
0,1,435,9,0,49
1,6,432,9,0,49
2,6,425,4,0,49
3,0,430,1,0,49
4,0,432,9,0,49


Now let's calculate predictions using the random forest model we trained.

In [30]:
X_test = df_test_set.drop(['ride_id'], axis=1)
test_set_predictions = model.predict(X_test)

And finally let's create a csv file with predictions. 

In [31]:
d = {'ride_id': df_test_set["ride_id"], 'number_of_ticket': test_set_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id','number_of_ticket']]

In [32]:
df_predictions.head()

Unnamed: 0,ride_id,number_of_ticket
0,247,6.96
1,256,5.595
2,275,1.08
3,285,10.02
4,286,10.235


In [34]:
df_predictions.to_csv('preds_test_github.csv', index=False) #save to csv file