## Approach using fastai's library

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

  from numpy.core.umath_tests import inner1d


In [2]:
from fastai.imports import *
from fastai.structured import *

from IPython.display import display

In [3]:
PATH = "input"

In [4]:
dataset = pd.read_csv(f'{PATH}/train_revised.csv', low_memory=False)

In [5]:
rides = dataset['ride_id'].unique()#Get the list of ride_ids

In [6]:
#Melt the datatable i.e create a column having the total numbers of tickets for each ride
rows = []
for ride in rides:
    df_temp = dataset[dataset['ride_id'] == ride].copy()
    df_temp['number_of_tickets'] = len(df_temp)
    rows.append(df_temp[:1])
df_train_set = pd.concat(rows)

In [7]:
#Removing cols that i dont feel will positively contribute to the outcome
del df_train_set['seat_number']
del df_train_set['payment_method']
del df_train_set['payment_receipt']
del df_train_set['ride_id']
del df_train_set['travel_to']

In [8]:
df_train_set.head()

Unnamed: 0,travel_date,travel_time,travel_from,car_type,max_capacity,number_of_tickets
0,17/10/2017,07:15,Migori,Bus,49,1
1,19/11/2017,07:12,Migori,Bus,49,1
2,26/11/2017,07:05,Keroka,Bus,49,1
3,27/11/2017,07:10,Homa Bay,Bus,49,5
4,27/11/2017,07:12,Migori,Bus,49,31


In [9]:
store = df_train_set.copy()#Store this here to save us from re-fetching(Fast ai has a more permanent method for this)

In [10]:
df_train_set = store.copy()

In [11]:
#Using fastai's add_datepart method, we use the travel_date for feature engineering
add_datepart(df_train_set, 'travel_date')
df_train_set.head()

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_Year,travel_Month,travel_Week,travel_Day,travel_Dayofweek,travel_Dayofyear,travel_Is_month_end,travel_Is_month_start,travel_Is_quarter_end,travel_Is_quarter_start,travel_Is_year_end,travel_Is_year_start,travel_Elapsed
0,07:15,Migori,Bus,49,1,2017,10,42,17,1,290,False,False,False,False,False,False,1508198400
1,07:12,Migori,Bus,49,1,2017,11,46,19,6,323,False,False,False,False,False,False,1511049600
2,07:05,Keroka,Bus,49,1,2017,11,47,26,6,330,False,False,False,False,False,False,1511654400
3,07:10,Homa Bay,Bus,49,5,2017,11,48,27,0,331,False,False,False,False,False,False,1511740800
4,07:12,Migori,Bus,49,31,2017,11,48,27,0,331,False,False,False,False,False,False,1511740800


In [12]:
#Coverted the time to a categorical variable
def convert(x):
    if(int(x[0]) >= 0 and int(x[0]) < 12):
        return 'Morning'
    elif(int(x[0]) >= 12 and int(x[0]) < 6):
        return 'Afternoon'
    else:
        return 'Night'

In [13]:
df_train_set["travel_time"] = df_train_set["travel_time"].str.split(':').apply(convert)

In [14]:
#Taking care of categorical vars
df_train_set["car_type"] = pd.Categorical(df_train_set["car_type"])
car_type_categories = df_train_set.car_type.cat.categories
df_train_set["car_type"] = df_train_set.car_type.cat.codes

df_train_set["travel_from"] = pd.Categorical(df_train_set["travel_from"])
travel_from_categories = df_train_set.travel_from.cat.categories
df_train_set["travel_from"] = df_train_set.travel_from.cat.codes

df_train_set["travel_time"] = pd.Categorical(df_train_set["travel_time"])
travel_time_categories = df_train_set.travel_time.cat.categories
df_train_set["travel_time"] = df_train_set.travel_time.cat.codes

Using the feature engineered columns as-is made the randon forest model do poorly than when it was just a single colum representing the dates and that's weird, so i changed the Boolean values to Ones and Zeros

In [15]:
#Converting booleans to digits
df_train_set["travel_Is_month_end"] = pd.Categorical(df_train_set["travel_Is_month_end"])
travel_Is_month_end_categories = df_train_set.travel_Is_month_end.cat.categories
df_train_set["travel_Is_month_end"] = df_train_set.travel_Is_month_end.cat.codes

df_train_set["travel_Is_month_start"] = pd.Categorical(df_train_set["travel_Is_month_start"])
travel_Is_month_start_categories = df_train_set.travel_Is_month_start.cat.categories
df_train_set["travel_Is_month_start"] = df_train_set.travel_Is_month_start.cat.codes

df_train_set["travel_Is_quarter_end"] = pd.Categorical(df_train_set["travel_Is_quarter_end"])
travel_Is_quarter_end_categories = df_train_set.travel_Is_quarter_end.cat.categories
df_train_set["travel_Is_quarter_end"] = df_train_set.travel_Is_quarter_end.cat.codes

df_train_set["travel_Is_quarter_start"] = pd.Categorical(df_train_set["travel_Is_quarter_start"])
travel_Is_quarter_start_categories = df_train_set.travel_Is_quarter_start.cat.categories
df_train_set["travel_Is_quarter_start"] = df_train_set.travel_Is_quarter_start.cat.codes

df_train_set["travel_Is_year_end"] = pd.Categorical(df_train_set["travel_Is_year_end"])
travel_Is_year_end_categories = df_train_set.travel_Is_year_end.cat.categories
df_train_set["travel_Is_year_end"] = df_train_set.travel_Is_year_end.cat.codes

df_train_set["travel_Is_year_start"] = pd.Categorical(df_train_set["travel_Is_year_start"])
travel_Is_year_start_categories = df_train_set.travel_Is_year_start.cat.categories
df_train_set["travel_Is_year_start"] = df_train_set.travel_Is_year_start.cat.codes

In [16]:
df_train_set.head()

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,number_of_tickets,travel_Year,travel_Month,travel_Week,travel_Day,travel_Dayofweek,travel_Dayofyear,travel_Is_month_end,travel_Is_month_start,travel_Is_quarter_end,travel_Is_quarter_start,travel_Is_year_end,travel_Is_year_start,travel_Elapsed
0,0,9,0,49,1,2017,10,42,17,1,290,0,0,0,0,0,0,1508198400
1,0,9,0,49,1,2017,11,46,19,6,323,0,0,0,0,0,0,1511049600
2,0,4,0,49,1,2017,11,47,26,6,330,0,0,0,0,0,0,1511654400
3,0,1,0,49,5,2017,11,48,27,0,331,0,0,0,0,0,0,1511740800
4,0,9,0,49,31,2017,11,48,27,0,331,0,0,0,0,0,0,1511740800


# Random forest model

In [17]:
X = df_train_set.drop(["number_of_tickets"], axis=1)
y = df_train_set.number_of_tickets

In [18]:
model = RandomForestRegressor(n_estimators=100, criterion="mae", n_jobs=-1)

In [39]:
model.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [41]:
preds_train_set = model.predict(X)

In [24]:
print (mean_absolute_error(preds_train_set,y)) #Gave a zindi error score of 5.87

3.1838742198751797


In [42]:
# print (mean_absolute_error(preds_train_set,y))#I tried feature scaling and results were similar

3.179527124339894


# Predictions for test set

In [25]:
df_test_set = pd.read_csv(f'{PATH}/test_questions.csv', low_memory=False)

Let's first format the data as we did for the training set.

In [26]:
df_test_set.drop(['travel_to'], axis=1, inplace=True)

In [27]:
add_datepart(df_test_set, 'travel_date')
df_test_set.head()

Unnamed: 0,ride_id,travel_time,travel_from,car_type,max_capacity,travel_Year,travel_Month,travel_Week,travel_Day,travel_Dayofweek,travel_Dayofyear,travel_Is_month_end,travel_Is_month_start,travel_Is_quarter_end,travel_Is_quarter_start,travel_Is_year_end,travel_Is_year_start,travel_Elapsed
0,247,07:06,Kisii,Bus,49,2018,5,19,7,0,127,False,False,False,False,False,False,1525651200
1,256,11:08,Kisii,shuttle,11,2018,5,18,6,6,126,False,False,False,False,False,False,1525564800
2,275,05:00,Kisii,shuttle,11,2018,5,18,4,4,124,False,False,False,False,False,False,1525392000
3,285,09:10,Kisii,shuttle,11,2018,5,18,4,4,124,False,False,False,False,False,False,1525392000
4,286,09:20,Kisii,shuttle,11,2018,5,18,4,4,124,False,False,False,False,False,False,1525392000


In [28]:
df_test_set["travel_time"] = df_test_set["travel_time"].str.split(':').apply(convert)

In [29]:
#Converting date booleans to digits
df_test_set["travel_Is_month_end"] = pd.Categorical(df_test_set["travel_Is_month_end"], categories = travel_Is_month_end_categories)
df_test_set["travel_Is_month_end"] = df_test_set.travel_Is_month_end.cat.codes

df_test_set["travel_Is_month_start"] = pd.Categorical(df_test_set["travel_Is_month_start"], categories = travel_Is_month_start_categories)
df_test_set["travel_Is_month_start"] = df_test_set.travel_Is_month_start.cat.codes

df_test_set["travel_Is_quarter_end"] = pd.Categorical(df_test_set["travel_Is_quarter_end"], categories = travel_Is_quarter_end_categories)
df_test_set["travel_Is_quarter_end"] = df_test_set.travel_Is_quarter_end.cat.codes

df_test_set["travel_Is_quarter_start"] = pd.Categorical(df_test_set["travel_Is_quarter_start"], categories = travel_Is_quarter_start_categories)
df_test_set["travel_Is_quarter_start"] = df_test_set.travel_Is_quarter_start.cat.codes

df_test_set["travel_Is_year_end"] = pd.Categorical(df_test_set["travel_Is_year_end"], categories = travel_Is_year_end_categories)
df_test_set["travel_Is_year_end"] = df_test_set.travel_Is_year_end.cat.codes

df_test_set["travel_Is_year_start"] = pd.Categorical(df_test_set["travel_Is_year_start"], categories = travel_Is_year_start_categories)
df_test_set["travel_Is_year_start"] = df_test_set.travel_Is_year_start.cat.codes

In [30]:
df_test_set["car_type"] = pd.Categorical(df_test_set["car_type"], categories=car_type_categories)
df_test_set["car_type"] = df_test_set.car_type.cat.codes

df_test_set["travel_from"] = pd.Categorical(df_test_set["travel_from"], categories=travel_from_categories)
df_test_set["travel_from"] = df_test_set.travel_from.cat.codes

df_test_set["travel_time"] = pd.Categorical(df_test_set["travel_time"], categories=travel_time_categories)
df_test_set["travel_time"] = df_test_set.travel_time.cat.codes

In [31]:
df_test_set.head()

Unnamed: 0,ride_id,travel_time,travel_from,car_type,max_capacity,travel_Year,travel_Month,travel_Week,travel_Day,travel_Dayofweek,travel_Dayofyear,travel_Is_month_end,travel_Is_month_start,travel_Is_quarter_end,travel_Is_quarter_start,travel_Is_year_end,travel_Is_year_start,travel_Elapsed
0,247,0,7,0,49,2018,5,19,7,0,127,0,0,0,0,0,0,1525651200
1,256,0,7,1,11,2018,5,18,6,6,126,0,0,0,0,0,0,1525564800
2,275,0,7,1,11,2018,5,18,4,4,124,0,0,0,0,0,0,1525392000
3,285,0,7,1,11,2018,5,18,4,4,124,0,0,0,0,0,0,1525392000
4,286,0,7,1,11,2018,5,18,4,4,124,0,0,0,0,0,0,1525392000


In [32]:
X.head()

Unnamed: 0,travel_time,travel_from,car_type,max_capacity,travel_Year,travel_Month,travel_Week,travel_Day,travel_Dayofweek,travel_Dayofyear,travel_Is_month_end,travel_Is_month_start,travel_Is_quarter_end,travel_Is_quarter_start,travel_Is_year_end,travel_Is_year_start,travel_Elapsed
0,0,9,0,49,2017,10,42,17,1,290,0,0,0,0,0,0,1508198400
1,0,9,0,49,2017,11,46,19,6,323,0,0,0,0,0,0,1511049600
2,0,4,0,49,2017,11,47,26,6,330,0,0,0,0,0,0,1511654400
3,0,1,0,49,2017,11,48,27,0,331,0,0,0,0,0,0,1511740800
4,0,9,0,49,2017,11,48,27,0,331,0,0,0,0,0,0,1511740800


Now let's calculate predictions using the random forest model we trained.

In [33]:
X_test = df_test_set.drop(['ride_id'], axis=1)
test_set_predictions = model.predict(X_test)

And finally let's create a csv file with predictions. 

In [34]:
d = {'ride_id': df_test_set["ride_id"], 'number_of_ticket': test_set_predictions}
df_predictions = pd.DataFrame(data=d)
df_predictions = df_predictions[['ride_id','number_of_ticket']]

In [35]:
df_predictions.head()

Unnamed: 0,ride_id,number_of_ticket
0,247,1.65
1,256,2.435
2,275,1.53
3,285,1.53
4,286,1.53


In [37]:
df_predictions.to_csv('preds_fastai_github.csv', index=False) #save to csv file