# Zindi UmojaHack Africa 2021 #2: Sendy - Delivery Rider Response Challenge (INTERMEDIATE) by UmojaHack Africa
This notebook was written in Amazon SageMaker. It stacks several base models to produce out-of-fold meta features and uses CatBoost as the final predictor model.

In [2]:
#import libraries
!pip install catboost xgboost heamy lightgbm tqdm
import math
import numpy as np
import pandas as pd
import logging
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import KFold
from sklearn.naive_bayes import GaussianNB, MultinomialNB,  BernoulliNB
from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import normalize
from sklearn import svm
from sklearn.model_selection import GridSearchCV
#xboost and heamy
import xgboost as xgb
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from heamy.dataset import Dataset
from heamy.estimator import Classifier
from heamy.pipeline import ModelsPipeline
#Keras modules
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical
from keras.datasets import mnist
from keras.utils.vis_utils import model_to_dot
from IPython.display import SVG
from keras.utils import np_utils

Collecting catboost
  Downloading catboost-0.25-cp36-none-manylinux1_x86_64.whl (67.3 MB)
[K     |████████████████████████████████| 67.3 MB 37.8 MB/s eta 0:00:01    |████                            | 8.5 MB 37.8 MB/s eta 0:00:02
[?25hCollecting xgboost
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[K     |████████████████████████████████| 157.5 MB 81 kB/s /s eta 0:00:01
[?25hCollecting heamy
  Downloading heamy-0.0.7.tar.gz (30 kB)
Collecting lightgbm
  Downloading lightgbm-3.2.0-py3-none-manylinux1_x86_64.whl (2.0 MB)
[K     |████████████████████████████████| 2.0 MB 113.8 MB/s eta 0:00:01
[?25hCollecting tqdm
  Downloading tqdm-4.59.0-py2.py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 4.7 MB/s s eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.16-py2.py3-none-any.whl (19 kB)
Building wheels for collected packages: heamy
  Building wheel for heamy (setup.py) ... [?25ldone
[?25h  Created wheel for heamy: filename=heam

Using TensorFlow backend.


# Importing datasets

In [3]:
# Read the raw competition files (paths are relative to the working directory)
train, test, riders = (
    pd.read_csv(csv_name) for csv_name in ('Train.csv', 'Test.csv', 'Riders.csv')
)
SUBMISSION_FILE = pd.read_csv('SampleSubmission.csv')

# Left-join the rider table onto the train and test sets; the join key is
# 'rider_id' on the order side and 'Rider ID' on the rider side
rider_join_kwargs = {'how': 'left', 'left_on': 'rider_id', 'right_on': 'Rider ID'}
TRAIN_FILE = train.merge(riders, **rider_join_kwargs)
TEST_FILE = test.merge(riders, **rider_join_kwargs)


In [4]:
#Plot per-class density curves
def plot_distplots(TRAIN_FILE, colum_num_from=1, colum_num_to=11):
    """Draw one density curve per target class for each selected feature.

    Parameters
    ----------
    TRAIN_FILE : pandas.DataFrame
        Frame holding a ``target`` column and the feature columns to plot.
        Curves are drawn for the target values 0, 1 and 2.
    colum_num_from, colum_num_to : int
        Positional bounds of the slice ``columns[colum_num_from:colum_num_to]``
        selecting the features to plot.

    Returns
    -------
    None
        The figure is drawn through matplotlib; nothing is returned.
    """
    train = TRAIN_FILE
    features = list(train.columns[colum_num_from:colum_num_to])
    if not features:
        # Nothing selected - avoid creating an empty figure
        return

    # Size the grid from the number of features instead of a fixed 5x5 grid,
    # which raised as soon as more than 25 columns were requested.
    n_cols = 5
    n_rows = math.ceil(len(features) / n_cols)
    fig = plt.figure(figsize=(4 * n_cols, 4 * n_rows))

    for position, feature in enumerate(features, start=1):
        ax = fig.add_subplot(n_rows, n_cols, position)
        for target_class in (0, 1, 2):
            values = train.loc[train.target == target_class, feature].dropna()
            if values.empty:
                # A class with no observations has no density to estimate
                continue
            # kdeplot is the replacement for the deprecated distplot(hist=False)
            sns.kdeplot(values, label=str(target_class), ax=ax)
        ax.set_xlabel(feature)
        # The labels are only visible once a legend is requested explicitly
        ax.legend(title='target')


In [5]:
#plot_distplots(TRAIN_FILE)

In [6]:
# Notebook-wide configuration
CACHE = False   # caching switch for estimators that reference it
NFOLDS = 5      # number of folds used when stacking
SEED = 1337     # seed shared across the notebook

# Column names of the row identifier and of the label
ID, TARGET = 'ID', 'target'

# Seed the environment - for reproducibility
np.random.seed(SEED)
# Only surface warnings and above from the logging module
logging.basicConfig(level=logging.WARNING)

# Feature Engineering

In [7]:
from math import sin, cos, sqrt, atan2, radians

#Haversine function for calculating great-circle distance
def calc_dist(lat1, lon1, lat2, lon2):
    """Return the haversine distance in kilometres between two points.

    All four arguments are scalar coordinates expressed in decimal degrees:
    (lat1, lon1) is the first point and (lat2, lon2) the second.
    """
    # approximate radius of earth in km
    earth_radius_km = 6373.0

    # Convert every coordinate from degrees to radians
    phi1, lam1, phi2, lam2 = (radians(deg) for deg in (lat1, lon1, lat2, lon2))

    half_dlat = (phi2 - phi1) / 2
    half_dlon = (lam2 - lam1) / 2

    # Haversine term followed by the central angle between the two points
    hav = sin(half_dlat)**2 + cos(phi1) * cos(phi2) * sin(half_dlon)**2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    return earth_radius_km * central_angle

In [9]:
from datetime import datetime
from tqdm import tqdm

# Generate features
def _haversine_km(lat1, lon1, lat2, lon2, radius_km=6373.0):
    """Vectorised haversine distance in km between coordinate arrays given in degrees."""
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(values, dtype=float))
        for values in (lat1, lon1, lat2, lon2)
    )
    a = (np.sin((lat2 - lat1) / 2) ** 2
         + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2)
    # Rounding can push `a` marginally outside [0, 1]; clip so sqrt stays defined
    a = np.clip(a, 0.0, 1.0)
    return radius_km * 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))


def add_feats(df):
    """Return a new frame with the engineered features added.

    The input frame is not modified. Steps performed:

    * one-hot encode ``client_type`` and ``vendor_type``;
    * replace ``dispatch_time`` (a colon-separated time string) by its first
      field, the hour, as an integer;
    * add ``rider_dist`` (rider position to pickup) and ``pickup_dist``
      (pickup to drop-off), both haversine distances in km.

    All operations are column-wise, so the result does not depend on the
    frame's index labels. The previous row-by-row implementation addressed
    rows by the labels 0..n-1 and failed on any other index.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``client_type``, ``vendor_type``, ``dispatch_time``,
        ``rider_lat``, ``rider_long``, ``pickup_lat``, ``pickup_long``,
        ``drop_off_lat`` and ``drop_off_long``.

    Returns
    -------
    pandas.DataFrame
    """
    # One hot encoding (get_dummies returns a new frame)
    df = pd.get_dummies(df, columns=['client_type', 'vendor_type'])

    # Convert timestamp to hour of day
    df['dispatch_time'] = df['dispatch_time'].str.split(':').str[0].astype(int)

    # Calculate distances; same radius as calc_dist so both agree
    df['rider_dist'] = _haversine_km(
        df['rider_lat'], df['rider_long'], df['pickup_lat'], df['pickup_long'])
    df['pickup_dist'] = _haversine_km(
        df['drop_off_lat'], df['drop_off_long'], df['pickup_lat'], df['pickup_long'])

    return df
        

In [10]:
#Preprocessor for composing heamy dataset
def load_and_process_dataset():
    """Build the feature matrices and label vector for the heamy ``Dataset``.

    Reads the module-level ``TRAIN_FILE`` and ``TEST_FILE`` frames without
    modifying them, so the function can be called more than once. The earlier
    in-place ``drop`` removed the target column from the global frame and made
    any second call fail.

    Returns
    -------
    dict
        ``{'X_train': ndarray, 'X_test': ndarray, 'y_train': ndarray}``
    """
    #Flatten target to y_train
    y_train = TRAIN_FILE[TARGET].to_numpy().ravel()

    classes = TRAIN_FILE[TARGET].unique()
    num_classes = len(classes)
    print("There are {0} classes: {1} ".format(num_classes, classes))

    #Drop target and ID columns - not needed here. drop() returns new frames,
    #leaving the global inputs untouched.
    train = TRAIN_FILE.drop(columns=[ID, TARGET])
    test = TEST_FILE.drop(columns=[ID])

    #Add features to datasets
    train = add_feats(train)
    test = add_feats(test)

    #One-hot encoding can produce different columns for train and test when a
    #category appears in only one of them; align test to the train layout so
    #both matrices have the same width and column order.
    if not test.columns.equals(train.columns):
        test = test.reindex(columns=train.columns, fill_value=0)

    #Normalize num columns
    #NOTE(review): sklearn's normalize() defaults to axis=1, i.e. it rescales
    #each row across these three columns to unit L2 norm rather than scaling
    #each column - confirm this is the intended transformation.
    cols_to_normalize = ['Average Partner Rating', 'Active Rider Age', 'rider_amount']
    train[cols_to_normalize] = normalize(train[cols_to_normalize])
    test[cols_to_normalize] = normalize(test[cols_to_normalize])

    x_train = train.values
    x_test = test.values

    return {'X_train': x_train, 'X_test': x_test, 'y_train': y_train}

# Create composed Dataset

In [11]:
# Wrap the preprocessing function in a heamy Dataset for use in the pipeline
dataset = Dataset(
    use_cache=True,
    preprocessor=load_and_process_dataset,
)

# Model building

In [45]:
#Multiple Models for Meta Features Estimator
#Every base model shares the same heamy dataset and follows the notebook-wide
#CACHE switch, as the XGBoost base model does.

#RandomForest1 (entropy split criterion)
rf_params = {'n_estimators': 200, 'criterion': 'entropy', 'random_state': 0}
rf = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                use_cache=CACHE, parameters=rf_params, name='rf')

#RF2 (gini split criterion)
rf1_params = {'n_estimators': 200, 'criterion': 'gini', 'random_state': 0}
rf1 = Classifier(dataset=dataset, estimator=RandomForestClassifier,
                 use_cache=CACHE, parameters=rf1_params, name='rf1')

#ExtraTreesClassifier__1 (entropy split criterion)
et_params = {'n_estimators': 200, 'criterion': 'entropy', 'random_state': 0}
et = Classifier(dataset=dataset, estimator=ExtraTreesClassifier,
                use_cache=CACHE, parameters=et_params, name='et')

#ExtraTreesClassifier__2 (gini split criterion)
et1_params = {'n_estimators': 200, 'criterion': 'gini', 'random_state': 0}
et1 = Classifier(dataset=dataset, estimator=ExtraTreesClassifier,
                 use_cache=CACHE, parameters=et1_params, name='et1')

#LGBMClassifier__Base
lgb_params = {'n_estimators': 200, 'learning_rate': 0.1}
lgbc = Classifier(dataset=dataset, estimator=LGBMClassifier,
                  use_cache=CACHE, parameters=lgb_params, name='lgbc')

#CatBoost__Base - a base model, not the final stacker. It is named like its
#siblings so that it is identified consistently.
cb_params = {'random_state': SEED, 'iterations': 200}
cb = Classifier(dataset=dataset, estimator=CatBoostClassifier,
                use_cache=CACHE, parameters=cb_params, name='cb')

#LogisticRegression__Base
logr_params = {'solver': 'liblinear', 'multi_class': 'ovr', 'C': 1, 'random_state': 0}
logr = Classifier(dataset=dataset, estimator=LogisticRegression,
                  use_cache=CACHE, parameters=logr_params, name='logr')

#NaiveBayes__Base
gnb = Classifier(dataset=dataset, estimator=GaussianNB, use_cache=CACHE, name='gnb')

In [46]:
#XGBoost__Base
def xgb_classifier(X_train, y_train, X_test, y_test=None):
    """Train an XGBoost booster on the training fold and predict X_test.

    Parameters
    ----------
    X_train, y_train
        Training features and labels; the objective is configured for 3 classes.
    X_test
        Features to predict.
    y_test
        Unused; kept so the signature stays unchanged for callers.

    Returns
    -------
    The output of ``Booster.predict`` for ``X_test`` under the
    ``multi:softprob`` objective (per-class probabilities).
    """
    # The number of boosting rounds is an argument of xgb.train, not a booster
    # parameter, so it is kept out of the params dict.
    num_boost_round = 200
    xg_params = {'seed': 0,
                 'colsample_bytree': 0.7,
                 # 'verbosity': 0 replaces the deprecated 'silent': 1
                 'verbosity': 0,
                 'subsample': 0.8,
                 'learning_rate': 0.03,
                 'objective': 'multi:softprob',
                 'num_class': 3,
                 'max_depth': 7,
                 'min_child_weight': 1,
                 'eval_metric': 'mlogloss'}

    dtrain = xgb.DMatrix(X_train, label=y_train)
    model = xgb.train(xg_params, dtrain, num_boost_round=num_boost_round)
    return model.predict(xgb.DMatrix(X_test))

xgb_first = Classifier(estimator=xgb_classifier, dataset=dataset, use_cache=CACHE, name='xgb_classifier')

# Generate Meta Features from Model Pipeline

In [None]:
# Base models whose predictions are stacked into the meta-feature dataset
base_models = [rf, et, et1, lgbc, logr, cb, gnb, xgb_first]
pipeline = ModelsPipeline(*base_models)
# Stack with NFOLDS folds and the shared seed
stack_ds = pipeline.stack(seed=SEED, k=NFOLDS)

# Preview Meta Features

In [28]:
# Show the first five rows of the stacked training features
stack_ds.X_train.head(5)

Unnamed: 0,rf_0,rf_1,rf_2,et_0,et_1,et_2,et1_0,et1_1,et1_2,lgbc_0,...,lgbc_2,logr_0,logr_1,logr_2,gnb_0,gnb_1,gnb_2,xgb_classifier_0,xgb_classifier_1,xgb_classifier_2
0,0.3625,0.1325,0.505,0.475,0.065,0.46,0.5,0.115,0.385,0.315337,...,0.526645,0.46916,0.350343,0.180497,0.56557,0.297674,0.136756,0.386387,0.147178,0.466435
1,0.38,0.155,0.465,0.465,0.175,0.36,0.515,0.15,0.335,0.454191,...,0.430314,0.490457,0.306092,0.203451,0.540434,0.333575,0.125991,0.489247,0.112208,0.398545
2,0.543333,0.361667,0.095,0.66,0.24,0.1,0.6525,0.2375,0.11,0.316149,...,0.111471,0.45504,0.335491,0.209469,0.539537,0.308877,0.151586,0.365196,0.466369,0.168435
3,0.468333,0.376667,0.155,0.325,0.54,0.135,0.3875,0.4825,0.13,0.316979,...,0.266897,0.568211,0.17673,0.25506,0.521407,0.187813,0.29078,0.392689,0.362347,0.244964
4,0.535833,0.239167,0.225,0.725,0.135,0.14,0.64,0.215,0.145,0.658739,...,0.168581,0.617371,0.202754,0.179876,0.552062,0.226541,0.221398,0.596398,0.231854,0.171748


In [29]:
# Report the dimensions of the stacked features and of the labels
print(
    "Shape of out-of-fold predictions:",
    "X shape: ", stack_ds.X_train.shape,
    "y shape: ", stack_ds.y_train.shape,
)

Shape of out-of-fold predictions: X shape:  (179867, 21) y shape:  (179867,)


# Prepare Data for Catboost Predictor Model

In [30]:
# Stacked features as plain arrays, plus the matching labels
X_train_outfold, X_test_outfold = stack_ds.X_train.values, stack_ds.X_test.values
y_train_sv = stack_ds.y_train
# Short aliases for the training features and labels
X, y = X_train_outfold, y_train_sv

# Final Model Building
The final model is trained on the out-of-fold predictions (meta features) generated above.

In [43]:
#CatBoost__Final
#The estimator class is passed directly rather than being assigned to `cb`:
#`cb` already names the base-level CatBoost model handed to ModelsPipeline, and
#rebinding it to the class broke any re-run of the pipeline cell.
cb_params = {'random_state' : SEED, 'learning_rate': 0.01, 'iterations': 1500, 'depth': 7}

#Stacked model, trained on the meta-feature dataset
stacker = Classifier(dataset=stack_ds, estimator=CatBoostClassifier, use_cache=False, parameters=cb_params)

# Predict  and export for submission

In [44]:
#Get probabilities from the stacked model
preds_proba = stacker.predict()
#Index of the highest-probability class for every row
predictions = np.argmax(preds_proba, axis=1).astype(int)

#Fill the target column of a copy of the submission dataframe
submission = SUBMISSION_FILE.copy()
submission[TARGET] = predictions
#Export submission file as CSV, without the index column
submission.to_csv('new_catboost_out_of_fold.csv', index=None)

0:	learn: 1.0929261	total: 25.3ms	remaining: 37.9s
1:	learn: 1.0873743	total: 48.8ms	remaining: 36.6s
2:	learn: 1.0819126	total: 74.3ms	remaining: 37.1s
3:	learn: 1.0766297	total: 99.2ms	remaining: 37.1s
4:	learn: 1.0714796	total: 123ms	remaining: 36.9s
5:	learn: 1.0664275	total: 149ms	remaining: 37.1s
6:	learn: 1.0614654	total: 174ms	remaining: 37s
7:	learn: 1.0565684	total: 198ms	remaining: 37s
8:	learn: 1.0517879	total: 223ms	remaining: 36.9s
9:	learn: 1.0471136	total: 248ms	remaining: 37s
10:	learn: 1.0425270	total: 273ms	remaining: 37s
11:	learn: 1.0380708	total: 298ms	remaining: 37s
12:	learn: 1.0337050	total: 323ms	remaining: 36.9s
13:	learn: 1.0294151	total: 348ms	remaining: 36.9s
14:	learn: 1.0252502	total: 372ms	remaining: 36.8s
15:	learn: 1.0211438	total: 397ms	remaining: 36.8s
16:	learn: 1.0170666	total: 422ms	remaining: 36.8s
17:	learn: 1.0130863	total: 447ms	remaining: 36.8s
18:	learn: 1.0092582	total: 471ms	remaining: 36.7s
19:	learn: 1.0054370	total: 496ms	remaining: 36