In [1]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
#from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Read the four input CSV files from the working directory
train, test, riders, ss = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'Riders.csv', 'SampleSubmission.csv')
)

In [3]:
# (rows, columns) of each loaded frame, in the order they were read
tuple(frame.shape for frame in (train, test, riders, ss))

((179867, 21), (76791, 20), (2632, 4), (76791, 2))

In [4]:
# Show the first five rows of the training frame
train.head(n=5)

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_license_status,rider_carrier_type,rider_amount,rider_lat,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,0,1,1080,-42.698343,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,0,1,730,-42.787317,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,1,1,490,-42.74918,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,1,1,510,-42.836266,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,0,0,400,-42.828195,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0


In [5]:
# Show the first five rows of the riders frame
riders.head(n=5)

Unnamed: 0,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,16261,308,21.05,321
1,8832,224,10.0,27
2,53866,238,17.76,25
3,46368,343,24.56,320
4,45609,399,14.97,214


In [6]:
# Merge rider dataset to train and test sets.
# Left joins keep every order row; orders whose rider_id has no match in
# `riders` get NaN in the rider columns.
# validate='many_to_one' makes pandas raise a MergeError if 'Rider ID' is not
# unique in `riders`. Without it, duplicate rider rows would silently duplicate
# order rows, inflating the training data and breaking the row-for-row
# correspondence between `test` and the sample submission that the submission
# cells rely on.
train = train.merge(riders, how='left', left_on='rider_id', right_on='Rider ID', validate='many_to_one')
test = test.merge(riders, how='left', left_on='rider_id', right_on='Rider ID', validate='many_to_one')

# Preview merged dataframe
train.head()

Unnamed: 0,ID,order_id,dispatch_day,dispatch_day_of_week,dispatch_time,client_id,client_type,order_license_status,order_carrier_type,vendor_type,...,rider_long,pickup_lat,pickup_long,drop_off_lat,drop_off_long,target,Rider ID,Active Rider Age,Average Partner Rating,Number of Ratings
0,ID_SCUW21PVAU,4435,27,6,09:02:54,593630,Business,0,2,Bike,...,-17.228539,-42.692371,-17.248305,-42.687442,-17.424682,1,30153,11,10.0,1
1,ID_2HA7X30JMN,32711,30,7,13:01:37,837729,Personal,0,1,Bike,...,-17.288252,-42.784046,-17.290121,-42.673267,-17.234595,2,20884,68,24.13,229
2,ID_IAJWDTBY6M,8712,14,2,10:01:00,695129,Personal,0,2,Bike,...,-17.287848,-42.765204,-17.293784,-42.813953,-17.294805,1,33143,273,24.92,123
3,ID_LKSVPNYMTR,44869,22,3,14:11:16,1504660,Personal,0,2,Bike,...,-17.31192,-42.831913,-17.315311,-42.812409,-17.265441,2,96531,168,23.76,175
4,ID_O7N8Y918YH,57590,27,5,16:11:38,36869,Business,0,2,Bike,...,-17.322818,-42.836056,-17.318111,-42.828517,-17.302052,0,103546,95,24.53,42


In [7]:
# One hot encoding
categorical_cols = ['client_type', 'vendor_type']
train = pd.get_dummies(train, columns=categorical_cols)
test = pd.get_dummies(test, columns=categorical_cols)

# The two frames are encoded independently, so a category that occurs only in
# train yields a dummy column that test lacks; selecting the training feature
# columns from test would then raise a KeyError. Add every such column to test
# filled with zeros (the category is absent from all test rows), using the same
# dtype as the corresponding train column.
dummy_prefixes = tuple(name + '_' for name in categorical_cols)
for dummy_col in train.columns:
    if str(dummy_col).startswith(dummy_prefixes) and dummy_col not in test.columns:
        test[dummy_col] = pd.Series(0, index=test.index, dtype=train[dummy_col].dtype)

In [8]:
# Split data
# Features are all columns except the ID-like columns, the raw dispatch_time
# string and the target. Index.difference returns the remaining names sorted,
# so the feature order is deterministic.
main_cols = train.columns.difference(['ID', 'order_id', 'rider_id', 'Rider ID', 'target', 'dispatch_time', 'client_id']).tolist()
X = train[main_cols]
y = train.target
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 3031)

# Train a model
# verbose=100 logs the training loss every 100 iterations instead of on every
# iteration, which otherwise floods the cell output with one line per tree.
model = CatBoostClassifier(random_state=3031, verbose=100)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check score (fraction of hold-out rows whose predicted class matches the target)
accuracy_score(y_test, y_pred)

Learning rate set to 0.101949
0:	learn: 1.0718729	total: 134ms	remaining: 2m 14s
1:	learn: 1.0514418	total: 206ms	remaining: 1m 42s
2:	learn: 1.0348860	total: 268ms	remaining: 1m 29s
3:	learn: 1.0213431	total: 323ms	remaining: 1m 20s
4:	learn: 1.0093274	total: 389ms	remaining: 1m 17s
5:	learn: 0.9994345	total: 475ms	remaining: 1m 18s
6:	learn: 0.9912667	total: 543ms	remaining: 1m 17s
7:	learn: 0.9844980	total: 604ms	remaining: 1m 14s
8:	learn: 0.9786591	total: 666ms	remaining: 1m 13s
9:	learn: 0.9723790	total: 786ms	remaining: 1m 17s
10:	learn: 0.9672863	total: 855ms	remaining: 1m 16s
11:	learn: 0.9635087	total: 923ms	remaining: 1m 15s
12:	learn: 0.9601831	total: 1s	remaining: 1m 16s
13:	learn: 0.9571852	total: 1.08s	remaining: 1m 16s
14:	learn: 0.9545947	total: 1.15s	remaining: 1m 15s
15:	learn: 0.9524818	total: 1.22s	remaining: 1m 15s
16:	learn: 0.9502301	total: 1.3s	remaining: 1m 15s
17:	learn: 0.9475642	total: 1.42s	remaining: 1m 17s
18:	learn: 0.9449900	total: 1.52s	remaining: 1m 

0.6264524378717963

In [9]:
# Predict on the test features and write them into a copy of the sample submission
test_features = test[main_cols]
predictions = model.predict(test_features)
sub_file = ss.copy()
sub_file['target'] = predictions  # positional: assumes test rows are in the same order as ss — TODO confirm
sub_file.to_csv('Baseline-Catboost.csv', index=False)

In [None]:
from xgboost import XGBClassifier

# Feature list: every column except the ID-like columns, dispatch_time and the target
excluded_cols = ['ID', 'order_id', 'rider_id', 'Rider ID', 'target', 'dispatch_time', 'client_id']
main_cols = train.columns.difference(excluded_cols).tolist()
X = train[main_cols]
y = train['target']

# 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=3031
)

# Fit the XGBoost classifier on the training part
model = XGBClassifier(random_state=3031)
model.fit(X_train, y_train)

# Predict the hold-out rows
y_pred = model.predict(X_test)

# Hold-out accuracy (displayed as the cell result)
accuracy_score(y_test, y_pred)





In [None]:
# Predict on the test features and write them into a copy of the sample submission
test_features = test[main_cols]
predictions = model.predict(test_features)
sub_file = ss.copy()
sub_file['target'] = predictions  # positional: assumes test rows are in the same order as ss — TODO confirm
sub_file.to_csv('Baseline-XGBoost.csv', index=False)

In [None]:
# Read both baseline submission files back from disk
catboost_sub, xgboost_sub = (
    pd.read_csv(csv_name)
    for csv_name in ('Baseline-Catboost.csv', 'Baseline-XGBoost.csv')
)

In [None]:
# Show the first five rows of the XGBoost submission
xgboost_sub.head(n=5)

In [None]:
# Blend the two baseline submissions with equal weights.
# NOTE(review): both files contain the output of model.predict (class labels),
# so this averages labels, not probabilities. Where the two models disagree the
# result can be fractional (e.g. 0.5) or a label neither model predicted
# (0 and 2 average to 1). Confirm this is intended; averaging predict_proba
# outputs would be the usual alternative.
final_preds = catboost_sub['target']*0.5 + xgboost_sub['target']*0.5
final_sub = ss.copy()
final_sub.target = final_preds
# index=False keeps the pandas row index out of the file, so it has the same
# two-column layout as the two baseline submission files.
final_sub.to_csv('Final_Sub_Baseline_CX_Ensemble.csv', index=False)


In [17]:
# Show the first five rows of the blended submission
final_sub.head(n=5)

Unnamed: 0,ID,target
0,ID_3B4D2Q2DSI,0
1,ID_7MPWFJ9XFI,0
2,ID_5VVT3Q3M5B,0
3,ID_C2GTVS1H7K,0
4,ID_0YGC8V3PFT,0
