In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime

from tqdm import tqdm_notebook
from math import sin, cos, sqrt, atan2, radians
from polyline.codec import PolylineCodec

from sklearn.preprocessing import OneHotEncoder

# approximate radius of earth in km
R = 6373.0

In [38]:
def mape(y_true, y_pred):
    return np.mean(np.abs(y_true - y_pred)/y_true) * 100

Id

main_id_locality – Идентификатор города  
ETA – Ожидаемое время поездки  
OrderedDate – Время заказа  
latitude – Широта старта заказа  
del_latitude – Широта конца заказа  
longitude – Долгота старта заказа  
del_longitude – Долгота конца заказа  
EDA – Ожидаемая дистанция заказа   
center_latitude – Широта центра города  
center_longitude – Долгота центра города  
route – расчетный трек  


RTA – Реальное время поездки / Требуется предсказать  

track – фактический трек / Нет на момент создания заказа 

RDA – Фактическая дистанция заказа / Нет на момент создания заказа  
ReadyForCollection – Время события "Подъехал" / Нет на момент создания заказа  
ClientCollected – Время события "В пути" / Нет на момент создания заказа  
GoodArrived – Время события "Прибыл" / Нет на момент создания заказа  
ready_latitude – Широта события "Подъехал" / Нет на момент создания заказа  
ready_longitude – Долгота события "Подъехал" / Нет на момент создания заказа  
onway_latitude – Широта события "В пути" / Нет на момент создания заказа  
onway_longitude – Долгота события "В пути" / Нет на момент создания заказа  
arrived_latitude – Широта события "Прибыл" / Нет на момент создания заказа   
arrived_longitude – Долгота события "Прибыл" / Нет на момент создания заказа 

In [133]:
train = pd.read_csv('data/train.csv', index_col=0)
validation = pd.read_csv('data/validation.csv', index_col=0)
X_test = pd.read_csv('data/test.csv', index_col=0)

In [134]:
train.columns

Index(['main_id_locality', 'ETA', 'RTA', 'OrderedDate', 'latitude',
       'del_latitude', 'longitude', 'del_longitude', 'EDA', 'RDA',
       'ReadyForCollection', 'ClientCollected', 'GoodArrived',
       'ready_latitude', 'ready_longitude', 'onway_latitude',
       'onway_longitude', 'arrived_latitude', 'arrived_longitude',
       'center_latitude', 'center_longitude', 'route', 'track'],
      dtype='object')

In [135]:
def clean_d(data):
    #data = data.dropna()
    sub = data['RDA']-data['EDA']
    q1 = sub.quantile(0.05)
    q2 = sub.quantile(0.95)
    data = data[((sub < q2) & (sub > q1))]
    
    test_col = list(X_test.columns)
    test_col.append('RTA')
    data = data[test_col]
    return data

In [136]:
train = clean_d(train)

In [137]:
train.shape

(674104, 23)

In [138]:
train.columns

Index(['main_id_locality', 'ETA', 'RTA', 'OrderedDate', 'latitude',
       'del_latitude', 'longitude', 'del_longitude', 'EDA', 'RDA',
       'ReadyForCollection', 'ClientCollected', 'GoodArrived',
       'ready_latitude', 'ready_longitude', 'onway_latitude',
       'onway_longitude', 'arrived_latitude', 'arrived_longitude',
       'center_latitude', 'center_longitude', 'route', 'track'],
      dtype='object')

In [139]:
X_val, y_val = validation.drop(columns='RTA'), validation['RTA']
X_train, y_train = train.drop(columns='RTA'), train['RTA']

In [140]:
X_train = X_train[X_test.columns]
X_val = X_val[X_test.columns]

In [141]:
X_train.head(100)

Unnamed: 0_level_0,main_id_locality,ETA,OrderedDate,latitude,del_latitude,longitude,del_longitude,EDA,RDA,ReadyForCollection,...,ready_latitude,ready_longitude,onway_latitude,onway_longitude,arrived_latitude,arrived_longitude,center_latitude,center_longitude,route,track
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1078,226.0,2020-02-12 19:12:06,55.826019,55.825581,49.134529,49.126949,1.0,1.0,2020-02-12 19:14:33,...,55.825647,49.134115,55.826261,49.134137,55.825707,49.127136,55.794388,49.111531,gnvsIaq{jHChA??uC???OPG^F^NRzKBd@AN[r@???`@`@`...,io{jHkivsID_A??A}@i@m@I]^SFKBK@AD@@????A?@??A@...
2,1078,612.0,2020-02-12 19:12:44,55.791050,55.819962,49.226070,49.176628,5.0,5.0,2020-02-12 19:15:21,...,55.791099,49.226066,55.791099,49.226066,55.819765,49.177432,55.794388,49.111531,auosI}mmkH?LHd@KhC??o@w@[g@m@iAUk@??{G|OiB`Ek@...,}mmkHkuosI????????????????????????????????K?O@...
7,1078,1229.0,2020-02-12 19:12:56,55.835281,55.751598,49.132278,49.180080,12.0,12.0,2020-02-12 19:18:25,...,55.835314,49.131138,55.835239,49.131444,55.752243,49.180174,55.794388,49.111531,qixsI_d{jHRIbCDj@C??Cc@?w@Es@??jJK|BF~@???Pi@L...,o~zjHkixsI?B?@?@?????????????C@C@?????CLIP?B?@...
8,1078,541.0,2020-02-12 19:13:07,55.755779,55.772110,49.227730,49.228371,2.0,2.0,2020-02-12 19:17:20,...,55.756178,49.228142,55.756235,49.227353,55.771960,49.229215,55.794388,49.111531,sxhsIixmkH?KWe@???gA??gIP??C_ISaH??qDEiC?aFMaE...,etmkHiyhsI?A@M?S?Sk@SS@G@B?H@@BG????A????A???@...
9,1078,1234.0,2020-02-12 19:13:09,55.818020,55.802101,49.081909,49.200531,10.0,10.0,2020-02-12 19:20:14,...,55.818455,49.081742,55.818193,49.081961,55.802402,49.200245,55.794388,49.111531,m~tsIwhqjHQ]??MXkAtA??cBwFyAwFuAoE??gEtFyCnD{E...,ciqjHs~tsICACAKADENILIVOLMTMDGDIAG]MSGa@K[IOEY...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131,1078,772.0,2020-02-19 00:13:14,55.832642,55.870789,49.090900,49.061810,6.0,6.0,2020-02-19 00:20:30,...,55.832750,49.090999,55.832740,49.091024,55.870680,49.061900,55.794388,49.111531,_ywsIu`sjHaKfEc@R??aA^wDfAuIvC}JzCkH`CsFnBYLcC...,{asjHuywsI?@A???????????JGZ]b@m@X{@Rg@Pc@L]LYH...
132,1078,501.0,2020-02-19 00:13:31,55.743568,55.751530,49.203800,49.242031,4.0,4.0,2020-02-19 00:20:17,...,55.743595,49.203906,55.743595,49.203906,55.751563,49.241880,55.794388,49.111531,ilfsIwbikHr@vB??mHrN]~@WlA??KYsCkGy@_BkFmLwEyK...,kcikHmlfsI?@?????????????????????????@????????...
133,1078,826.0,2020-02-19 00:15:42,55.860851,55.835560,49.098881,49.151958,7.0,6.0,2020-02-19 00:21:13,...,55.860649,49.099116,55.860618,49.099114,55.835102,49.152699,55.794388,49.111531,qi}sIettjH\UhACp@P\P??^}E`AqJTiAV_AFY??zA`B~Hn...,_ttjHak}sI]DMD?FHNFJ@N?N@HDD@??@?@???@??AB?@??...
134,1078,459.0,2020-02-19 00:16:03,55.765659,55.750431,49.220482,49.222672,3.0,3.0,2020-02-19 00:22:23,...,55.765792,49.220382,55.765761,49.220466,55.750334,49.222601,55.794388,49.111531,gwjsI_klkH?hA??rHFjE@r@ApCD`B???HcMGuCIc@mBsEG...,yjlkHawjsI@????????A@??????@A?A?@?H?F@B?B?@?DG...


## CATBOOST!!!

In [142]:
from catboost import CatBoostRegressor, Pool

In [143]:
def encode_weekday_hour_by_speed(train_data):
    train_data['RSA'] = train_data['EDA']/train_data['RTA']
    datetime_objects = [datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in train_data['OrderedDate']]
    train_data['OrderedWeekDay'] = [time.weekday() for time in datetime_objects]
    train_data['OrderedHour'] = [time.hour for time in datetime_objects]
    return train_data[['RSA', 'OrderedHour', 'OrderedWeekDay', 
                 'main_id_locality']].groupby(['OrderedHour', 'OrderedWeekDay', 'main_id_locality']).mean()

In [144]:
weekday_hour_by_speed = encode_weekday_hour_by_speed(train.copy())

In [145]:
weekday_hour_by_speed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,RSA
OrderedHour,OrderedWeekDay,main_id_locality,Unnamed: 3_level_1
0,0,1078,0.009754
0,0,22390,0.008266
0,0,22430,0.008691
0,0,22438,0.008436
0,1,1078,0.009876


In [146]:
def distance(lat1, lon1, lat2, lon2):
    lat1 = lat1.apply(radians)
    lon1 = lon1.apply(radians)
    lat2 = lat2.apply(radians)
    lon2 = lon2.apply(radians)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = dlat.apply(lambda x: sin(x / 2)**2) + \
        lat1.apply(cos) * lat2.apply(cos) * dlon.apply(lambda x: sin(x / 2)**2)
    c = a.apply(lambda x: 2 * atan2(sqrt(x), sqrt(1 - x)))

    distance = R * c
    return distance

In [150]:
def preprocess_data(data):
    distance_to_center = distance(data['latitude'], data['longitude'],
                                  data['center_latitude'], data['center_longitude'])
    distance_del_to_center =  distance(data['del_latitude'], data['del_longitude'],
                                  data['center_latitude'], data['center_longitude'])
    
    distance_bw_ready = distance(data['latitude'], data['longitude'],
                                  data['ready_latitude'], data['ready_longitude'])
    
    distance_bw_arrived =  distance(data['del_latitude'], data['del_longitude'],
                                  data['arrived_latitude'], data['arrived_longitude'])
    
    diff_start_rout = []
    diff_end_rout = []
    diff_start_track = []
    diff_end_track = []

    data['distance_to_center'] = distance_to_center
    data['distance_del_to_center'] = distance_del_to_center
    
    data['distance_bw_ready'] = distance_bw_ready
    data['distance_bw_arrived'] = distance_bw_arrived
    
    data['OrderedHour'] = pd.to_datetime(data['OrderedDate']).dt.hour
    
    datetime_objects = [datetime.strptime(x, '%Y-%m-%d %H:%M:%S') for x in data['OrderedDate']]
    data['OrderedWeekDay'] = [time.weekday() for time in datetime_objects]
    data['ESA'] = data['EDA']/data['ETA']
    data = data.join(weekday_hour_by_speed, on=['OrderedHour', 'OrderedWeekDay', 'main_id_locality'])
    
    data = pd.get_dummies(data, columns=['OrderedHour', 'OrderedWeekDay'])
    
    days = pd.to_datetime(data['OrderedDate']).dt.day.values
    monthes = pd.to_datetime(data['OrderedDate']).dt.month.values
    mask1 = (monthes == 1) & np.in1d(days, [1, 2, 3, 4, 5, 6, 7, 8, 11, 12, 18, 19, 25, 26])
    mask2 = (monthes == 2) & np.in1d(days, [1, 2, 8, 9, 15, 16, 22, 23, 24, 29])
    mask3 = (monthes == 3) & np.in1d(days, [1, 7, 8, 9, 14, 15, 21, 22, 28, 29])
    data['IsHoliday'] = mask1 | mask2 | mask3
    
    data = data.drop(columns=['OrderedDate'])
    return data

In [151]:
data = X_train.copy()
X_train = preprocess_data(data)

In [57]:
data = X_val.copy()
X_val = preprocess_data(data)

In [58]:
data = X_test.copy()
X_test = preprocess_data(data)

In [62]:
X_train.columns

Index(['main_id_locality', 'ETA', 'latitude', 'del_latitude', 'longitude',
       'del_longitude', 'EDA', 'center_latitude', 'center_longitude',
       'distance_to_center', 'distance_del_to_center', 'ESA', 'RSA',
       'OrderedHour_0', 'OrderedHour_1', 'OrderedHour_2', 'OrderedHour_3',
       'OrderedHour_4', 'OrderedHour_5', 'OrderedHour_6', 'OrderedHour_7',
       'OrderedHour_8', 'OrderedHour_9', 'OrderedHour_10', 'OrderedHour_11',
       'OrderedHour_12', 'OrderedHour_13', 'OrderedHour_14', 'OrderedHour_15',
       'OrderedHour_16', 'OrderedHour_17', 'OrderedHour_18', 'OrderedHour_19',
       'OrderedHour_20', 'OrderedHour_21', 'OrderedHour_22', 'OrderedHour_23',
       'OrderedWeekDay_0', 'OrderedWeekDay_1', 'OrderedWeekDay_2',
       'OrderedWeekDay_3', 'OrderedWeekDay_4', 'OrderedWeekDay_5',
       'OrderedWeekDay_6', 'IsHoliday'],
      dtype='object')

In [104]:
len(X_train.columns)

45

In [153]:
cat_features_index = np.append(np.array([0]), np.arange(15, len(X_train.columns)))

In [81]:
train_pool = Pool(pd.concat([X_train, X_val]), label=pd.concat([y_train, y_val]), cat_features=cat_features_index, 
                  feature_names=list(X_train.columns))

In [82]:
model = CatBoostRegressor(loss_function='MAE', eval_metric='MAE', custom_metric='MAPE', iterations=2500,
                          random_seed=0)

model.fit(train_pool, metric_period=100)
# y_pred = model.predict_proba(X_test)
# roc_auc_score(y_test, y_pred[:, 1])

0:	learn: 279.5730293	total: 181ms	remaining: 15m 4s
100:	learn: 104.9123803	total: 24.3s	remaining: 19m 40s
200:	learn: 99.3571924	total: 47.8s	remaining: 19m
300:	learn: 98.0898205	total: 1m 14s	remaining: 19m 26s
400:	learn: 97.3121278	total: 1m 38s	remaining: 18m 52s
500:	learn: 96.6951010	total: 2m 5s	remaining: 18m 47s
600:	learn: 96.1997164	total: 2m 29s	remaining: 18m 14s
700:	learn: 95.7721730	total: 2m 54s	remaining: 17m 48s
800:	learn: 95.4110653	total: 3m 17s	remaining: 17m 17s
900:	learn: 95.0866431	total: 3m 46s	remaining: 17m 10s
1000:	learn: 94.7992292	total: 4m 13s	remaining: 16m 53s
1100:	learn: 94.5535670	total: 4m 39s	remaining: 16m 28s
1200:	learn: 94.3191028	total: 4m 59s	remaining: 15m 48s
1300:	learn: 94.1147207	total: 5m 20s	remaining: 15m 12s
1400:	learn: 93.9291054	total: 5m 44s	remaining: 14m 45s
1500:	learn: 93.7546117	total: 6m 15s	remaining: 14m 35s
1600:	learn: 93.5902909	total: 6m 40s	remaining: 14m 9s
1700:	learn: 93.4357707	total: 7m 3s	remaining: 13m

<catboost.core.CatBoostRegressor at 0x7fa32f3c6b38>

In [83]:
sorted(zip(model.feature_importances_, model.feature_names_), reverse=True)

[(66.09356741408192, 'ETA'),
 (10.360093916637533, 'RSA'),
 (6.986679786002161, 'EDA'),
 (2.796346146243818, 'del_longitude'),
 (2.32504247452189, 'longitude'),
 (2.2028206725300152, 'del_latitude'),
 (2.1200707421853275, 'latitude'),
 (1.3023298276735091, 'distance_del_to_center'),
 (1.2658424059510076, 'distance_to_center'),
 (1.1166206613076735, 'ESA'),
 (0.5895319432331007, 'IsHoliday'),
 (0.5559520235879768, 'center_latitude'),
 (0.4022178007379855, 'center_longitude'),
 (0.30980031660810414, 'main_id_locality'),
 (0.295555135255524, 'OrderedHour_6'),
 (0.2838402303043711, 'OrderedHour_7'),
 (0.1943462501181951, 'OrderedHour_16'),
 (0.13416389108317198, 'OrderedWeekDay_0'),
 (0.1196247818992101, 'OrderedHour_15'),
 (0.09950259737686236, 'OrderedHour_5'),
 (0.08611285652550713, 'OrderedHour_18'),
 (0.07083657209027931, 'OrderedHour_17'),
 (0.050110261772668946, 'OrderedHour_8'),
 (0.04651864283899719, 'OrderedWeekDay_5'),
 (0.03773982970730755, 'OrderedHour_19'),
 (0.02263314515091

In [84]:
y_pred = model.predict(X_val)

In [85]:
print("MAPE на валидации: ", mape(y_val, y_pred))

MAPE на валидации:  14.093760249732023


In [87]:
y_pred = model.predict(X_test)

In [88]:
submission0 = pd.DataFrame(np.array([np.arange(len(y_pred)), y_pred]).T, 
                           columns=['Id', 'Prediction'])
submission0['Id'] = submission0['Id'].astype(int)
submission0 = submission0.set_index('Id')

In [89]:
submission0.to_csv('submission0.csv')

In [90]:
submission0

Unnamed: 0_level_0,Prediction
Id,Unnamed: 1_level_1
0,922.474119
1,720.275561
2,408.419962
3,749.561150
4,1268.369217
...,...
89933,317.587015
89934,269.367924
89935,1249.842768
89936,287.829065
