In [27]:
import pandas as pd
import numpy as np
import pickle
import datetime
import xgboost as xgb


In [28]:
# Time spans expressed in minutes (the data has one row per minute).
HOUR_IN_MINUTES = 60
DAY_IN_MINUTES = HOUR_IN_MINUTES * 24
WEEK_IN_MINUTES = DAY_IN_MINUTES * 7

# Fallback target value (in minutes) used when a position is never reached
# inside the look-ahead window.
MAX_TIME = DAY_IN_MINUTES

In [29]:
# Name of the training set to load; the same name is used below to pick
# the matching list of target positions.
set_name = 'set2'
path_train_set = '../../data/train/{}.csv'.format(set_name)

data = pd.read_csv(path_train_set)
# Parse the whole column at once instead of calling strptime row by row.
data['datetime'] = pd.to_datetime(data['datetime'], format='%Y-%m-%d %H:%M:%S')
# Sort chronologically and renumber the rows so that index labels equal
# positions; later code mixes positional arrays (.values) with row lookups.
data = data.sort_values('datetime').reset_index(drop=True)
data.head()

Unnamed: 0,datetime,num_orders
0,2018-04-01 00:00:00,0
1,2018-04-01 00:01:00,0
2,2018-04-01 00:02:00,0
3,2018-04-01 00:03:00,0
4,2018-04-01 00:04:00,0


In [30]:
# Cumulative order counts ("positions") to predict for the selected set.
target_positions = dict(
    set1=[10, 30, 45, 60, 75],
    set2=[5, 10, 15, 20, 25],
    set3=[5, 7, 9, 11, 13],
)[set_name]

In [31]:
# Column-oriented accumulator: one list per future DataFrame column.
samples = dict(datetime=[], history=[])

# One empty target column per requested position, in the given order.
for position in target_positions:
    samples.setdefault('target_{}'.format(position), [])

# Plain numpy view of the per-minute order counts for fast slicing.
num_orders = data.num_orders.values

In [32]:
# Build one training sample per minute.
# Start after 2 weeks so every sample has a full 2-week history, and stop
# 2 days before the end so the look-ahead window used for the targets is
# always complete.
history_len = 2 * WEEK_IN_MINUTES
lookahead = 2 * DAY_IN_MINUTES

# Iterating a Series is positional, so timestamps[i] always belongs to the
# same row as num_orders[i]. (data.datetime[i] looks up by index *label*,
# which differs from the position whenever sorting reordered the rows.)
timestamps = list(data.datetime)

for i in range(history_len, len(num_orders) - lookahead):

    samples['datetime'].append(timestamps[i])
    samples['history'].append(num_orders[i - history_len:i])

    # Cumulative sum over the look-ahead window only, to save time.
    cumsum_num_orders = num_orders[i + 1:i + 1 + lookahead].cumsum()
    for position in target_positions:
        reached = cumsum_num_orders >= position
        if reached.any():
            # argmax gives the offset of the first True; +1 turns the
            # zero-based offset into "minutes after i".
            time = reached.argmax() + 1
        else:
            # Position not reached inside the look-ahead window.
            # NOTE(review): MAX_TIME is one day while the window is two days,
            # so this fallback can be smaller than a genuinely reached time;
            # confirm whether targets should be capped at MAX_TIME instead.
            time = MAX_TIME
        samples['target_{}'.format(position)].append(time)

In [33]:
# Turn the accumulated column lists into a DataFrame (one row per sample).
df = pd.DataFrame(samples)
df.head()

Unnamed: 0,datetime,history,target_5,target_10,target_15,target_20,target_25
0,2018-04-15 00:00:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",205,221,247,285,297
1,2018-04-15 00:01:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",204,220,246,284,296
2,2018-04-15 00:02:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",203,219,245,283,295
3,2018-04-15 00:03:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",202,218,244,282,294
4,2018-04-15 00:04:00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",201,217,243,281,293


In [34]:
# Calendar features, extracted with the vectorized .dt accessor instead of a
# Python-level lambda per row. weekday follows datetime.weekday(): Monday is 0.
df['weekday'] = df['datetime'].dt.weekday
df['hour'] = df['datetime'].dt.hour
df['minute'] = df['datetime'].dt.minute


In [35]:
# Container that gets pickled at the end; fitted regressors are stored
# under 'models', keyed by target position.
model_to_save = dict(models={})

In [36]:
# Preview frame for the last target position. The position is taken
# explicitly from target_positions rather than from a loop variable left
# over by an earlier cell, so this cell does not depend on hidden state.
one_target = df[['target_{}'.format(target_positions[-1]), 'weekday', 'hour', 'minute']].copy()


In [37]:

# Train one regressor per target position. Features are the calendar columns
# plus the target value observed at the same minute on each of the previous
# n_lag_days days.
n_lag_days = 5
# First row kept for training.
# NOTE(review): rows start at 6 days although the longest lag is 5 days, so
# one day of usable samples is dropped; confirm whether this is intentional.
first_row = WEEK_IN_MINUTES - DAY_IN_MINUTES

for position in target_positions:
    target_col = 'target_{}'.format(position)
    # Full target series; the lag columns are shifted slices of it.
    tt = df[target_col].values

    # Slice first, then copy: adding columns to an un-copied .loc slice can
    # raise SettingWithCopyWarning. df has the default 0..N-1 index, so the
    # label-based slice starts at row position first_row.
    one_target = df.loc[first_row:,
                        [target_col, 'weekday', 'hour', 'minute']].copy()

    for lag in range(1, n_lag_days + 1):
        shift = lag * DAY_IN_MINUTES
        # Same length as one_target; element k is the target `lag` days
        # before row first_row + k. Array assignment is positional.
        one_target['t-{}'.format(lag)] = tt[first_row - shift:-shift]

    one_target = one_target.reset_index(drop=True)

    # Select features by name so the split does not depend on column order.
    feature_cols = [col for col in one_target.columns if col != target_col]

    reg = xgb.XGBRegressor(n_estimators=110)
    reg.fit(one_target[feature_cols], one_target[target_col])

    model_to_save['models'][position] = reg


In [38]:
# Persist the fitted models. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open('models.pkl', 'wb') as model_file:
    pickle.dump(model_to_save, model_file)

In [39]:
# Display the feature frame left over from the final iteration of the
# training loop (i.e. for the last entry of target_positions).
one_target

Unnamed: 0,target_25,weekday,hour,minute,t-1,t-2,t-3,t-4,t-5
0,243,5,0,0,232,256,215,227,313
1,242,5,0,1,232,255,214,226,312
2,241,5,0,2,231,254,213,225,311
3,240,5,0,3,231,253,212,224,310
4,239,5,0,4,230,252,211,223,309
5,238,5,0,5,229,251,210,222,308
6,237,5,0,6,228,250,209,221,307
7,236,5,0,7,227,249,208,220,306
8,235,5,0,8,226,248,207,219,305
9,234,5,0,9,225,247,206,218,304
