In [1]:
import pandas as pd
import itertools
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
%matplotlib inline

In [2]:
# Demand file: semicolon-separated, with the `Month` column parsed to datetime on load.
data = pd.read_csv(
    '/dsg/demand_anonymized_20170802.csv',
    delimiter=';',
    parse_dates=["Month"],
)

# Evaluation file, read with pandas defaults.
# NOTE(review): this name shadows the builtin `eval`; it is kept because later cells read it.
eval = pd.read_csv(
    '/dsg/eval.csv',
)

In [3]:
# Monthly series: total OrderQty per (SalOrg, Material, Month), ordered by those keys.
series = (
    data.groupby(["SalOrg", "Material", "Month"])["OrderQty"]
    .sum()
    .reset_index()
    .sort_values(by=["SalOrg", "Material", "Month"])
)
series.head()

Unnamed: 0,SalOrg,Material,Month,OrderQty
0,97LK,00IYcj,2012-05-01,2
1,97LK,00IYcj,2012-06-01,13
2,97LK,00IYcj,2012-07-01,1
3,97LK,00IYcj,2012-09-01,30
4,97LK,00IYcj,2012-11-01,1


In [4]:
# fill empty
# Distinct (Material, SalOrg) pairs listed in the evaluation file.
eval_comb = list({tuple(pair) for pair in eval[['Material', 'SalOrg']].values})

# Cross every pair with every month present in the aggregated series, so that
# months without any order still get a row.
comb = [
    (pair[0], pair[1], month)
    for pair, month in itertools.product(eval_comb, list(series['Month'].unique()))
]

# Left-join the observed quantities onto the full grid; missing values become 0.
series2 = (
    pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
    .sort_values(by=['Material', 'SalOrg', 'Month'])
    .merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
    .fillna(0)
)

In [5]:
# Work on log(1 + qty); a quantity of 0 maps to 0 and np.expm1 is used later to invert it.
series2['OrderQtyLog'] = np.log1p(series2['OrderQty'].values)

In [6]:
# create lagged features
def lag_feature(df, colname, lag, adv, group_cols=('SalOrg', 'Material')):
    """Add per-group lagged and forward-shifted copies of ``colname`` to ``df``.

    Values are shifted inside each ``group_cols`` group following the current
    row order, so ``df`` must already be sorted chronologically within every
    group for the columns to represent time steps.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``colname`` and the grouping columns. It is modified
        in place (the new columns are added to it) and is also returned.
    colname : str
        Column to shift.
    lag : int
        Number of past steps. Creates ``colname(t-lag)`` ... ``colname(t-1)``,
        in that order.
    adv : int
        Exclusive upper bound of the forward steps. Creates
        ``colname(t+1)`` ... ``colname(t+adv-1)``, i.e. ``adv - 1`` columns.
    group_cols : str or sequence of str, optional
        Columns identifying one series. Defaults to ``('SalOrg', 'Material')``.

    Returns
    -------
    tuple
        ``(df, cols_lagged, cols_adv)``: the same frame plus the names of the
        lagged and forward-shifted columns that were added.
    """
    if isinstance(group_cols, str):
        group_cols = [group_cols]

    # The grouping is the same for every shift, so build it only once.
    grouped = df.groupby(by=list(group_cols))[colname]

    cols_lagged = []
    for i in range(lag, 0, -1):
        col_lagged = '{}(t-{})'.format(colname, i)
        df[col_lagged] = grouped.shift(i)
        cols_lagged.append(col_lagged)

    cols_adv = []
    for i in range(1, adv):
        col_adv = '{}(t+{})'.format(colname, i)
        df[col_adv] = grouped.shift(-i)
        cols_adv.append(col_adv)

    return df, cols_lagged, cols_adv

# Twelve lag columns (t-12 .. t-1) plus the forward columns (t+1) and (t+2) on the log quantity.
# `series` is rebound to the frame returned by lag_feature.
series, lagged, adv = lag_feature(
    series2,
    colname='OrderQtyLog',
    lag=12,
    adv=3,
)
series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1),OrderQtyLog(t+1),OrderQtyLog(t+2)
0,00GB1f,yqSu,2012-01-01,0.0,0.0,,,,,,,,,,,,,0.0,0.693147
1,00GB1f,yqSu,2012-02-01,0.0,0.0,,,,,,,,,,,,0.0,0.693147,1.386294
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,,,,,,,,,,,0.0,0.0,1.386294,1.098612
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,,,,,,,,,,0.0,0.0,0.693147,1.098612,0.693147
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,,,,,,,,,0.0,0.0,0.693147,1.386294,0.693147,1.386294


In [7]:
def create_agg_features(data, columns, name, start='2014-09-01', end='2016-09-01',
                        value_col="OrderQty"):
    """Summarise ``value_col`` per group over a fixed date window.

    Only rows whose ``Month`` lies strictly between ``start`` and ``end`` are
    used. For each group the median, mean, std, min and max are computed.

    NOTE(review): with the default window the statistics include 2016-08,
    which is also one of the validation months used further down in this
    notebook — confirm whether that overlap is intended.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``Month`` column, ``value_col`` and ``columns``.
    columns : str or sequence of str
        Grouping column(s).
    name : str
        Suffix appended to each statistic name to build the output columns.
    start, end : str or datetime-like, optional
        Exclusive bounds of the window (defaults: '2014-09-01', '2016-09-01').
    value_col : str, optional
        Column to aggregate (default ``"OrderQty"``).

    Returns
    -------
    tuple
        ``(temp, new_cols)``: one row per group with ``columns`` followed by
        the statistic columns, and the list of those statistic column names.
    """
    # Accept a single column name or any sequence; list concatenation below needs a list.
    columns = [columns] if isinstance(columns, str) else list(columns)
    stats = ["median", "mean", "std", "min", "max"]

    in_window = (data["Month"] > pd.to_datetime(start)) & (data["Month"] < pd.to_datetime(end))
    temp = data.loc[in_window].groupby(columns)[value_col].agg(stats).reset_index()

    new_cols = [stat + name for stat in stats]
    temp.columns = columns + new_cols
    return temp, new_cols

# Attach window statistics at three granularities. Each merge adds five columns to
# `series`, so re-running this cell without rebuilding `series` would duplicate them.

# Per (SalOrg, Material) pair.
sm_temp, sm_cols = create_agg_features(data=series, columns=["SalOrg", "Material"], name="_s_m_")
series = pd.merge(series, sm_temp, how='left', on=["SalOrg", "Material"])

# Per SalOrg.
s_temp, s_cols = create_agg_features(data=series, columns=["SalOrg"], name="_s_")
series = pd.merge(series, s_temp, how='left', on=["SalOrg"])

# Per Material.
m_temp, m_cols = create_agg_features(data=series, columns=["Material"], name="_m_")
series = pd.merge(series, m_temp, how='left', on=["Material"])

series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),...,median_s_,mean_s_,std_s_,min_s_,max_s_,median_m_,mean_m_,std_m_,min_m_,max_m_
0,00GB1f,yqSu,2012-01-01,0.0,0.0,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
1,00GB1f,yqSu,2012-02-01,0.0,0.0,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0


In [8]:
# validation folds
def create_validation(data, date, m, horizon=3):
    """Return the row labels of one train/test split around ``date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``Month`` column.
    date : str or datetime-like
        First month of the test window.
    m : int
        Look-back in months. The lower bound is exclusive, so the training
        window is ``date - m months < Month < date``; with month-start data
        that is the ``m - 1`` months immediately before ``date``.
    horizon : int, optional
        Length of the test window in months (default 3):
        ``date <= Month < date + horizon months``.

    Returns
    -------
    tuple of pandas.Index
        ``(train_index, test_index)`` — labels from ``data.index``.
    """
    cutoff = pd.to_datetime(date)
    months = data["Month"]

    train_mask = (months > cutoff - pd.DateOffset(months=m)) & (months < cutoff)
    test_mask = (months >= cutoff) & (months < cutoff + pd.DateOffset(months=horizon))

    return data[train_mask].index, data[test_mask].index

# First month of each validation window.
validation_months = ['2016-08-01','2016-09-01', '2016-10-01', '2016-11-01']

# One (train_index, test_index) pair per validation month, with a look-back argument of 3.
folds = [create_validation(series, month, 3) for month in validation_months]

In [9]:
# knn model
nn = 7
metric = 'canberra'
features = lagged + sm_cols + m_cols + s_cols
# Targets: the current month's log quantity plus the forward-shifted columns.
target_cols = ['OrderQtyLog'] + adv

all_errors = []
for train_idx, test_idx in folds:
    # `test` keeps the rows of the last fold after the loop; the final-model cell reads it.
    train, test = series.loc[train_idx], series.loc[test_idx]

    train_x, train_y = train[features], train[target_cols]
    test_x, test_y = test[features], test[target_cols]

    model = KNeighborsRegressor(n_neighbors=nn, metric=metric, n_jobs=8)
    model.fit(train_x, train_y)
    test_y_predicted = model.predict(test_x)

    # Undo the log1p transform so the error is measured on raw quantities.
    error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
    print('Fold error: {}'.format(error))
    all_errors.append(error)

# Mean absolute error averaged over the folds.
np.mean(all_errors)

Fold error: 11.789415832943371
Fold error: 11.93793563794415
Fold error: 11.189606657683562
Fold error: 11.036132341888708


11.488272617614948

6 Fold error: 11.725605605485962
Fold error: 11.711877732517317

5 Fold error: 11.789457282174423
Fold error: 11.742792806595952
Fold error: 10.972295577868897
Fold error: 10.747072851463564

4
Fold error: 11.894426344531501
Fold error: 11.857534034559249
Fold error: 11.060203934968207
Fold error: 10.960681228838304

In [None]:
# final model
# NOTE(review): this fits on `test`, i.e. the rows left over from the last validation
# fold, not on the whole dataset, and it uses only the lag columns (no aggregate
# features). Confirm that this is the intended training set.
train_x, train_y = test[lagged], test[['OrderQtyLog'] + adv]
model = KNeighborsRegressor(n_neighbors=nn, metric=metric, n_jobs=-1)
model.fit(train_x, train_y)

In [None]:
# Shift the feature window one step forward: drop the oldest lag and add the current
# month, so the model's first output corresponds to the month after the forecast origin.
# Built as a new list so that `lagged` is left untouched and re-running this cell
# always gives the same result.
lagged_final = lagged[1:] + ['OrderQtyLog']
lagged_final

In [None]:
# Forecast origin — presumably the last month of history available; verify against the data.
test_i = series[series["Month"] == pd.to_datetime('2017-03-01')].index

test_x = series.loc[test_i, lagged_final]
# The model was fitted on columns named (t-12)..(t-1), while these are (t-11)..t.
# Recent scikit-learn versions reject a DataFrame whose column names differ from
# those seen during fit, so pass the bare array (the column order is what matters).
test_prediction = model.predict(test_x.values)
# Back from log1p space to quantities.
prediction = np.expm1(test_prediction)
prediction

In [None]:
# One column per forecast step, taken from the matching column of `prediction`.
tst = series.loc[test_i].assign(**{
    'OrderQty(t+1)': prediction[:, 0],
    'OrderQty(t+2)': prediction[:, 1],
    'OrderQty(t+3)': prediction[:, 2],
})

In [None]:
# Long format: one row per (Material, SalOrg, forecast step), in `variable` / `value`.
ml = tst.melt(
    id_vars=['Material', 'SalOrg'],
    value_vars=['OrderQty(t+1)', 'OrderQty(t+2)', 'OrderQty(t+3)'],
)

In [None]:
# Translate each forecast-step label into the calendar month it stands for.
step_to_month = {
    'OrderQty(t+1)': '2017-04',
    'OrderQty(t+2)': '2017-05',
    'OrderQty(t+3)': '2017-06',
}
ml['date'] = ml['variable'].replace(step_to_month)
ml.head()

In [None]:
# Join the forecasts onto the evaluation rows. This is merge's default inner join,
# so evaluation rows without a matching forecast are dropped.
result = pd.merge(eval, ml, on=['Material', 'SalOrg', 'date'])
result

In [None]:
# Expose the forecast under the `demand` column and write the ID/demand pairs to CSV.
result['demand'] = result['value']
submission = result[['ID', 'demand']]
submission.to_csv('knn8_9.62.csv', index=False)