In [None]:
import pandas as pd
import itertools
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
%matplotlib inline

In [2]:
# Raw order lines: a ';'-separated file whose "Month" column is parsed to datetime on load.
data = pd.read_csv(
    '/dsg/demand_anonymized_20170802.csv',
    sep=';',
    parse_dates=["Month"],
)
# Evaluation rows that the forecasts are merged onto at the end of the notebook.
# NOTE: this name shadows the builtin `eval`; later cells read it under this name.
eval = pd.read_csv('/dsg/eval.csv')

In [26]:
# Monthly series: total OrderQty per (SalOrg, Material, Month), ordered by those keys.
series = (
    data.groupby(["SalOrg", "Material", "Month"])["OrderQty"]
    .sum()
    .reset_index()
    .sort_values(by=["SalOrg", "Material", "Month"])
)
series.head()

Unnamed: 0,SalOrg,Material,Month,OrderQty
0,97LK,00IYcj,2012-05-01,2
1,97LK,00IYcj,2012-06-01,13
2,97LK,00IYcj,2012-07-01,1
3,97LK,00IYcj,2012-09-01,30
4,97LK,00IYcj,2012-11-01,1


In [27]:
# Build a dense grid: every (Material, SalOrg) pair from the evaluation file
# crossed with every month seen in the aggregated series. Months without
# orders get a quantity of 0.
eval_comb = list({tuple(pair) for pair in eval[['Material', 'SalOrg']].values})

comb = [
    (material, sal_org, month)
    for (material, sal_org), month in itertools.product(eval_comb, list(series['Month'].unique()))
]

series2 = (
    pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
    .sort_values(by=['Material', 'SalOrg', 'Month'])
    .merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
    .fillna(0)
)

In [28]:
# Model on log(1 + quantity); forecasts are mapped back with expm1 later on.
series2 = series2.assign(OrderQtyLog=np.log1p(series2['OrderQty']))

In [31]:
# create lagged features
def lag_feature(df, colname, lag, adv):
    """Add shifted copies of ``colname`` to ``df``, computed per (SalOrg, Material).

    Columns ``'<colname>(t-lag)'`` ... ``'<colname>(t-1)'`` hold the values of the
    previous rows of the same group; columns ``'<colname>(t+1)'`` ...
    ``'<colname>(t+adv-1)'`` hold the values of the following rows. Shifting is
    positional, so rows are expected to be in time order within each group.

    ``df`` is modified in place and also returned.

    Returns
    -------
    tuple
        ``(df, lagged_column_names, advanced_column_names)``; the lagged names
        are ordered from the oldest lag to the most recent one.
    """
    group_keys = ['SalOrg', 'Material']

    def _add_shifted(name, periods):
        # Positive periods look back in time, negative periods look ahead.
        df[name] = df.groupby(by=group_keys)[colname].shift(periods)
        return name

    cols_lagged = [
        _add_shifted('{}(t-{})'.format(colname, back), back)
        for back in range(lag, 0, -1)
    ]
    cols_adv = [
        _add_shifted('{}(t+{})'.format(colname, ahead), -ahead)
        for ahead in range(1, adv)
    ]
    return df, cols_lagged, cols_adv

# 12 lag columns serve as model inputs; the look-ahead columns together with
# OrderQtyLog itself are the prediction targets.
# Note that `series` is rebound here to the dense, feature-enriched frame.
series, lagged, adv = lag_feature(df=series2, colname='OrderQtyLog', lag=12, adv=3)
series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t+1),OrderQtyLog(t+2),OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1)
0,00GB1f,yqSu,2012-01-01,0.0,0.0,0.0,0.693147,,,,,,,,,,,,
1,00GB1f,yqSu,2012-02-01,0.0,0.0,0.693147,1.386294,,,,,,,,,,,,0.0
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,1.386294,1.098612,,,,,,,,,,,0.0,0.0
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,1.098612,0.693147,,,,,,,,,,0.0,0.0,0.693147
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,0.693147,1.386294,,,,,,,,,0.0,0.0,0.693147,1.386294


In [32]:
# validation folds
def create_validation(data, date, m, horizon=3):
    """Return the row labels of one train/validation split around ``date``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime-like "Month" column.
    date : str or datetime-like
        First month of the validation window (anything ``pd.to_datetime`` accepts).
    m : int
        The training window is the open interval (date - m months, date); with
        month-start dates it therefore covers the ``m - 1`` months before ``date``.
    horizon : int, default 3
        Length of the validation window in months: [date, date + horizon).

    Returns
    -------
    tuple of pandas.Index
        ``(train_index, validation_index)``, both taken from ``data.index``.
    """
    start = pd.to_datetime(date)  # parse once and reuse for all four bounds
    months = data["Month"]

    in_train = (months > start - pd.DateOffset(months=m)) & (months < start)
    in_validation = (months >= start) & (months < start + pd.DateOffset(months=horizon))
    return data[in_train].index, data[in_validation].index

# First month of each validation window; each fold is a (train_index, validation_index) pair.
validation_months = ['2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01']

folds = [create_validation(series, month, 4) for month in validation_months]

In [33]:
# Sanity check: show the first and last month covered by each part of every fold.
for train_idx, valid_idx in folds:
    train_months = series.loc[train_idx, "Month"]
    valid_months = series.loc[valid_idx, "Month"]
    print("Train:", str(train_months.min())[:10], "–", str(train_months.max())[:10],
          "    Validation:", str(valid_months.min())[:10], "–", str(valid_months.max())[:10])

Train: 2016-05-01 – 2016-07-01     Validation: 2016-08-01 – 2016-10-01
Train: 2016-06-01 – 2016-08-01     Validation: 2016-09-01 – 2016-11-01
Train: 2016-07-01 – 2016-09-01     Validation: 2016-10-01 – 2016-12-01
Train: 2016-08-01 – 2016-10-01     Validation: 2016-11-01 – 2017-01-01


In [34]:
# knn model
# For each fold: fit a multi-output KNN that maps the lag columns to the
# current month plus the look-ahead columns, predict the validation rows and
# report the mean absolute error on the original quantity scale.
nn = 3
all_errors = []
targets = ['OrderQtyLog'] + adv

for train_idx, test_idx in folds:
    train, test = series.loc[train_idx], series.loc[test_idx]

    train_x, train_y = train[lagged], train[targets]
    test_x, test_y = test[lagged], test[targets]

    model = KNeighborsRegressor(n_neighbors=nn, metric='canberra', n_jobs=-1)
    model.fit(train_x, train_y)
    test_y_predicted = model.predict(test_x)

    # Targets are log1p-transformed, so undo that before measuring the error.
    error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
    print('Fold error: {}'.format(error))
    all_errors.append(error)

np.mean(all_errors)

Fold error: 11.716928436456696
Fold error: 11.851716093985331
Fold error: 10.765838288385781
Fold error: 10.430166251141545


11.191162267492338

In [45]:
# final model
# Fit on the validation window of the last fold (2016-11 to 2017-01 according
# to the fold printout). These are the same rows the loop variable `test` is
# left holding after the validation loop; selecting them through `folds` keeps
# this cell independent of that leftover loop state.
# NOTE(review): the previous comment said "all dataset", but the code only ever
# trained on this window; confirm whether a wider window was intended.
final_rows = series.loc[folds[-1][1]]
train_x = final_rows[lagged]
train_y = final_rows[['OrderQtyLog'] + adv]
model = KNeighborsRegressor(n_neighbors=nn, metric='canberra', n_jobs=-1).fit(train_x, train_y)

In [None]:
# Shift the feature window one month forward for the final forecast: drop the
# oldest lag and use the current month's value as the newest feature.
# `lagged_final` is deliberately the same list object as `lagged`, because the
# prediction cell below selects its columns through `lagged`.
# NOTE: the cells that fit models also read `lagged`, so they must run before
# this one.
lagged_final = lagged
if 'OrderQtyLog' not in lagged_final:  # guard: re-running must not shift the window again
    lagged_final.append('OrderQtyLog')
    lagged_final.pop(0)
lagged_final

In [72]:
# Forecast from the feature rows of March 2017.
test_i = series[series["Month"] == pd.to_datetime('2017-03-01')].index

test_x = series.loc[test_i, lagged]
# Pass the bare array: the column list in `lagged` was shifted after the model
# was fitted, and scikit-learn versions that validate feature names reject a
# DataFrame whose column names differ from the ones seen during fit.
test_prediction = model.predict(test_x.values)
prediction = np.expm1(test_prediction)  # back from the log1p scale to quantities
prediction

array([[ 0.12246205,  0.        ,  0.12246205],
       [ 2.32200191,  5.02947112,  2.83716505],
       [ 1.44948974,  1.68894529,  1.83574532],
       ..., 
       [ 1.        ,  0.58740105,  0.25992105],
       [ 2.1408356 ,  2.48775056,  2.46410162],
       [ 0.46779927,  0.34800615,  0.78179744]])

In [75]:
# Copy the March 2017 rows and attach one column per forecast step
# (the columns of `prediction`, in order).
tst = series.loc[test_i].copy().assign(**{
    'OrderQty(t+1)': prediction[:, 0],
    'OrderQty(t+2)': prediction[:, 1],
    'OrderQty(t+3)': prediction[:, 2],
})

In [76]:
# Preview the March 2017 rows together with the three forecast columns.
tst.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t+1),OrderQtyLog(t+2),OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),...,OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1),OrderQty(t+1),OrderQty(t+2),OrderQty(t+3)
62,00GB1f,yqSu,2017-03-01,0.0,0.0,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.693147,0.0,0.0,0.122462,0.0,0.122462
125,00IYcj,97LK,2017-03-01,1.0,0.693147,,,2.302585,0.0,0.693147,...,1.94591,1.94591,0.0,2.397895,2.302585,2.197225,2.639057,2.322002,5.029471,2.837165
188,00IYcj,OQfZ,2017-03-01,1.0,0.693147,,,3.091042,2.772589,0.693147,...,2.639057,0.693147,2.70805,1.609438,2.484907,3.218876,2.397895,1.44949,1.688945,1.835745
251,00IYcj,U12J,2017-03-01,18.0,2.944439,,,3.044522,2.197225,2.639057,...,2.639057,2.302585,3.135494,2.564949,2.302585,2.833213,2.944439,17.349843,14.897754,16.763361
314,00IYcj,yqSu,2017-03-01,39.0,3.688879,,,2.564949,1.94591,2.639057,...,1.94591,3.496508,3.218876,2.995732,3.713572,3.295837,3.401197,20.329688,32.785184,31.036805


In [84]:
# Long format: one row per (Material, SalOrg, forecast step), with the step
# name in "variable" and the forecast in "value".
ml = pd.melt(
    tst,
    id_vars=['Material', 'SalOrg'],
    value_vars=['OrderQty(t+1)', 'OrderQty(t+2)', 'OrderQty(t+3)'],
)

In [85]:
# Translate each forecast step into the month label used by the evaluation
# file, then drop the step name.
step_to_month = {'OrderQty(t+1)':'2017-04', 'OrderQty(t+2)':'2017-05', 'OrderQty(t+3)':'2017-06'}
ml['date'] = ml['variable'].replace(step_to_month)
ml = ml.drop('variable', axis=1)
ml.head()

Unnamed: 0,Material,SalOrg,variable,value,date
0,00GB1f,yqSu,OrderQty(t+1),0.122462,2017-04
1,00IYcj,97LK,OrderQty(t+1),2.322002,2017-04
2,00IYcj,OQfZ,OrderQty(t+1),1.44949,2017-04
3,00IYcj,U12J,OrderQty(t+1),17.349843,2017-04
4,00IYcj,yqSu,OrderQty(t+1),20.329688,2017-04


In [88]:
# Preview the evaluation rows that the forecasts get merged onto.
eval.head()

Unnamed: 0,ID,SalOrg,Material,date
0,0,97LK,00IYcj,2017-04
1,1,97LK,00lqzT,2017-04
2,2,97LK,00MFcK,2017-04
3,3,97LK,00mt9e,2017-04
4,4,97LK,00Ok8y,2017-04


In [87]:
# Attach a forecast to every evaluation row.
# A left join keeps all evaluation rows, so a (Material, SalOrg, date) key
# without a forecast is caught by the check below. The default inner join
# would silently drop such rows from the submission.
result = eval.merge(ml, on=['Material', 'SalOrg', 'date'], how='left')
missing = result['value'].isnull().sum()
assert missing == 0, '{} evaluation rows have no forecast'.format(missing)
result

Unnamed: 0,ID,SalOrg,Material,date,variable,value
0,0,97LK,00IYcj,2017-04,OrderQty(t+1),2.322002
1,1,97LK,00lqzT,2017-04,OrderQty(t+1),2.387859
2,2,97LK,00MFcK,2017-04,OrderQty(t+1),0.000000
3,3,97LK,00mt9e,2017-04,OrderQty(t+1),2.301927
4,4,97LK,00Ok8y,2017-04,OrderQty(t+1),0.808609
5,5,97LK,00W03x,2017-04,OrderQty(t+1),0.817121
6,6,97LK,02jRc8,2017-04,OrderQty(t+1),0.000000
7,7,97LK,02QhQT,2017-04,OrderQty(t+1),0.000000
8,8,97LK,0355Np,2017-04,OrderQty(t+1),0.348006
9,9,97LK,03eCyI,2017-04,OrderQty(t+1),331.560902


In [91]:
# Submission file: the evaluation ID and the forecast quantity under the name "demand".
result = result.assign(demand=result['value'])
result[['ID', 'demand']].to_csv('knn6.csv', index=False)