In [1]:
import pandas as pd
import itertools
from sklearn.neighbors import KNeighborsRegressor, NearestNeighbors
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint as sp_randint
%matplotlib inline

In [2]:
# Raw order lines; the file is ';'-separated and "Month" is parsed as a datetime.
data = pd.read_csv(
    '/dsg/demand_anonymized_20170802.csv',
    sep=';',
    parse_dates=["Month"],
)
# Evaluation template. NOTE: this name shadows Python's builtin `eval`;
# it is kept because later cells refer to it.
eval = pd.read_csv('/dsg/eval.csv')

In [3]:
# features
# Flag every order line by the size of its customer: a Ship_To with more than
# 500 order lines is "corporate", everything else is "regular".
clients = data['Ship_To']
cvc = clients.value_counts()
is_large_client = cvc > 500
corporate_clients = cvc[is_large_client].index.tolist()
regular_clients = cvc[~is_large_client].index.tolist()
data['is_corporate'] = clients.isin(corporate_clients)
data['is_regular'] = clients.isin(regular_clients)

In [4]:
# create aggregated by month series
# One row per (SalOrg, Material, Month): summed quantity, number of order lines
# coming from corporate / regular clients, and the first PL value seen.
group_keys = ["SalOrg", "Material", "Month"]
monthly_aggregations = {
    'OrderQty': 'sum',
    'is_corporate': 'sum',
    'is_regular': 'sum',
    "PL": 'first',
}
series = (
    data.groupby(group_keys)
    .agg(monthly_aggregations)
    .reset_index()
    .sort_values(by=group_keys)
)
series.head()

Unnamed: 0,SalOrg,Material,Month,PL,is_corporate,OrderQty,is_regular
0,97LK,00IYcj,2012-05-01,ss6l,1.0,2,1.0
1,97LK,00IYcj,2012-06-01,ss6l,2.0,13,0.0
2,97LK,00IYcj,2012-07-01,ss6l,1.0,1,0.0
3,97LK,00IYcj,2012-09-01,ss6l,2.0,30,0.0
4,97LK,00IYcj,2012-11-01,ss6l,1.0,1,0.0


In [5]:
def agg_features(data, columns):
    """Attach per-group, per-month ``OrderQty`` statistics to every row.

    Rows are grouped by ``columns + ['Month']`` and the statistics
    size/min/max/sum/std/mean/median of ``OrderQty`` are computed for each
    group, then joined back onto ``data`` with a left merge.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named in ``columns`` plus ``Month`` and
        ``OrderQty``.
    columns : list of str
        Grouping columns (``Month`` is always added to them).

    Returns
    -------
    (pandas.DataFrame, list of str)
        A new frame with the statistic columns appended, and the names of
        those columns. Names are ``"_".join(columns)`` immediately followed
        by the function name, e.g. ``"PLsize"``.
    """
    functions = ["size", "min", "max", "sum", "std", "mean", "median"]
    keys = list(columns) + ['Month']
    prefix = "_".join(columns)
    cols = [prefix + func for func in functions]

    stats = data.groupby(keys)["OrderQty"].agg(functions).reset_index()
    stats.columns = keys + cols

    # Join on the grouping keys only. Relying on "all common columns" would
    # also join on previously computed statistic columns when the cell is
    # re-run, so stale copies are dropped first and the keys are explicit.
    stale = [c for c in cols if c in data.columns]
    merged = data.drop(stale, axis=1).merge(stats, how='left', on=keys)
    return merged, cols

In [6]:
# fill empty
# Build the full grid (every Material/SalOrg pair requested in `eval`) x (every
# month present in `series`) so that months without any order become explicit
# zero rows.
eval_comb = list(set(map(tuple, eval[['Material', 'SalOrg']].values)))
months = list(series['Month'].unique())

comb = [
    (material, sal_org, month)
    for (material, sal_org), month in itertools.product(eval_comb, months)
]

# NOTE(review): fillna(0) is applied to every column, so PL also becomes 0 for
# the months that had no orders - confirm this is intended.
series2 = (
    pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
    .sort_values(by=['Material', 'SalOrg', 'Month'])
    .merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
    .fillna(0)
)
series = series2

In [7]:
# Append the monthly OrderQty statistics computed per "PL" value to every row;
# pl_agg receives the names of the columns that were added.
series, pl_agg = agg_features(series, ["PL"])

In [8]:
# Model log(1 + quantity); scoring and the final prediction undo this with np.expm1.
series['OrderQtyLog'] = np.log1p(series['OrderQty'])

In [14]:
# create lagged features
def lag_feature(df, colname, lag, adv, group_cols=('SalOrg', 'Material')):
    """Add lagged and advanced copies of ``colname`` computed within each group.

    ``df`` is modified in place (new columns are assigned on it) and is also
    returned for convenience.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``colname`` and the grouping columns. Shifts are
        positional, so this assumes rows are already in chronological order
        within each group.
    colname : str
        Column to shift.
    lag : int
        Creates ``colname(t-lag)`` ... ``colname(t-1)``, in that order.
    adv : int
        Creates ``colname(t+1)`` ... ``colname(t+adv-1)``; i.e. ``adv - 1``
        columns, and none at all when ``adv`` is 0 or 1.
    group_cols : sequence of str, optional
        Columns identifying one series. Defaults to ``('SalOrg', 'Material')``.

    Returns
    -------
    (pandas.DataFrame, list of str, list of str)
        The same ``df`` object, the lagged column names and the advanced
        column names.
    """
    # Build the grouping once; only the shift distance changes per column.
    grouped = df.groupby(by=list(group_cols))[colname]

    cols_lagged = []
    for i in range(lag, 0, -1):
        col_lagged = '{}(t-{})'.format(colname, i)
        df[col_lagged] = grouped.shift(i)
        cols_lagged.append(col_lagged)

    cols_adv = []
    for i in range(1, adv):
        col_adv = '{}(t+{})'.format(colname, i)
        df[col_adv] = grouped.shift(-i)
        cols_adv.append(col_adv)

    return df, cols_lagged, cols_adv

# 12 monthly lags of the target plus the t+1 / t+2 targets.
series, lagged_qty, adv = lag_feature(series, 'OrderQtyLog', 12, 3)
# Two lags of each client-type count; no advance columns.
series, lagged_corp, _ = lag_feature(series, 'is_corporate', 2, 0)
series, lagged_reg, _ = lag_feature(series, 'is_regular', 2, 0)

pl_agg_lagged = []
pl_agg_lagged_final = []
for pl_agg_col in pl_agg:
    series, pl_agg_lag, _ = lag_feature(series, pl_agg_col, 3, 0)
    pl_agg_lagged.extend(pl_agg_lag)
    # Window for the final forecast: drop the oldest lag and append the
    # current-month column, i.e. the same window moved one month forward.
    pl_agg_lag = pl_agg_lag[1:] + [pl_agg_col]
    pl_agg_lagged_final.extend(pl_agg_lag)

# Only the quantity lags are used as model inputs; the other candidates
# (lagged_corp, lagged_reg, pl_agg_lagged) are currently left out.
lagged = lagged_qty

In [15]:
# validation folds
def create_validation(data, date, m, horizon=3):
    """Return ``(train_index, test_index)`` for one time-based validation fold.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a datetime ``Month`` column.
    date : str or datetime-like
        First month of the hold-out window; parsed with ``pd.to_datetime``.
    m : int
        Look-back in months. The lower bound is exclusive
        (``date - m months < Month < date``), so with months stored as the
        first day of the month this selects ``m - 1`` months.
    horizon : int, optional
        Length of the hold-out window in months
        (``date <= Month < date + horizon months``). Defaults to 3.

    Returns
    -------
    (pandas.Index, pandas.Index)
        Index labels of the training rows and of the hold-out rows.
    """
    cutoff = pd.to_datetime(date)
    month = data["Month"]

    train_mask = (month > cutoff - pd.DateOffset(months=m)) & (month < cutoff)
    test_mask = (month >= cutoff) & (month < cutoff + pd.DateOffset(months=horizon))

    return data[train_mask].index, data[test_mask].index

# First month of each hold-out window.
validation_months = ['2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01']

# One (train_index, test_index) pair per validation month.
folds = [create_validation(series, start, 3) for start in validation_months]

In [16]:
# Show the feature columns that are fed to the model.
lagged

['OrderQtyLog(t-12)',
 'OrderQtyLog(t-11)',
 'OrderQtyLog(t-10)',
 'OrderQtyLog(t-9)',
 'OrderQtyLog(t-8)',
 'OrderQtyLog(t-7)',
 'OrderQtyLog(t-6)',
 'OrderQtyLog(t-5)',
 'OrderQtyLog(t-4)',
 'OrderQtyLog(t-3)',
 'OrderQtyLog(t-2)',
 'OrderQtyLog(t-1)']

In [17]:
# knn model
# Cross-validate a multi-output KNN regressor: the inputs are the lagged
# quantities, the targets are the current month plus the advance columns.
nn = 10
metric = 'canberra'

targets = ['OrderQtyLog'] + adv
all_errors = []
for train_idx, test_idx in folds:
    train, test = series.loc[train_idx], series.loc[test_idx]

    train_x, train_y = train[lagged], train[targets]
    test_x, test_y = test[lagged], test[targets]

    model = KNeighborsRegressor(n_neighbors=nn, metric=metric, n_jobs=8)
    model.fit(train_x, train_y)
    test_y_predicted = model.predict(test_x)

    # Score on the original quantity scale, not on the log scale.
    error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
    print('Fold error: {}'.format(error))
    all_errors.append(error)

np.mean(all_errors)

Fold error: 11.61452034653719
Fold error: 11.583466877724211
Fold error: 10.534432802659632
Fold error: 10.33349956942523


11.016479899086566

7 Fold error: 11.696891070946064
Fold error: 11.65038871982667
Fold error: 10.684259300479093
Fold error: 10.490920642188568
11.1306149333601

6 Fold error: 11.725605605485962
Fold error: 11.711877732517317

5 Fold error: 11.789457282174423
Fold error: 11.742792806595952
Fold error: 10.972295577868897
Fold error: 10.747072851463564

4
Fold error: 11.894426344531501
Fold error: 11.857534034559249
Fold error: 11.060203934968207
Fold error: 10.960681228838304

In [17]:
# final model
# Fit on the hold-out rows of the last validation fold. This is the same data
# the previous version picked up through the `test` variable leaked from the
# cross-validation loop; selecting it explicitly removes that hidden dependency.
# NOTE(review): this is NOT the whole dataset - confirm that training the final
# model on this window only is intended.
final_train = series.loc[folds[-1][1]]
train_x = final_train[lagged]
train_y = final_train[['OrderQtyLog'] + adv]
model = KNeighborsRegressor(n_neighbors=nn, metric=metric, n_jobs=-1).fit(train_x, train_y)

In [None]:
# Feature windows for the final forecast: each window is the training window
# moved one month forward (drop the oldest lag, append the current month).
lagged_corp_final = lagged_corp[1:] + ['is_corporate']
lagged_reg_final = lagged_reg[1:] + ['is_regular']
lagged_final_qty = lagged_qty[1:] + ['OrderQtyLog']

# The model is fitted on `lagged`, which contains only the quantity lags, so
# the prediction features must have the same composition. Feeding the extra
# corporate/regular/PL columns would make `model.predict` fail because the
# number of features would differ from the one seen during fit.
lagged_final = lagged_final_qty  # + lagged_corp_final + lagged_reg_final + pl_agg_lagged_final
assert len(lagged_final) == len(lagged), "prediction features must mirror the training features"
lagged_final

In [19]:
# Forecast origin: the rows of 2017-03 supply the most recent feature window.
forecast_origin = pd.to_datetime('2017-03-01')
test_i = series.loc[series["Month"] == forecast_origin].index

test_x = series.loc[test_i, lagged_final]
test_prediction = model.predict(test_x)
# Back from log1p space to quantities.
prediction = np.expm1(test_prediction)
prediction

array([[ 0.21901365,  0.        ,  0.10408951],
       [ 3.29314552,  5.02525211,  2.86001491],
       [ 1.84591343,  2.01562669,  2.48473075],
       ..., 
       [ 0.81144733,  0.73851051,  0.42616164],
       [ 2.58964584,  2.55670217,  2.83031543],
       [ 0.38949549,  0.29170834,  0.64067071]])

In [20]:
# Attach the three forecast horizons (one prediction column each) to the
# forecast-origin rows.
tst = series.loc[test_i].copy()
for step, column in enumerate(['OrderQty(t+1)', 'OrderQty(t+2)', 'OrderQty(t+3)']):
    tst[column] = prediction[:, step]

In [21]:
# Reshape to long format: one row per (Material, SalOrg, forecast column).
ml = pd.melt(tst, id_vars=['Material', 'SalOrg'], value_vars=['OrderQty(t+1)', 'OrderQty(t+2)','OrderQty(t+3)'])

In [22]:
# Translate each forecast column into the calendar month it stands for.
month_by_horizon = {
    'OrderQty(t+1)': '2017-04',
    'OrderQty(t+2)': '2017-05',
    'OrderQty(t+3)': '2017-06',
}
ml['date'] = ml['variable'].replace(month_by_horizon)
ml.head()

Unnamed: 0,Material,SalOrg,variable,value,date
0,00GB1f,yqSu,OrderQty(t+1),0.219014,2017-04
1,00IYcj,97LK,OrderQty(t+1),3.293146,2017-04
2,00IYcj,OQfZ,OrderQty(t+1),1.845913,2017-04
3,00IYcj,U12J,OrderQty(t+1),17.150636,2017-04
4,00IYcj,yqSu,OrderQty(t+1),22.416785,2017-04


In [23]:
# Match every requested (Material, SalOrg, date) in the evaluation template
# with its forecast (default inner join).
result = pd.merge(eval, ml, on=['Material', 'SalOrg', 'date'])
result

Unnamed: 0,ID,SalOrg,Material,date,variable,value
0,0,97LK,00IYcj,2017-04,OrderQty(t+1),3.293146
1,1,97LK,00lqzT,2017-04,OrderQty(t+1),2.142143
2,2,97LK,00MFcK,2017-04,OrderQty(t+1),0.000000
3,3,97LK,00mt9e,2017-04,OrderQty(t+1),2.257002
4,4,97LK,00Ok8y,2017-04,OrderQty(t+1),0.661809
5,5,97LK,00W03x,2017-04,OrderQty(t+1),0.668510
6,6,97LK,02jRc8,2017-04,OrderQty(t+1),0.000000
7,7,97LK,02QhQT,2017-04,OrderQty(t+1),0.000000
8,8,97LK,0355Np,2017-04,OrderQty(t+1),0.291708
9,9,97LK,03eCyI,2017-04,OrderQty(t+1),281.444607


In [25]:
# The submission holds the forecast under the name "demand", keyed by ID.
result['demand'] = result['value']
result.to_csv('knn7_11.13.csv', columns=['ID', 'demand'], index=False)