In [1]:
import pandas as pd
import itertools
from sklearn.linear_model import Ridge
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
%matplotlib inline



In [2]:
data = pd.read_csv('/dsg/demand_anonymized_20170802.csv', delimiter=';', parse_dates=["Month"])
eval = pd.read_csv('/dsg/eval.csv')

In [3]:
# create aggregated by month series
series = data.groupby(["SalOrg", "Material", "Month"])["OrderQty"].sum().reset_index()
series = series.sort_values(by=["SalOrg", "Material", "Month"])
series.head()

Unnamed: 0,SalOrg,Material,Month,OrderQty
0,97LK,00IYcj,2012-05-01,2
1,97LK,00IYcj,2012-06-01,13
2,97LK,00IYcj,2012-07-01,1
3,97LK,00IYcj,2012-09-01,30
4,97LK,00IYcj,2012-11-01,1


In [4]:
# fill empty
eval_comb = eval[['Material', 'SalOrg']]
eval_comb = list(set([tuple(x) for x in eval_comb.values]))

comb = list(itertools.product(*[eval_comb, list(series['Month'].unique())]))
comb = [(t[0], t[1], m) for t, m in comb]

series2 = pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
series2 = series2.sort_values(by=['Material', 'SalOrg', 'Month' ])
series2 = series2.merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
series2 = series2.fillna(0)

In [5]:
series2['OrderQtyLog'] = np.log1p(series2['OrderQty'])

In [6]:
# create lagged features
def lag_feature(df, colname, lag, adv):
    cols_lagged = []
    for i in range(lag, 0, -1):
        col_lagged = '{}(t-{})'.format(colname, i)
        df[col_lagged] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(i)
        cols_lagged.append(col_lagged)
    cols_adv = []
    for i in range(1, adv):
        col_adv = '{}(t+{})'.format(colname, i)
        df[col_adv] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(-i)
        cols_adv.append(col_adv) 
        
    return df, cols_lagged, cols_adv

series, lagged, adv = lag_feature(series2, 'OrderQtyLog', 12, 3)
series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1),OrderQtyLog(t+1),OrderQtyLog(t+2)
0,00GB1f,yqSu,2012-01-01,0.0,0.0,,,,,,,,,,,,,0.0,0.693147
1,00GB1f,yqSu,2012-02-01,0.0,0.0,,,,,,,,,,,,0.0,0.693147,1.386294
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,,,,,,,,,,,0.0,0.0,1.386294,1.098612
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,,,,,,,,,,0.0,0.0,0.693147,1.098612,0.693147
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,,,,,,,,,0.0,0.0,0.693147,1.386294,0.693147,1.386294


In [7]:
# validation folds
def create_validation(data, date, m):
    return data[(data["Month"] > pd.to_datetime(date) - relativedelta(months=m))&
                (data["Month"] < pd.to_datetime(date))].index, data[(data["Month"] >= pd.to_datetime(date)) & 
                (data["Month"] < pd.to_datetime(date) + relativedelta(months=3))].index

validation_months = ['2016-08-01', '2016-09-01', '2016-10-01', '2016-11-01']

folds = []
for month in validation_months:
    folds.append(create_validation(series, month, 6))

In [8]:
all = len(series.groupby(by=['SalOrg', 'Material']))

In [10]:
errors = {}
all_errors = []

for fold in folds:   
    fold_errors = []
    
    processed = 1
    prev_mean_error = 1000
    for index, group in series.groupby(by=['SalOrg', 'Material']):
        #model_name = index[0] + '_' + index[1]
        if index not in errors:
            errors[index] = []
        
        train_s = group.loc[group.index.intersection(fold[0])]
        test_s = group.loc[group.index.intersection(fold[1])]
        
        assert (train_s['SalOrg'] == index[0]).all()
        assert (train_s['Material'] == index[1]).all()
        
        assert (test_s['SalOrg'] == index[0]).all()
        assert (test_s['Material'] == index[1]).all()
        
        train_y = train_s[['OrderQtyLog'] + adv]
        train_x = train_s[lagged]

        test_y = test_s[['OrderQtyLog'] + adv]
        test_x = test_s[lagged]

        model = Ridge(alpha=5)
        model.fit(train_x, train_y)

        test_y_predicted = model.predict(test_x)
        test_y_predicted[test_y_predicted < 0] = 0
        
        try:
            error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
        except:
            print(test_y)
            print(test_y_predicted)
        
        errors[index].append(error)
        all_errors.append(error)
        fold_errors.append(error)
        
        processed += 1
        if(processed % 1000 == 0):
            print('Trained {}/{} models, mean fold error: {}'.format(processed, all, np.mean(fold_errors)))
        
    print('Mean fold error: {}'.format(np.mean(fold_errors)))
    
np.mean(all_errors)

Trained 1000/38676 models, mean fold error: 12.02466462819433
Trained 2000/38676 models, mean fold error: 13.079447247778694
Trained 3000/38676 models, mean fold error: 10.909124072681525
Trained 4000/38676 models, mean fold error: 12.083152183257338
Trained 5000/38676 models, mean fold error: 12.081004247002847
Trained 6000/38676 models, mean fold error: 12.365692047743586
Trained 7000/38676 models, mean fold error: 11.516320367860095
Trained 8000/38676 models, mean fold error: 12.670369452124513
Trained 9000/38676 models, mean fold error: 12.308668348423506
Trained 10000/38676 models, mean fold error: 12.063126797799256
Trained 11000/38676 models, mean fold error: 12.922863119583964
Trained 12000/38676 models, mean fold error: 12.417167639032902
Trained 13000/38676 models, mean fold error: 11.835978349215397
Trained 14000/38676 models, mean fold error: 11.59470430953959
Trained 15000/38676 models, mean fold error: 11.5787469674509
Trained 16000/38676 models, mean fold error: 11.29581

15.010201198677116