In [1]:
import pandas as pd
import itertools
from sklearn.linear_model import Ridge
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
%matplotlib inline

In [2]:
data = pd.read_csv('/dsg/demand_anonymized_20170802.csv', delimiter=';', parse_dates=["Month"])
eval = pd.read_csv('/dsg/eval.csv')

In [3]:
# create aggregated by month series
series = data.groupby(["SalOrg", "Material", "Month"])["OrderQty"].sum().reset_index()
series = series.sort_values(by=["SalOrg", "Material", "Month"])
series.head()

Unnamed: 0,SalOrg,Material,Month,OrderQty
0,97LK,00IYcj,2012-05-01,2
1,97LK,00IYcj,2012-06-01,13
2,97LK,00IYcj,2012-07-01,1
3,97LK,00IYcj,2012-09-01,30
4,97LK,00IYcj,2012-11-01,1


In [4]:
# fill empty
eval_comb = eval[['Material', 'SalOrg']]
eval_comb = list(set([tuple(x) for x in eval_comb.values]))

comb = list(itertools.product(*[eval_comb, list(series['Month'].unique())]))
comb = [(t[0], t[1], m) for t, m in comb]

series2 = pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
series2 = series2.sort_values(by=['Material', 'SalOrg', 'Month' ])
series2 = series2.merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
series2 = series2.fillna(0)

In [5]:
series2['OrderQtyLog'] = np.log1p(series2['OrderQty'])

In [6]:
# create lagged features
def lag_feature(df, colname, lag, adv):
    cols_lagged = []
    for i in range(lag, 0, -1):
        col_lagged = '{}(t-{})'.format(colname, i)
        df[col_lagged] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(i)
        cols_lagged.append(col_lagged)
    cols_adv = []
    for i in range(1, adv):
        col_adv = '{}(t+{})'.format(colname, i)
        df[col_adv] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(-i)
        cols_adv.append(col_adv) 
        
    return df, cols_lagged, cols_adv

series, lagged, adv = lag_feature(series2, 'OrderQtyLog', 12, 3)
series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1),OrderQtyLog(t+1),OrderQtyLog(t+2)
0,00GB1f,yqSu,2012-01-01,0.0,0.0,,,,,,,,,,,,,0.0,0.693147
1,00GB1f,yqSu,2012-02-01,0.0,0.0,,,,,,,,,,,,0.0,0.693147,1.386294
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,,,,,,,,,,,0.0,0.0,1.386294,1.098612
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,,,,,,,,,,0.0,0.0,0.693147,1.098612,0.693147
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,,,,,,,,,0.0,0.0,0.693147,1.386294,0.693147,1.386294


In [7]:
def create_agg_features(data, columns, name):
    temp = data.loc[(data["Month"] > pd.to_datetime('2014-09-01'))&
                    (data["Month"] < pd.to_datetime('2016-09-01'))].groupby(columns)["OrderQty"].agg(["median", "mean", "std", "min", "max"]).reset_index()
    new_cols = [col + name for col in ["median", "mean", "std", "min", "max"]]
    temp.columns = columns + new_cols
    return temp, new_cols

sm_temp, sm_cols = create_agg_features(series, ["SalOrg", "Material"], "_s_m_")
series = series.merge(sm_temp, how='left', on=["SalOrg", "Material"])

s_temp, s_cols = create_agg_features(series, ["SalOrg"], "_s_")
series = series.merge(s_temp, how='left', on=["SalOrg"])

m_temp, m_cols = create_agg_features(series, ["Material"], "_m_")
series = series.merge(m_temp, how='left', on=["Material"])

series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),OrderQtyLog(t-10),OrderQtyLog(t-9),OrderQtyLog(t-8),...,median_s_,mean_s_,std_s_,min_s_,max_s_,median_m_,mean_m_,std_m_,min_m_,max_m_
0,00GB1f,yqSu,2012-01-01,0.0,0.0,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
1,00GB1f,yqSu,2012-02-01,0.0,0.0,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
2,00GB1f,yqSu,2012-03-01,1.0,0.693147,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
3,00GB1f,yqSu,2012-04-01,3.0,1.386294,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0
4,00GB1f,yqSu,2012-05-01,2.0,1.098612,,,,,,...,0.0,25.389641,202.069177,0.0,13540.0,0.0,0.26087,0.448978,0.0,1.0


In [8]:
# validation folds
def create_validation(data, date, m):
    return data[(data["Month"] > pd.to_datetime(date) - relativedelta(months=m))&
                (data["Month"] < pd.to_datetime(date))].index, data[(data["Month"] >= pd.to_datetime(date)) & 
                (data["Month"] < pd.to_datetime(date) + relativedelta(months=3))].index

validation_months = ['2016-09-01', '2016-10-01', '2016-11-01']

folds = []
for month in validation_months:
    fold = create_validation(series, month, 3)
    folds.append(fold)

In [9]:
all = len(series.groupby(by=['SalOrg', 'Material']))

In [10]:
lagged

['OrderQtyLog(t-12)',
 'OrderQtyLog(t-11)',
 'OrderQtyLog(t-10)',
 'OrderQtyLog(t-9)',
 'OrderQtyLog(t-8)',
 'OrderQtyLog(t-7)',
 'OrderQtyLog(t-6)',
 'OrderQtyLog(t-5)',
 'OrderQtyLog(t-4)',
 'OrderQtyLog(t-3)',
 'OrderQtyLog(t-2)',
 'OrderQtyLog(t-1)']

In [11]:
errors = {}
all_errors = []
features = lagged + sm_cols + m_cols + s_cols

for fold in folds:
    fold_errors = []
    
    processed = 1
    for index, group in series.groupby(by=['SalOrg', 'Material']):
        if index not in errors:
            errors[index] = []
        
        train_s = group.loc[group.index.intersection(fold[0])]
        test_s = group.loc[group.index.intersection(fold[1])]
        
        train_y = train_s[['OrderQtyLog'] + adv]
        train_x = train_s[features]

        test_y = test_s[['OrderQtyLog'] + adv]
        test_x = test_s[features]

        model = Ridge(alpha=8)
        model.fit(train_x, train_y)

        test_y_predicted = model.predict(test_x)
        test_y_predicted[test_y_predicted < 0] = 0

        error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
        
        errors[index].append(error)
        all_errors.append(error)
        fold_errors.append(error)
        
        processed += 1
        if(processed % 1000 == 0):
            print('Trained {}/{} models, mean fold error: {}'.format(processed, all, np.mean(fold_errors)))
        
    print('Mean fold error: {}'.format(np.mean(fold_errors)))
    
np.mean(all_errors)

Trained 1000/38676 models, mean fold error: 14.871181051397219
Trained 2000/38676 models, mean fold error: 15.790178004990246
Trained 3000/38676 models, mean fold error: 13.113671018414042
Trained 4000/38676 models, mean fold error: 13.695242949161422
Trained 5000/38676 models, mean fold error: 13.571079221651452
Trained 6000/38676 models, mean fold error: 13.982033366752134
Trained 7000/38676 models, mean fold error: 13.067143519749418
Trained 8000/38676 models, mean fold error: 14.02602147943628
Trained 9000/38676 models, mean fold error: 13.696219257240436
Trained 10000/38676 models, mean fold error: 13.422000251931541
Trained 11000/38676 models, mean fold error: 13.22391869589784
Trained 12000/38676 models, mean fold error: 12.76086203452854
Trained 13000/38676 models, mean fold error: 12.179530383269386
Trained 14000/38676 models, mean fold error: 11.899755375910184
Trained 15000/38676 models, mean fold error: 11.870829038588553
Trained 16000/38676 models, mean fold error: 11.5980

11.982792696511956

In [12]:
models = {}
for index, group in series.groupby(by=['SalOrg', 'Material']):
    train = group.loc[group.index.intersection(fold[1])]

    train_y = train[['OrderQtyLog'] + adv]
    train_x = train[features]

    model = Ridge(alpha=8)
    model.fit(train_x, train_y)
    models[index] = model

In [13]:
lagged_final = lagged
lagged_final.append('OrderQtyLog')
lagged_final.pop(0)
fetures_final = lagged_final + sm_cols + m_cols + s_cols
fetures_final

['OrderQtyLog(t-11)',
 'OrderQtyLog(t-10)',
 'OrderQtyLog(t-9)',
 'OrderQtyLog(t-8)',
 'OrderQtyLog(t-7)',
 'OrderQtyLog(t-6)',
 'OrderQtyLog(t-5)',
 'OrderQtyLog(t-4)',
 'OrderQtyLog(t-3)',
 'OrderQtyLog(t-2)',
 'OrderQtyLog(t-1)',
 'OrderQtyLog',
 'median_s_m_',
 'mean_s_m_',
 'std_s_m_',
 'min_s_m_',
 'max_s_m_',
 'median_m_',
 'mean_m_',
 'std_m_',
 'min_m_',
 'max_m_',
 'median_s_',
 'mean_s_',
 'std_s_',
 'min_s_',
 'max_s_']

In [37]:
test_i = series[series["Month"]==pd.to_datetime('2017-03-01')].index

predictions = []
for index, group in series.groupby(by=['SalOrg', 'Material']):
    model = models[index]
    test = group.loc[group.index.intersection(test_i)]
    test_x = test[fetures_final]
    test_prediction = model.predict(test_x)
    prediction = np.expm1(test_prediction)
    prediction[prediction < 0.0] = 0
    row = list(index) + prediction[0].tolist()
    predictions.append(row)
    #print('Prediction done: {}'.format(index))
predictions

[['97LK', '00IYcj', 9.211688668632357, 9.907924080734754, 5.077524719308683],
 ['97LK', '00MFcK', 0.28234277251735596, 0.0, 0.0],
 ['97LK',
  '00Ok8y',
  0.32444846563687674,
  0.1941828634599879,
  0.2645153230257497],
 ['97LK',
  '00W03x',
  0.7099759466766971,
  0.7099759466766971,
  0.7099759466766971],
 ['97LK', '00lqzT', 3.7536915611244286, 2.555910636105223, 2.577584219888367],
 ['97LK',
  '00mt9e',
  1.4883370469764001,
  2.8278694156765947,
  0.09736978883405385],
 ['97LK', '02QhQT', 0.0, 0.0, 0.0],
 ['97LK',
  '02jRc8',
  0.5874010519681994,
  0.5874010519681994,
  0.5874010519681994],
 ['97LK', '0355Np', 0.0, 0.2867266156331133, 0.6165701383494209],
 ['97LK', '03GYN7', 0.0, 0.25992104989487314, 0.25992104989487314],
 ['97LK', '03eCyI', 366.5498014233658, 284.83118611021644, 360.9356475178604],
 ['97LK', '041azJ', 2.877607048405115, 0.0, 0.2932719298802463],
 ['97LK', '04ocTC', 0.7980727667317005, 1.5535601038460833, 2.273651957745644],
 ['97LK', '055wMw', 0.0, 0.0, 0.0],
 ['

In [46]:
pre_df = pd.DataFrame(predictions, columns=['SalOrg', 'Material', '2017-04', '2017-05', '2017-06'])
ml = pd.melt(pre_df, id_vars=['Material', 'SalOrg'], value_vars=['2017-04', '2017-05', '2017-06'])
ml['date'] = ml['variable']
del ml['variable']
ml['demand'] = ml['value']
del ml['value']
ml.head()

Unnamed: 0,Material,SalOrg,date,demand
0,00IYcj,97LK,2017-04,9.211689
1,00MFcK,97LK,2017-04,0.282343
2,00Ok8y,97LK,2017-04,0.324448
3,00W03x,97LK,2017-04,0.709976
4,00lqzT,97LK,2017-04,3.753692


In [47]:
result = eval.merge(ml, on=['Material', 'SalOrg', 'date'])

In [51]:
result[['ID', 'demand']].to_csv('linear_models_11.98.csv', index=False)