In [17]:
import pandas as pd
import itertools
from sklearn.linear_model import Ridge
from dateutil.relativedelta import relativedelta
from sklearn.metrics import mean_absolute_error
import numpy as np
%matplotlib inline

In [2]:
data = pd.read_csv('/dsg/demand_anonymized_20170802.csv', delimiter=';', parse_dates=["Month"])
eval = pd.read_csv('/dsg/eval.csv')

In [6]:
# create aggregated by month series
series = data.groupby(["SalOrg", "Material", "Month"]).agg({ "OrderQty" : 'sum',
                                                             "PL": 'first',
                                                             "MktABC": 'first',
                                                             "SubFct": 'first' }).reset_index()
series = series.sort_values(by=["SalOrg", "Material", "Month"])
series.head()

Unnamed: 0,SalOrg,Material,Month,OrderQty,MktABC,PL,SubFct
0,97LK,00IYcj,2012-05-01,2,vegq,ss6l,PjXe
1,97LK,00IYcj,2012-06-01,13,vegq,ss6l,PjXe
2,97LK,00IYcj,2012-07-01,1,vegq,ss6l,PjXe
3,97LK,00IYcj,2012-09-01,30,vegq,ss6l,PjXe
4,97LK,00IYcj,2012-11-01,1,vegq,ss6l,PjXe


In [7]:
# fill empty
eval_comb = eval[['Material', 'SalOrg']]
eval_comb = list(set([tuple(x) for x in eval_comb.values]))

comb = list(itertools.product(*[eval_comb, list(series['Month'].unique())]))
comb = [(t[0], t[1], m) for t, m in comb]

series2 = pd.DataFrame(comb, columns=['Material', 'SalOrg', 'Month'])
series2 = series2.sort_values(by=['Material', 'SalOrg', 'Month' ])
series2 = series2.merge(series, on=['Month', 'Material', 'SalOrg'], how='left')
series2 = series2.fillna(0)

In [8]:
series2['OrderQtyLog'] = np.log1p(series2['OrderQty'])

In [9]:
# create lagged features
def lag_feature(df, colname, lag, adv):
    cols_lagged = []
    for i in range(lag, 0, -1):
        col_lagged = '{}(t-{})'.format(colname, i)
        df[col_lagged] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(i)
        cols_lagged.append(col_lagged)
    cols_adv = []
    for i in range(1, adv):
        col_adv = '{}(t+{})'.format(colname, i)
        df[col_adv] = df.groupby(by=['SalOrg', 'Material'])[colname].shift(-i)
        cols_adv.append(col_adv) 
        
    return df, cols_lagged, cols_adv

series, lagged, adv = lag_feature(series2, 'OrderQtyLog', 12, 3)
series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,MktABC,PL,SubFct,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),...,OrderQtyLog(t-8),OrderQtyLog(t-7),OrderQtyLog(t-6),OrderQtyLog(t-5),OrderQtyLog(t-4),OrderQtyLog(t-3),OrderQtyLog(t-2),OrderQtyLog(t-1),OrderQtyLog(t+1),OrderQtyLog(t+2)
0,00GB1f,yqSu,2012-01-01,0.0,0,0,0,0.0,,,...,,,,,,,,,0.0,0.693147
1,00GB1f,yqSu,2012-02-01,0.0,0,0,0,0.0,,,...,,,,,,,,0.0,0.693147,1.386294
2,00GB1f,yqSu,2012-03-01,1.0,ARa9,NmYB,PjXe,0.693147,,,...,,,,,,,0.0,0.0,1.386294,1.098612
3,00GB1f,yqSu,2012-04-01,3.0,ARa9,NmYB,PjXe,1.386294,,,...,,,,,,0.0,0.0,0.693147,1.098612,0.693147
4,00GB1f,yqSu,2012-05-01,2.0,ARa9,NmYB,PjXe,1.098612,,,...,,,,,0.0,0.0,0.693147,1.386294,0.693147,1.386294


In [10]:
def create_agg_features(data, columns, name):
    temp = data.loc[(data["Month"] > pd.to_datetime('2014-09-01'))&
                    (data["Month"] < pd.to_datetime('2016-09-01'))].groupby(columns)["OrderQty"].agg(["median", "mean", "std", "min", "max"]).reset_index()
    new_cols = [col + name for col in ["median", "mean", "std", "min", "max"]]
    temp.columns = columns + new_cols
    return temp, new_cols

sm_temp, sm_cols = create_agg_features(series, ["SalOrg", "Material"], "_s_m_")
series = series.merge(sm_temp, how='left', on=["SalOrg", "Material"])

s_temp, s_cols = create_agg_features(series, ["SalOrg"], "_s_")
series = series.merge(s_temp, how='left', on=["SalOrg"])

m_temp, m_cols = create_agg_features(series, ["Material"], "_m_")
series = series.merge(m_temp, how='left', on=["Material"])

pl_temp, pl_cols = create_agg_features(series, ["PL"], "_pl_")
series = series.merge(pl_temp, how='left', on=["PL"])

abc_temp, abc_cols = create_agg_features(series, ["MktABC"], "_abc_")
series = series.merge(abc_temp, how='left', on=["MktABC"])

sf_temp, sf_cols = create_agg_features(series, ["SubFct"], "_sf_")
series = series.merge(sf_temp, how='left', on=["SubFct"])

series.head()

Unnamed: 0,Material,SalOrg,Month,OrderQty,MktABC,PL,SubFct,OrderQtyLog,OrderQtyLog(t-12),OrderQtyLog(t-11),...,median_abc_,mean_abc_,std_abc_,min_abc_,max_abc_,median_sf_,mean_sf_,std_sf_,min_sf_,max_sf_
0,00GB1f,yqSu,2012-01-01,0.0,0,0,0,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00GB1f,yqSu,2012-02-01,0.0,0,0,0,0.0,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00GB1f,yqSu,2012-03-01,1.0,ARa9,NmYB,PjXe,0.693147,,,...,2.0,14.605571,64.357519,1.0,4280.0,4.0,49.59637,257.834315,1.0,13540.0
3,00GB1f,yqSu,2012-04-01,3.0,ARa9,NmYB,PjXe,1.386294,,,...,2.0,14.605571,64.357519,1.0,4280.0,4.0,49.59637,257.834315,1.0,13540.0
4,00GB1f,yqSu,2012-05-01,2.0,ARa9,NmYB,PjXe,1.098612,,,...,2.0,14.605571,64.357519,1.0,4280.0,4.0,49.59637,257.834315,1.0,13540.0


In [11]:
# validation folds
def create_validation(data, date, m):
    return data[(data["Month"] > pd.to_datetime(date) - relativedelta(months=m))&
                (data["Month"] < pd.to_datetime(date))].index, data[(data["Month"] >= pd.to_datetime(date)) & 
                (data["Month"] < pd.to_datetime(date) + relativedelta(months=3))].index

validation_months = ['2016-09-01', '2016-10-01', '2016-11-01']

folds = []
for month in validation_months:
    fold = create_validation(series, month, 3)
    folds.append(fold)

In [12]:
all = len(series.groupby(by=['SalOrg', 'Material']))

In [13]:
lagged

['OrderQtyLog(t-12)',
 'OrderQtyLog(t-11)',
 'OrderQtyLog(t-10)',
 'OrderQtyLog(t-9)',
 'OrderQtyLog(t-8)',
 'OrderQtyLog(t-7)',
 'OrderQtyLog(t-6)',
 'OrderQtyLog(t-5)',
 'OrderQtyLog(t-4)',
 'OrderQtyLog(t-3)',
 'OrderQtyLog(t-2)',
 'OrderQtyLog(t-1)']

In [None]:
errors = {}
all_errors = []
features = lagged + sm_cols + m_cols + s_cols + pl_cols + abc_cols + sf_cols

for fold in folds:
    fold_errors = []
    
    processed = 1
    for index, group in series.groupby(by=['SalOrg', 'Material']):
        if index not in errors:
            errors[index] = []
        
        train_s = group.loc[group.index.intersection(fold[0])]
        test_s = group.loc[group.index.intersection(fold[1])]
        
        train_y = np.array(train_s[['OrderQtyLog'] + adv])
        train_x = train_s[features]

        test_y = np.array(test_s[['OrderQtyLog'] + adv])
        test_x = test_s[features]

        model = Ridge(alpha=5)
        model.fit(train_x, train_y)
        
        test_y_predicted = model.predict(test_x)
        test_y_predicted[test_y_predicted < 0] = 0
        
        error = mean_absolute_error(np.expm1(test_y), np.expm1(test_y_predicted))
        
        errors[index].append(error)
        all_errors.append(error)
        fold_errors.append(error)
        
        processed += 1
        if(processed % 1000 == 0):
            print('Trained {}/{} models, mean fold error: {}'.format(processed, all, np.mean(fold_errors)))
        
    print('Mean fold error: {}'.format(np.mean(fold_errors)))
    
np.mean(all_errors)

Trained 1000/38676 models, mean fold error: 14.92130225899087
Trained 2000/38676 models, mean fold error: 58.486497787845074
Trained 3000/38676 models, mean fold error: 42.39123366974497
Trained 4000/38676 models, mean fold error: 35.78112653074848
Trained 5000/38676 models, mean fold error: 31.292148365894196
Trained 6000/38676 models, mean fold error: 28.758939207556818
Trained 7000/38676 models, mean fold error: 25.75633590720141
Trained 8000/38676 models, mean fold error: 25.157622772335525
Trained 9000/38676 models, mean fold error: 23.607894329486502
Trained 10000/38676 models, mean fold error: 22.391635741805928
Trained 11000/38676 models, mean fold error: 21.380079334604222
Trained 12000/38676 models, mean fold error: 20.24070914202316
Trained 13000/38676 models, mean fold error: 19.08778815857
Trained 14000/38676 models, mean fold error: 18.307349193914458
Trained 15000/38676 models, mean fold error: 17.844246991133137
Trained 16000/38676 models, mean fold error: 17.1976601242

55885467.953593098

In [None]:
models = {}
for index, group in series.groupby(by=['SalOrg', 'Material']):
    train = group.loc[group.index.intersection(fold[1])]

    train_y = train[['OrderQtyLog'] + adv]
    train_x = train[features]

    model = ElasticNet(alpha=3)
    model.fit(train_x, train_y)
    models[index] = model



In [None]:
lagged_final = lagged
lagged_final.append('OrderQtyLog')
lagged_final.pop(0)
fetures_final = lagged_final + sm_cols + m_cols + s_cols
fetures_final

In [None]:
test_i = series[series["Month"]==pd.to_datetime('2017-03-01')].index

predictions = []
for index, group in series.groupby(by=['SalOrg', 'Material']):
    model = models[index]
    test = group.loc[group.index.intersection(test_i)]
    test_x = test[fetures_final]
    test_prediction = model.predict(test_x)
    prediction = np.expm1(test_prediction)
    prediction[prediction < 0.0] = 0
    row = list(index) + prediction[0].tolist()
    predictions.append(row)
    #print('Prediction done: {}'.format(index))
predictions

In [None]:
pre_df = pd.DataFrame(predictions, columns=['SalOrg', 'Material', '2017-04', '2017-05', '2017-06'])
ml = pd.melt(pre_df, id_vars=['Material', 'SalOrg'], value_vars=['2017-04', '2017-05', '2017-06'])
ml['date'] = ml['variable']
del ml['variable']
ml['demand'] = ml['value']
del ml['value']
ml.head()

In [47]:
result = eval.merge(ml, on=['Material', 'SalOrg', 'date'])

In [51]:
result[['ID', 'demand']].to_csv('linear_models_11.98.csv', index=False)