In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import gc
import warnings
warnings.filterwarnings('ignore')
import joblib

sales = pd.read_csv('sales_train_evaluation.csv')
sales.name = 'sales'
calendar = pd.read_csv('calendar.csv')
calendar.name = 'calendar'
prices = pd.read_csv('sell_prices.csv')
prices.name = 'prices'

for d in range(1942,1970): # fill the test day(1942-1969)
    col = 'd_' + str(d)
    sales[col] = 0
    sales[col] = sales[col].astype(np.int16)
    
def downcast(df): # change the datatype of dataframe to save memory
    cols = df.dtypes.index.tolist()
    types = df.dtypes.values.tolist()
    for i,t in enumerate(types):
        if 'int' in str(t):
            if df[cols[i]].min() > np.iinfo(np.int8).min and df[cols[i]].max() < np.iinfo(np.int8).max:
                df[cols[i]] = df[cols[i]].astype(np.int8)
            elif df[cols[i]].min() > np.iinfo(np.int16).min and df[cols[i]].max() < np.iinfo(np.int16).max:
                df[cols[i]] = df[cols[i]].astype(np.int16)
            elif df[cols[i]].min() > np.iinfo(np.int32).min and df[cols[i]].max() < np.iinfo(np.int32).max:
                df[cols[i]] = df[cols[i]].astype(np.int32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.int64)
        elif 'float' in str(t):
            if df[cols[i]].min() > np.finfo(np.float16).min and df[cols[i]].max() < np.finfo(np.float16).max:
                df[cols[i]] = df[cols[i]].astype(np.float16)
            elif df[cols[i]].min() > np.finfo(np.float32).min and df[cols[i]].max() < np.finfo(np.float32).max:
                df[cols[i]] = df[cols[i]].astype(np.float32)
            else:
                df[cols[i]] = df[cols[i]].astype(np.float64)
        elif t == np.object:
            if cols[i] == 'date':
                df[cols[i]] = pd.to_datetime(df[cols[i]], format='%Y-%m-%d')
            else:
                df[cols[i]] = df[cols[i]].astype('category')
    return df  

sales = downcast(sales)
prices = downcast(prices)
calendar = downcast(calendar)

# tranform horizontal representation to vertical view
df = pd.melt(sales, id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], \
             var_name='d', value_name='sold').dropna()

# combine the three tables together
df = pd.merge(df, calendar, on='d', how='left')
df = pd.merge(df, prices, on=['store_id','item_id','wm_yr_wk'], how='left') 

# save the memory
df.d = df['d'].apply(lambda x: x.split('_')[1]).astype(np.int16)        
df.drop('date',axis=1,inplace=True)

# add lags of 1 2 3 7 14 28 days
lags = [1,2,3,7,14,28]
for lag in lags:
    df['sold_lag_'+str(lag)] = \
    df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'], \
               as_index=False)['sold'].shift(lag).astype(np.float16)
    
# add sliding window for 7 30 60 days by mean
for i in [7,30,60]:
    df['rolling_mean_'+str(i)] = \
    df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])\
    ['sold'].transform(lambda x: x.rolling(window=i).mean()).astype(np.float16)


# add expanding window every 7 days 
df['expanding_mean'] = \
df.groupby(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'])\
['sold'].transform(lambda x: x.expanding(7).mean()).astype(np.float16)

# drop the rows with many NaN
df = df[df['d']>28]

for lag in lags:
    df['sold_lag_'+str(lag)] = df['sold_lag_'+str(lag)].astype(np.int16)

# add mean encoding for item for it's high categorical
df['item_avg'] = df.groupby('item_id')['sold'].transform('mean').astype(np.float16)
df['store_item_avg'] = df.groupby(['store_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['cat_item_avg'] = df.groupby(['cat_id','item_id'])['sold'].transform('mean').astype(np.float16)
df['dept_item_avg'] = df.groupby(['dept_id','item_id'])['sold'].transform('mean').astype(np.float16)

# add price momentum by week month year
df['price_momentum'] = df['sell_price']/df.groupby(['store_id','item_id'])['sell_price'].transform(lambda x: x.shift(1))
df['price_momentum_m'] = df['sell_price']/df.groupby(['store_id','item_id','month'])['sell_price'].transform('mean')
df['price_momentum_y'] = df['sell_price']/df.groupby(['store_id','item_id','year'])['sell_price'].transform('mean')

In [35]:
valid = df[(df['d']>=1914) & (df['d']<1942)][['id','d','sold']]
test = df[df['d']>=1942][['id','d','sold']]
eval_preds = test['sold']
valid_preds = valid['sold']

In [206]:
X_train, y_train = df[df['d']<1914].drop('sold',axis=1), df[df['d']<1914]['sold']
X_valid, y_valid = df[(df['d']>=1914) & (df['d']<1942)].drop('sold',axis=1), df[(df['d']>=1914) & (df['d']<1942)]['sold']
X_test = df[df['d']>=1942].drop('sold',axis=1)

In [37]:
# train LGBM
from lightgbm import LGBMRegressor
model = LGBMRegressor(
    n_estimators=2000,
    learning_rate=0.1,
    subsample=0.75,
    colsample_bytree=0.75,
    max_depth=200,
    num_leaves=200,
    #min_child_weight=300
)

model.fit(X_train, y_train, eval_set=[(X_train,y_train),(X_valid,y_valid)], eval_metric='rmse', verbose=20, early_stopping_rounds=20)
valid_preds[X_valid.index] = model.predict(X_valid)
eval_preds[X_test.index] = model.predict(X_test)

[20]	training's rmse: 1.82447	training's l2: 3.32867	valid_1's rmse: 1.76407	valid_1's l2: 3.11195
[40]	training's rmse: 1.69561	training's l2: 2.8751	valid_1's rmse: 1.67355	valid_1's l2: 2.80078
[60]	training's rmse: 1.64993	training's l2: 2.72226	valid_1's rmse: 1.64974	valid_1's l2: 2.72165
[80]	training's rmse: 1.62089	training's l2: 2.62729	valid_1's rmse: 1.63887	valid_1's l2: 2.68591
[100]	training's rmse: 1.59862	training's l2: 2.5556	valid_1's rmse: 1.63351	valid_1's l2: 2.66837
[120]	training's rmse: 1.58028	training's l2: 2.49728	valid_1's rmse: 1.62931	valid_1's l2: 2.65464
[140]	training's rmse: 1.5659	training's l2: 2.45206	valid_1's rmse: 1.62706	valid_1's l2: 2.64733
[160]	training's rmse: 1.55321	training's l2: 2.41246	valid_1's rmse: 1.62523	valid_1's l2: 2.64138
[180]	training's rmse: 1.54167	training's l2: 2.37674	valid_1's rmse: 1.62345	valid_1's l2: 2.6356
[200]	training's rmse: 1.53179	training's l2: 2.34639	valid_1's rmse: 1.6226	valid_1's l2: 2.63283
[220]	tra

In [38]:
# test validation root mean square error
from sklearn import metrics
valid_score = np.sqrt(metrics.mean_squared_error(valid_preds, y_valid))
print(valid_score)

1.6217941028912362


In [233]:
# construct submission file
submission = pd.read_csv('submission.csv')
#submission

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0.934120,0.875480,0.960220,0.863850,0.935296,0.889653,0.887393,0.891199,0.856659,...,0.877173,0.944752,0.902613,1.013883,0.935230,1.030481,1.002984,0.979618,0.990042,0.989291
1,HOBBIES_1_002_CA_1_validation,0.214365,0.201707,0.198055,0.187793,0.284975,0.215660,0.195512,0.299703,0.325807,...,0.234131,0.217755,0.215865,0.203077,0.199433,0.189073,0.184513,0.277574,0.313618,0.331972
2,HOBBIES_1_003_CA_1_validation,0.233748,0.192897,0.190265,0.154551,0.155238,0.123763,0.707727,0.764114,0.787339,...,0.476823,0.522266,0.479254,0.424747,0.386741,0.341988,0.312317,0.275442,0.349227,0.413436
3,HOBBIES_1_004_CA_1_validation,2.449046,1.671799,2.101133,2.326328,2.452698,2.315176,2.374683,2.354683,1.703705,...,1.841191,1.613033,1.298878,1.401027,1.708027,1.377338,1.699728,1.673557,1.250934,1.939163
4,HOBBIES_1_005_CA_1_validation,1.138673,1.019884,1.317490,1.093254,0.873546,0.822675,1.031086,0.837530,1.189845,...,0.855478,1.445591,1.390138,1.291050,1.399489,1.560374,1.263358,0.918241,0.957387,0.882702
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,FOODS_3_823_WI_3_evaluation,0.065917,0.057944,0.050858,0.044562,0.038968,0.033996,0.029579,0.025653,0.022165,...,0.142347,0.536319,0.285105,0.258043,0.226488,0.201522,0.178078,0.362556,0.226235,0.203020
60976,FOODS_3_824_WI_3_evaluation,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.095963,0.126238,0.262448,...,0.171575,0.167969,0.160165,0.156286,0.149397,0.145400,0.139252,0.135242,0.129705,0.236169
60977,FOODS_3_825_WI_3_evaluation,0.828208,0.983612,0.682440,0.665417,0.565309,0.538324,0.661201,0.738475,0.921502,...,0.929280,1.150936,1.157245,0.821300,1.195862,0.895139,0.930218,0.827916,0.651319,0.857733
60978,FOODS_3_826_WI_3_evaluation,1.392850,1.844864,0.875682,1.269794,0.752389,1.143532,0.644040,1.216610,0.793823,...,1.145058,1.127269,0.607711,1.203275,0.575682,0.956439,0.674103,0.925254,1.003206,1.203138


In [214]:
valid = df[(df['d']>=1914) & (df['d']<1942)][['id','d','sold']]
test = df[df['d']>=1942][['id','d','sold']]

In [215]:
valid['sold'] = valid_preds
test['sold'] = eval_preds

In [223]:
valid['id'] = valid['id'].apply(lambda x: x.replace("evaluation", "validation"))

In [225]:
predictions = valid[['id', 'd', 'sold']]
predictions = pd.pivot(predictions, index = 'id', columns = 'd', values = 'sold').reset_index()
#predictions = predictions.iloc[:, :30]
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
validation_rows = [row for row in submission['id'] if 'validation' in row] 
validation = submission[submission['id'].isin(validation_rows)]
validation = submission[['id']].merge(predictions, on = 'id')

In [228]:
predictions = test[['id', 'd', 'sold']]
predictions = pd.pivot(predictions, index = 'id', columns = 'd', values = 'sold').reset_index()
#predictions = predictions.iloc[:, :30]
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
evaluation_rows = [row for row in submission['id'] if 'evaluation' in row] 
evaluation = submission[submission['id'].isin(evaluation_rows)]
evaluation = submission[['id']].merge(predictions, on = 'id')
#final = pd.concat([validation, evaluation])

In [230]:
final = pd.concat([validation, evaluation])

In [232]:
final.to_csv('mysubmission.csv', index = False)