# Forecasting Prediction Using LightGBM Model

In [16]:
import pandas as pd
import numpy as np
import datetime
import warnings
#import lightgbm as lgb
import matplotlib.pyplot as plt
import statsmodels.api as sm
import holidays
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

# NOTE(review): this suppresses every warning category for the rest of the
# session, including deprecation/future warnings raised by later cells.
warnings.filterwarnings('ignore')

In [125]:
# Load the training and test sets; both paths are relative to the current
# working directory.
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Data Exploration

*seasonal_decompose*

## Data Preparation

### Handling Missing Values

*dataframe.interpolate()*

### Handling Outliers

## Feature Engineering

In this section we will generate additional feature categories to provide more insight for the machine learning algorithm:

* Date Time Features
* Lag/Shifted Features
* Rolling Window Statistics
* Expanding Window Statistics

### Date Time Features

Time-related features are features derived from the timestamp of each observation.

Generating these additional time-related features can provide insights that improve your model.

For example you could use the timestamp and generate a feature called *is_holiday* which tells you whether that day is a holiday or not. From this feature you might see that sales are higher on holidays.

In [127]:
# Preview the first rows of the raw training data before any features are added.
train.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [128]:
def holiday_list(df):
    """Return the US holiday dates for every calendar year covered by ``df.date``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column holding datetimes, or values that
        ``pd.to_datetime`` can parse. The frame is not modified.

    Returns
    -------
    list
        The keys of ``holidays.US(years=year)`` for each year from the
        earliest to the latest date in ``df`` (both inclusive), concatenated
        in year order. Empty when ``df`` contains no valid dates.
    """
    # Parse into a local Series so that asking for the holiday list does not
    # rewrite the caller's ``date`` column as a hidden side effect.
    dates = pd.to_datetime(df['date'])

    first_date = dates.min()
    if pd.isna(first_date):
        # No rows (or only missing dates): there is no year range to cover.
        return []

    min_year = first_date.year
    max_year = dates.max().year

    # One holidays.US lookup per calendar year, flattened into a single list.
    return [
        day
        for year in range(min_year, max_year + 1)
        for day in holidays.US(years=year).keys()
    ]

In [129]:
def create_date_time_features(df):
    """Add calendar-derived feature columns to ``df`` in place and return it.

    The ``date`` column is converted with ``pd.to_datetime`` and the
    following columns are added:

    * ``month``, ``day_of_month``, ``day_of_year``, ``week_of_year``, ``year``
    * ``day_of_week``   -- ``weekday + 1`` (pandas weekday 0 becomes 1)
    * ``is_weekend``    -- ``weekday // 5`` (1 for weekday 5 or 6, else 0)
    * ``start_of_month`` / ``end_of_month`` -- 0/1 flags
    * ``is_holiday``    -- 1 when the date is in ``holiday_list(df)``, else 0

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    df['date'] = pd.to_datetime(df.date)
    date_parts = df.date.dt

    df['month'] = date_parts.month
    df['day_of_month'] = date_parts.day
    df['day_of_year'] = date_parts.dayofyear

    # ``Series.dt.weekofyear`` was deprecated in pandas 1.1 and removed in
    # pandas 2.0; ``isocalendar().week`` is its replacement. Fall back to the
    # old accessor only on pandas versions that predate ``isocalendar``.
    if hasattr(date_parts, 'isocalendar'):
        iso_week = date_parts.isocalendar().week
        # isocalendar() yields a nullable UInt32 column; cast back to the plain
        # dtype weekofyear produced (int64, or float64 when dates are missing).
        week_dtype = 'float64' if iso_week.isna().any() else 'int64'
        df['week_of_year'] = iso_week.astype(week_dtype)
    else:
        df['week_of_year'] = date_parts.weekofyear

    df['day_of_week'] = date_parts.weekday + 1
    df['year'] = date_parts.year
    df['is_weekend'] = date_parts.weekday // 5
    df['start_of_month'] = date_parts.is_month_start.astype(int)
    df['end_of_month'] = date_parts.is_month_end.astype(int)

    # Convert the holiday keys to datetime64 explicitly so the membership test
    # compares like with like instead of relying on isin() implicitly casting
    # whatever objects holiday_list returns (date objects, per the holidays
    # package documentation).
    holiday_dates = pd.to_datetime(holiday_list(df))
    df['is_holiday'] = np.where(df.date.isin(holiday_dates), 1, 0)

    return df

In [130]:
# Add the calendar features. create_date_time_features writes the new columns
# into the frame it is given and returns that same frame.
train = create_date_time_features(train)
train.head()

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,is_weekend,start_of_month,end_of_month,is_holiday
0,2013-01-01,1,1,13,1,1,1,1,2,2013,0,1,0,1
1,2013-01-02,1,1,11,1,2,2,1,3,2013,0,0,0,0
2,2013-01-03,1,1,14,1,3,3,1,4,2013,0,0,0,0
3,2013-01-04,1,1,13,1,4,4,1,5,2013,0,0,0,0
4,2013-01-05,1,1,10,1,5,5,1,6,2013,1,0,0,0


### Lag/Shifted Features

Lag features are useful because the value observed at time $t$ is often highly dependent on values observed at earlier times, such as $t-1$.

In [138]:
# Row offsets for the lag features: each entry N yields one `lag<N>` column.
# NOTE(review): the smallest offset is 91 rows -- presumably chosen to stay
# beyond the forecast horizon; confirm against the prediction window.
lag_list = [91, 92,93,94,95,96, 97, 98, 100, 105, 112, 119, 126, 150,
            182,200,220, 250, 300, 350, 355, 360,361,362,363, 364,
            365, 370, 375,380, 546, 600, 650, 680, 690, 700, 710, 728,
            730, 800, 900, 950, 990, 1000, 1050, 1090, 1095]

def create_lag_features(df, lag_list):
    """Add one shifted-sales column per lag to ``df`` in place and return it.

    For every ``N`` in ``lag_list`` a column named ``lag<N>`` is created that
    holds ``sales`` shifted by ``N`` rows within each (store, item) group.
    Rows without a value ``N`` rows earlier in their group receive NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``store``, ``item`` and ``sales`` columns. The shift is
        positional, so the meaning of a lag depends on the row order of ``df``.
    lag_list : iterable
        Row offsets to shift by.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the lag columns added.
    """
    group_keys = ["store", "item"]
    for periods in lag_list:
        column_name = 'lag' + str(periods)
        shifted_sales = df.groupby(group_keys).sales.shift(periods)
        df[column_name] = shifted_sales
    return df

In [139]:
# Add one `lag<N>` column per entry of lag_list; the early rows of each
# (store, item) group have no earlier value to take and therefore show NaN.
train = create_lag_features(train, lag_list)
train.head()

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,lag728,lag730,lag800,lag900,lag950,lag990,lag1000,lag1050,lag1090,lag1095
0,2013-01-01,1,1,13,1,1,1,1,2,2013,...,,,,,,,,,,
1,2013-01-02,1,1,11,1,2,2,1,3,2013,...,,,,,,,,,,
2,2013-01-03,1,1,14,1,3,3,1,4,2013,...,,,,,,,,,,
3,2013-01-04,1,1,13,1,4,4,1,5,2013,...,,,,,,,,,,
4,2013-01-05,1,1,10,1,5,5,1,6,2013,...,,,,,,,,,,


### Rolling Mean/Moving Average

A rolling mean takes a fixed-size window of past observations and computes the average (mean) demand over that window.

In [140]:
# Window sizes (in rows) for the rolling-mean features: each entry W yields one
# `sales_rolling_mean<W>` column.
windows_list = [91, 98, 105, 112, 119, 126, 186, 200, 210, 250, 300, 365, 546, 700]

def create_rolling_mean_features(df, windows_list):
    """Add lagged rolling-mean sales columns to ``df`` in place and return it.

    For every ``window`` in ``windows_list`` a column named
    ``sales_rolling_mean<window>`` is added. Within each (store, item) group,
    a row's value is the mean of the ``window`` sales values immediately
    preceding that row; the row's own sales value is excluded by a one-row
    shift. Rows with fewer than ``window`` earlier rows in their group get NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``store``, ``item`` and ``sales`` columns. Windows are
        positional, so rows are expected to be in time order within each
        group; the groups themselves may be interleaved in any order.
    windows_list : iterable of int
        Rolling window sizes, in rows.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the rolling-mean columns added.
    """
    # The grouping does not depend on the window, so build it once.
    sales_by_series = df.groupby(["store", "item"]).sales

    for window in windows_list:

        def lagged_rolling_mean(sales, window=window):
            # Shift inside the group: shifting the concatenated result instead
            # would push the last value of one (store, item) series into the
            # first row of the next one.
            return sales.rolling(window).mean().shift(1)

        # transform() returns a Series aligned to df's index, so the assignment
        # stays correct even when df is not sorted by (store, item).
        df['sales_rolling_mean' + str(window)] = sales_by_series.transform(
            lagged_rolling_mean
        )
    return df

In [141]:
# Add one `sales_rolling_mean<W>` column per entry of windows_list and preview
# the result.
train = create_rolling_mean_features(train, windows_list)
train.head()

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,sales_rolling_mean119,sales_rolling_mean126,sales_rolling_mean186,sales_rolling_mean200,sales_rolling_mean210,sales_rolling_mean250,sales_rolling_mean300,sales_rolling_mean365,sales_rolling_mean546,sales_rolling_mean700
0,2013-01-01,1,1,13,1,1,1,1,2,2013,...,,,,,,,,,,
1,2013-01-02,1,1,11,1,2,2,1,3,2013,...,,,,,,,,,,
2,2013-01-03,1,1,14,1,3,3,1,4,2013,...,,,,,,,,,,
3,2013-01-04,1,1,13,1,4,4,1,5,2013,...,,,,,,,,,,
4,2013-01-05,1,1,10,1,5,5,1,6,2013,...,,,,,,,,,,


### Exponentially Weighted Mean Features

This feature applies weights to the time series values. More recent values have larger weights applied to them, because more recent points are more relevant for future forecasts.

In [143]:
# Row offsets applied to sales before the exponentially weighted mean is taken.
lags = [91, 98, 105, 112, 180, 270, 365, 546, 728]
# Smoothing factors passed to ewm(alpha=...); one feature column is produced
# for every (alpha, lag) combination.
alpha_list = [0.95, 0.9, 0.8, 0.7, 0.5]

def create_exp_weight_mean_features(df, alphas, lags):
    """Add exponentially weighted mean columns of lagged sales to ``df``.

    For every combination of ``alpha`` and ``lag`` a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is added. Within
    each (store, item) group, sales are shifted by ``lag`` rows and then
    smoothed with ``ewm(alpha=alpha).mean()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``store``, ``item`` and ``sales`` columns; modified in place.
    alphas : iterable of float
        Smoothing factors handed to ``ewm``.
    lags : iterable of int
        Row offsets applied before smoothing.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns added.
    """
    for alpha in alphas:
        # e.g. 0.95 -> "095"; used only to build the column name.
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:

            def lagged_ewm(series):
                return series.shift(lag).ewm(alpha = alpha).mean()

            smoothed = df.groupby(["store", "item"]).sales.transform(lagged_ewm)
            column_name = 'sales_ewm_alpha_' + alpha_tag + "_lag_" + str(lag)
            df[column_name] = smoothed
    return df

In [144]:
# Add one exponentially weighted mean column per (alpha, lag) pair and preview
# the result.
train = create_exp_weight_mean_features(train, alpha_list, lags)
train.head()

Unnamed: 0,date,store,item,sales,month,day_of_month,day_of_year,week_of_year,day_of_week,year,...,sales_ewm_alpha_07_lag_728,sales_ewm_alpha_05_lag_91,sales_ewm_alpha_05_lag_98,sales_ewm_alpha_05_lag_105,sales_ewm_alpha_05_lag_112,sales_ewm_alpha_05_lag_180,sales_ewm_alpha_05_lag_270,sales_ewm_alpha_05_lag_365,sales_ewm_alpha_05_lag_546,sales_ewm_alpha_05_lag_728
0,2013-01-01,1,1,13,1,1,1,1,2,2013,...,,,,,,,,,,
1,2013-01-02,1,1,11,1,2,2,1,3,2013,...,,,,,,,,,,
2,2013-01-03,1,1,14,1,3,3,1,4,2013,...,,,,,,,,,,
3,2013-01-04,1,1,13,1,4,4,1,5,2013,...,,,,,,,,,,
4,2013-01-05,1,1,10,1,5,5,1,6,2013,...,,,,,,,,,,


## Error Metric