# Import data

In [1]:
import pandas as pd
import numpy as np
import math
import os
import sys


# Put the parent of the current working directory on the import path
# (presumably so project-local packages next to the notebook folder resolve).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

In [2]:
from time_series.hdfs_handle import HDFSHandler

ModuleNotFoundError: No module named 'datalabframework'

In [None]:
hdfs_handler = HDFSHandler()

In [None]:
# Query window for October 2019. One day is added to the last calendar day,
# presumably so 2019-10-31 is covered in full — TODO confirm how
# get_transaction_data treats the upper bound of the range.
start_date = pd.Timestamp('2019-10-01')
end_date = pd.Timestamp('2019-10-31') + pd.Timedelta(1, unit='D')

data_df = hdfs_handler.get_transaction_data(date=(start_date, end_date))

In [None]:
excel_data = pd.read_excel(os.path.join('../data', 'vnshop_order.xlsx'), sheet_name=0)

In [None]:
df = pd.concat([data_df, excel_data])
df.head()


# Sorting by date and indexing

In [None]:
# Order the combined rows chronologically by their creation timestamp.
df_sortdate = df.sort_values(by='created_at')

In [None]:
# Derived directly from `df` (not from `df_sortdate` itself) so the cell is
# idempotent: re-running it never looks for a 'created_at' column that has
# already been moved into the index. Rows are sorted chronologically, indexed
# by 'created_at', and `.loc` makes the row-wise date slice (2018-11 onwards)
# explicit.
df_sortdate = (
    df.sort_values(by=['created_at'])
      .set_index('created_at')
      .loc["2018-11":]
)

In [None]:
df_sortdate.head()

# Provide week, month, year, weekday 

In [None]:
# Calendar features derived from the frame's datetime index.
# NOTE(review): `DatetimeIndex.week` is deprecated since pandas 1.1 and removed
# in 2.0 (`index.isocalendar().week` is the replacement there) — revisit when
# the pandas version is upgraded.
df_sortdate['week'] = df_sortdate.index.week
df_sortdate['month'] = df_sortdate.index.month
df_sortdate['year'] = df_sortdate.index.year
# day_name() yields the English day names ('Monday' ... 'Sunday') and, unlike
# the `weekday_name` attribute (removed in pandas 1.0), exists on both old and
# current pandas releases.
df_sortdate['weekday'] = df_sortdate.index.day_name()

In [None]:
df_sortdate.head()

# Visualize data by warehouse

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def sort_by_warehouse(df, warehouse):
    """Select one warehouse's rows and aggregate its sold quantity over time.

    Args:
        df: DataFrame with a datetime index (required by ``resample``) and at
            least the columns 'Kho', 'quantity' and 'weekday'.
        warehouse: Value of the 'Kho' column identifying the rows to keep.

    Returns:
        A 5-tuple ``(df_warehouse, dfbymonth, dfbyweek, dfbyday, dfbyweekday)``:
        the filtered rows, the 'quantity' totals resampled per month ('M'),
        per week ('W') and per day ('D'), and the ['weekday', 'quantity']
        columns of the filtered rows.
    """
    # Filter the frame that was passed in rather than a notebook-level global,
    # so the function works on any input and does not depend on kernel state.
    df_warehouse = df[df['Kho'] == warehouse]
    dfbymonth = df_warehouse.resample('M')['quantity'].sum()
    dfbyweek = df_warehouse.resample('W')['quantity'].sum()
    dfbyday = df_warehouse.resample('D')['quantity'].sum()
    dfbyweekday = df_warehouse.loc[:, ['weekday', 'quantity']]

    return df_warehouse, dfbymonth, dfbyweek, dfbyday, dfbyweekday

In [None]:
# Split the date-indexed frame per warehouse. Each call returns the filtered
# rows, the monthly / weekly / daily quantity totals and the weekday view.
(df_khohanoi,
 dfbymonth_khohanoi,
 dfbyweek_khohanoi,
 dfbyday_khohanoi,
 dfbyweekday_khohanoi) = sort_by_warehouse(df_sortdate, 'Kho Hà Nội')

(df_khodanang,
 dfbymonth_khodanang,
 dfbyweek_khodanang,
 dfbyday_khodanang,
 dfbyweekday_khodanang) = sort_by_warehouse(df_sortdate, 'Kho Đà Nẵng')

(df_khobinhduong,
 dfbymonth_khobinhduong,
 dfbyweek_khobinhduong,
 dfbyday_khobinhduong,
 dfbyweekday_khobinhduong) = sort_by_warehouse(df_sortdate, 'Kho Bình Dương')

In [None]:
def plot_line(dfs, title=None, xaxis=None, yaxis=None, figsize=(20, 6),
              labels=('Hà Nội', 'Đà Nẵng', 'Bình Dương')):
    """Overlay several series as line plots on the current axes.

    Args:
        dfs: Iterable of plottable pandas objects, drawn in order.
        title: Axes title (optional).
        xaxis: X-axis label (optional).
        yaxis: Y-axis label (optional).
        figsize: Figure size applied through seaborn's rc settings; note that
            ``sns.set`` changes the default for later figures as well.
        labels: Legend entries, in the same order as ``dfs``.
    """
    sns.set(rc={'figure.figsize': figsize})

    colors = ['b', 'g', 'purple']
    styles = ['-', ':', '--']

    # Fetch the axes once, after the figure size has been configured, and pass
    # it explicitly so every series lands on the same axes.
    ax = plt.gca()
    for i, df in enumerate(dfs):
        # Cycle through the palette so more than three series do not raise
        # IndexError.
        df.plot(ax=ax, linewidth=2,
                color=colors[i % len(colors)],
                style=styles[i % len(styles)])

    ax.legend(list(labels))
    ax.set_title(title)
    ax.set_xlabel(xaxis, size=15)
    ax.set_ylabel(yaxis, size=15)

In [None]:
def plot_line_with_subplot(dfs, title=None, figsize=(20, 14),
                           labels=('Hanoi', 'Danang', 'Binhduong')):
    """Draw each series as a line plot in its own vertically stacked panel.

    Args:
        dfs: Sequence of plottable pandas objects, one per panel.
        title: Common title prefix; each panel is titled '<title> - <label>'.
            When None, only the label is shown.
        figsize: Size of the whole figure.
        labels: Per-panel title suffixes, in the same order as ``dfs``. Panels
            beyond the number of labels get the bare title.
    """
    # squeeze=False always returns a 2-D array of axes, so a single series
    # (one panel) is handled the same way as several.
    _fig, axes = plt.subplots(len(dfs), 1, figsize=figsize, dpi=80, squeeze=False)
    axes = axes[:, 0]

    colors = ['b', 'g', 'purple']
    styles = ['-', ':', '--']

    for i, (df, ax) in enumerate(zip(dfs, axes)):
        # Cycle through the palette so any number of panels is supported.
        df.plot(linewidth=2,
                color=colors[i % len(colors)],
                style=styles[i % len(styles)],
                ax=ax)

        # Build the title from whichever parts are available; this avoids the
        # TypeError that concatenating a None title would raise.
        label = labels[i] if i < len(labels) else None
        parts = [part for part in (title, label) if part is not None]
        ax.set_title(' - '.join(parts), fontsize=18)
        ax.set_xlabel('')


In [None]:
def plot_boxplot(x, y, dfs, title=None, figsize=(20, 14), order=None, vertical_orient=False, show_outlier=True,
                 labels=('Hanoi', 'Danang', 'Binhduong')):
    """Draw one seaborn box plot per frame in vertically stacked panels.

    Args:
        x: Column name used for the x axis.
        y: Column name used for the y axis.
        dfs: Sequence of DataFrames, one per panel.
        title: Common title prefix; each panel is titled '<title> - <label>'.
            When None, only the label is shown.
        figsize: Size of the whole figure.
        order: Category order forwarded to ``sns.boxplot``.
        vertical_orient: NOTE — the flag works opposite to its name. False
            (the default) draws with orient='v'; True swaps ``x`` and ``y`` and
            draws with orient='h'. Kept as-is because existing calls rely on it.
        show_outlier: Forwarded to ``sns.boxplot`` as ``showfliers``.
        labels: Per-panel title suffixes, in the same order as ``dfs``. Panels
            beyond the number of labels get the bare title.
    """
    # squeeze=False always returns a 2-D array of axes, so a single frame
    # (one panel) is handled the same way as several.
    _fig, axes = plt.subplots(len(dfs), 1, figsize=figsize, dpi=80, squeeze=False)
    axes = axes[:, 0]

    if not vertical_orient:
        orient = 'v'
    else:
        orient = 'h'
        x, y = y, x

    for i, (df, ax) in enumerate(zip(dfs, axes)):
        sns.boxplot(x=x, y=y, data=df, ax=ax, order=order, orient=orient, showfliers=show_outlier)

        # Build the title from whichever parts are available; this avoids the
        # TypeError that concatenating a None title would raise.
        label = labels[i] if i < len(labels) else None
        parts = [part for part in (title, label) if part is not None]
        ax.set_title(' - '.join(parts), fontsize=18)
        ax.set_xlabel('')


In [None]:
def plot_histogram(dfs, title=None, figsize=(20, 14),
                   labels=('Hanoi', 'Danang', 'Binhduong')):
    """Draw a 20-bin histogram of each series in its own stacked panel.

    Args:
        dfs: Sequence of plottable pandas objects, one per panel.
        title: Common title prefix; each panel is titled '<title> - <label>'.
            When None, only the label is shown.
        figsize: Size of the whole figure.
        labels: Per-panel title suffixes, in the same order as ``dfs``. Panels
            beyond the number of labels get the bare title.
    """
    # squeeze=False always returns a 2-D array of axes, so a single series
    # (one panel) is handled the same way as several.
    _fig, axes = plt.subplots(len(dfs), 1, figsize=figsize, dpi=80, squeeze=False)
    axes = axes[:, 0]

    colors = ['b', 'g', 'purple']

    for i, (df, ax) in enumerate(zip(dfs, axes)):
        # Cycle through the palette so any number of panels is supported.
        df.plot.hist(bins=20, linewidth=2, color=colors[i % len(colors)], ax=ax)

        # Build the title from whichever parts are available; this avoids the
        # TypeError that concatenating a None title would raise.
        label = labels[i] if i < len(labels) else None
        parts = [part for part in (title, label) if part is not None]
        ax.set_title(' - '.join(parts), fontsize=18)
        ax.set_xlabel('')

## Visualize data by weekday

In [3]:
def sum_weekday_df(dfs):
    """Sum each frame per weekday and return the 'quantity' totals.

    Every frame is grouped by its 'weekday' column, summed, and reindexed to
    Monday..Sunday order; weekdays absent from a frame appear as NaN.

    Args:
        dfs: Iterable of DataFrames with 'weekday' and 'quantity' columns.

    Returns:
        List with one 'quantity' Series per input frame, indexed by weekday.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    return [
        frame.groupby('weekday').sum().reindex(weekday_order)['quantity']
        for frame in dfs
    ]

In [4]:
dfs = sum_weekday_df([dfbyweekday_khohanoi, dfbyweekday_khodanang, dfbyweekday_khobinhduong])

NameError: name 'dfbyweekday_khohanoi' is not defined

In [None]:
# Number of products sold per weekday, one line per warehouse
plot_line(
    dfs,
    title='Sale by weekday',
    xaxis='Weekday',
    yaxis='Sale',
    figsize=(20, 8),
)

In [None]:
def prepare_df_weekly_boxplot(dfs):
    """Turn each date-indexed series into a frame with a 'weekday' column.

    Args:
        dfs: Iterable of Series/DataFrames whose index is a DatetimeIndex.

    Returns:
        List of new DataFrames, one per input, each with an extra 'weekday'
        column holding the English day name of the index ('Monday', ...).
    """
    # day_name() is used instead of the `weekday_name` attribute, which pandas
    # removed in 1.0. `assign` returns a new frame, so the column is never
    # written onto an object the caller still holds.
    return [
        pd.DataFrame(df).assign(weekday=lambda frame: frame.index.day_name())
        for df in dfs
    ]

In [None]:
# Distribution of daily totals per weekday, one panel per warehouse.
dfs = prepare_df_weekly_boxplot(
    [dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong]
)

plot_boxplot(
    'weekday',
    'quantity',
    dfs,
    title='Daily sale - with outlier',
    order=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    figsize=(20, 16),
    show_outlier=True,
    vertical_orient=False,
)

## Visualize data by week

In [None]:
# Number of products sold per week, one line per warehouse
plot_line(
    [dfbyweek_khohanoi, dfbyweek_khodanang, dfbyweek_khobinhduong],
    title='Sale by week',
    xaxis='Week',
    yaxis='Sale',
)

In [None]:
dfs = [dfbyweek_khohanoi, dfbyweek_khodanang, dfbyweek_khobinhduong]

plot_histogram(dfs,
              title='Weekly sale histogram')

In [None]:
# Histogram of week-over-week changes; the first difference has no
# predecessor, so the leading NaN is dropped.
dfs = [
    weekly.diff().dropna()
    for weekly in (dfbyweek_khohanoi, dfbyweek_khodanang, dfbyweek_khobinhduong)
]

plot_histogram(dfs, title='Weekly sale difference histogram')

## Visualize data by day

In [None]:
# Number of products sold per day, one line per warehouse.
# The title and x label name the daily granularity of the plotted series.
plot_line([dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong],
         title='Sale by day',
         xaxis='Day',
         yaxis='Sale',
         figsize=(20, 4))

In [None]:
# Daily totals, one panel per warehouse; the title names the daily granularity.
plot_line_with_subplot([dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong],
         title='Sale by day')


In [None]:
dfs = [dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong]

plot_histogram(dfs,
              title='Daily sale histogram')

In [5]:
# Histogram of day-over-day changes; the first difference has no
# predecessor, so the leading NaN is dropped.
dfs = [
    daily.diff().dropna()
    for daily in (dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong)
]

plot_histogram(dfs, title='Daily sale difference histogram')

NameError: name 'dfbyday_khohanoi' is not defined

## Boxplot for day in month

In [None]:
def prepare_df_daily_boxplot(dfs):
    """Wrap each date-indexed series in a frame with a day-of-month column.

    Args:
        dfs: Iterable of Series/DataFrames whose index exposes ``.day``.

    Returns:
        List of DataFrames, one per input, each with an added 'day' column
        taken from the index.
    """
    def _with_day(item):
        # Build the frame first, then attach the day of month from its index.
        frame = pd.DataFrame(item)
        frame['day'] = frame.index.day
        return frame

    return [_with_day(item) for item in dfs]

In [None]:
dfs = prepare_df_daily_boxplot([dfbyday_khohanoi, dfbyday_khodanang, dfbyday_khobinhduong])
plot_boxplot('day', 'quantity',
            dfs,
            title='Daily',
            figsize=(20, 20),
            show_outlier=False)

## Visualize data by month

In [None]:
# Number of products sold per month, one line per warehouse.
# The legend entries follow the order in which the series are drawn.
sns.set(rc={'figure.figsize': (20, 6)})

dfbymonth_khodanang.plot(
    linewidth=2, color='green', style='-',
)
dfbymonth_khohanoi.plot(
    linewidth=2, color='blue', style='-',
)
dfbymonth_khobinhduong.plot(
    linewidth=2, color='purple', style='-',
)
plt.legend(['Đà Nẵng', 'Hà Nội', 'Bình Dương'])
plt.xlabel('Date by month')
plt.ylabel('Số sản phẩm xuất bán')

# Trend, seasonal, residual

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 20,6
import statsmodels.api as sm

In [None]:
# Hanoi warehouse: additive decomposition of the daily series, plotted via
# the result object's own plot() method.
decomposition_bydayHN = sm.tsa.seasonal_decompose(
    x=dfbyday_khohanoi,
    model='additive',
)
fig = decomposition_bydayHN.plot()
plt.show()

In [None]:
# Danang warehouse: additive decomposition of the daily series, plotted via
# the result object's own plot() method.
decomposition_bydayDN = sm.tsa.seasonal_decompose(
    x=dfbyday_khodanang,
    model='additive',
)
fig = decomposition_bydayDN.plot()
plt.show()

In [None]:
# Binh Duong warehouse: additive decomposition of the daily series, plotted
# via the result object's own plot() method.
decomposition_bydayBD = sm.tsa.seasonal_decompose(
    x=dfbyday_khobinhduong,
    model='additive',
)
fig = decomposition_bydayBD.plot()
plt.show()

# Dickey - Fuller test

In [None]:
from statsmodels.tsa.stattools import adfuller 


def ADFtest(df, warehouse):
    """Run ``adfuller`` on a series and print a labelled summary.

    Args:
        df: Series passed to ``adfuller`` (with ``autolag='AIC'``).
        warehouse: Name shown in the printed header.

    Prints the header followed by a Series holding the first four entries of
    the ``adfuller`` result and one 'Critical value (<key>)' row per entry of
    its fifth element. Returns None.
    """
    print('Result for ADF test {}:'.format(warehouse))
    result = adfuller(df, autolag='AIC')

    row_names = ['Test Statistic', 'p-value', 'Number of lag used', 'Number of observation used']
    summary = pd.Series(result[0:4], index=row_names)

    # The fifth element maps a significance level to its critical value.
    critical_values = result[4]
    for level in critical_values:
        summary['Critical value (%s)' % level] = critical_values[level]

    print(summary)

In [None]:
ADFtest(dfbyday_khohanoi, 'Kho Hanoi')
ADFtest(dfbyday_khodanang, 'Kho Danang')
ADFtest(dfbyday_khobinhduong, 'Kho Binhduong')


- Kho Hà Nội is non-stationary 
- Kho Đà Nẵng is non-stationary 
- Kho Bình Dương is non-stationary 

In [None]:
ADFtest(dfbyday_khohanoi.diff().dropna(), 'Kho Hanoi')
ADFtest(dfbyday_khodanang.diff().dropna(), 'Kho Danang')
ADFtest(dfbyday_khobinhduong.diff().dropna(), 'Kho Binhduong')


# ACF, PACF kho Hà Nội

In [None]:
import numpy as np

In [None]:
from statsmodels.tsa.stattools import acf, pacf 
    
# Autocorrelation and partial autocorrelation (OLS method) of the daily Hanoi
# series, for lags 0..20.
lag_acf = acf(dfbyday_khohanoi, nlags = 20)
lag_pacf = pacf(dfbyday_khohanoi, nlags = 20, method = 'ols')

# Plot ACF in the left panel. Dashed guides are drawn at 0 and at
# +/-1.96/sqrt(N), N being the series length (the usual approximate 95%
# band for white noise).
plt.subplot(121)
plt.plot(lag_acf)
plt.axhline(y=0, linestyle = '--', color = 'blue')
plt.axhline(y= -1.96/np.sqrt(len(dfbyday_khohanoi)), linestyle = '--', color = 'blue')
plt.axhline(y= 1.96/np.sqrt(len(dfbyday_khohanoi)), linestyle = '--', color = 'blue')
plt.title('Autocorrelation Function')

# Plot PACF in the right panel, with the same guide lines.
plt.subplot(122)
plt.plot(lag_pacf)
plt.axhline(y= 0, linestyle = '--', color = 'blue')
plt.axhline(y= -1.96/np.sqrt(len(dfbyday_khohanoi)), linestyle = '--', color = 'blue')
plt.axhline(y= 1.96/np.sqrt(len(dfbyday_khohanoi)), linestyle = '--', color = 'blue')
plt.title('Partial Autocorrelation Function')

# Adjust subplot spacing so the two panels do not overlap.
plt.tight_layout()



In [None]:
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
plot_pacf(dfbyday_khohanoi, lags=30)
pyplot.show()
plot_acf(dfbyday_khohanoi, lags=30)
pyplot.show()

# Build and validate model kho Hà Nội

In [None]:
# Set a (20, 8) default figure size through seaborn's rc settings.
sns.set(rc={'figure.figsize':(20, 8)})
from statsmodels.tsa.arima_model import ARIMA
# Fit an ARIMA model of order (3, 0, 7) to the daily Hanoi series.
# NOTE(review): the order is presumably read off the ACF/PACF plots — confirm.
model_HN= ARIMA(dfbyday_khohanoi, order = (3,0,7))
model_fit_HN = model_HN.fit(disp = False)
# Overlay the in-sample fitted values (red) on the observed series.
plt.plot(dfbyday_khohanoi)
plt.plot(model_fit_HN.fittedvalues, color = 'red')
# The title reports the sum of squared differences between fitted and observed values.
plt.title('RSS: %.4f'% sum((model_fit_HN.fittedvalues-dfbyday_khohanoi )**2))

# Predict sale for Kho Hà Nội

In [None]:
model_fit_HN.plot_predict(1,len(dfbyday_khohanoi)+14)
model_fit_HN.forecast(steps = 14)

# ACF, PACF kho Đà Nẵng

In [None]:
from matplotlib import pyplot
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.graphics.tsaplots import plot_acf
plot_pacf(dfbyday_khodanang, lags=20)
pyplot.show()
plot_acf(dfbyday_khodanang, lags=20)
pyplot.show()

# Build and validate model kho Đà Nẵng

In [None]:
# Set a (20, 8) default figure size through seaborn's rc settings.
sns.set(rc={'figure.figsize':(20, 8)})
from statsmodels.tsa.arima_model import ARIMA
# Fit an ARIMA model of order (2, 0, 3) to the daily Danang series.
# NOTE(review): the order is presumably read off the ACF/PACF plots — confirm.
model_DN= ARIMA(dfbyday_khodanang, order = (2,0,3))
model_fit_DN = model_DN.fit(disp =False)
# Overlay the in-sample fitted values (red) on the observed series.
plt.plot(dfbyday_khodanang)
plt.plot(model_fit_DN.fittedvalues, color = 'red')
# The title reports the sum of squared differences between fitted and observed values.
plt.title('RSS: %.4f'% sum((model_fit_DN.fittedvalues-dfbyday_khodanang )**2))

# Prophet

In [None]:
from fbprophet import Prophet
def plot_forecast_component(df, period=0):
    """Fit a Prophet model to a series and plot its forecast and components.

    Args:
        df: Series/frame whose index level 'created_at' and column 'quantity'
            are renamed to 'ds' and 'y' before fitting.
        period: Number of future periods passed to
            ``make_future_dataframe``.

    Shows two figures (forecast and forecast components); returns None.
    """
    history = (
        pd.DataFrame(df)
        .reset_index()
        .rename(columns={'created_at': 'ds', 'quantity': 'y'})
    )

    model = Prophet()
    model.fit(history)
    horizon = model.make_future_dataframe(periods=period)
    prediction = model.predict(horizon)

    # Register pandas' datetime converters with matplotlib before plotting.
    pd.plotting.register_matplotlib_converters()

    model.plot(prediction)
    model.plot_components(prediction)
    plt.show()

In [None]:
plot_forecast_component(dfbyday_khohanoi, period=90)

In [None]:
plot_forecast_component(dfbyday_khodanang, period=90)

In [None]:
plot_forecast_component(dfbyday_khobinhduong, period=90)

In [None]:
from pandas.plotting import lag_plot
lag_plot(dfbyday_khohanoi)

# Playground for testing

In [None]:
import statsmodels.tsa.api as smt
import scipy.stats as scs
import statsmodels.api as sm

def tsplot(y, lags=None, figsize=(10, 8), style='bmh',title=''):
    """Plot a series with its ACF, PACF, QQ plot and probability plot.

    Args:
        y: Series, or any data accepted by ``pd.Series``.
        lags: Forwarded to the ACF and PACF plots.
        figsize: Size of the figure.
        style: Matplotlib style applied while drawing.
        title: Title of the top (time series) panel.

    Layout is a 3x2 grid: the series spans the top row, ACF and PACF share the
    middle row, the QQ plot and probability plot share the bottom row.
    """
    series = y if isinstance(y, pd.Series) else pd.Series(y)
    grid = (3, 2)

    with plt.style.context(style):
        plt.figure(figsize=figsize)

        # Create the panels in row-major order on the new figure.
        ts_ax = plt.subplot2grid(grid, (0, 0), colspan=2)
        acf_ax = plt.subplot2grid(grid, (1, 0))
        pacf_ax = plt.subplot2grid(grid, (1, 1))
        qq_ax = plt.subplot2grid(grid, (2, 0))
        pp_ax = plt.subplot2grid(grid, (2, 1))

        series.plot(ax=ts_ax)
        ts_ax.set_title(title)

        smt.graphics.plot_acf(series, lags=lags, ax=acf_ax, alpha=0.5)
        smt.graphics.plot_pacf(series, lags=lags, ax=pacf_ax, alpha=0.5)

        sm.qqplot(series, line='s', ax=qq_ax)
        qq_ax.set_title('QQ Plot')

        scs.probplot(series, sparams=(series.mean(), series.std()), plot=pp_ax)

        plt.tight_layout()

In [None]:
tsplot(dfbyday_khohanoi, figsize=(20, 16))

In [None]:
from statsmodels.nonparametric.smoothers_lowess import lowess
plt.rcParams.update({'xtick.bottom' : False, 'axes.titlepad':5})

# 1. Moving Average (centered 7-observation window)
# NOTE(review): `closed` on a fixed-size (non-offset) window is not accepted by
# every pandas release — confirm it is supported and intended here.
df_ma = dfbyday_khohanoi.rolling(7, center=True, closed='both').mean()

# 2. Loess Smoothing (5% and 15%)
# lowess returns (x, smoothed y) pairs; column 1 holds the smoothed values.
df_loess_5 = pd.DataFrame(lowess(dfbyday_khohanoi, np.arange(len(dfbyday_khohanoi)), frac=0.05)[:, 1], index=dfbyday_khohanoi.index, columns=['value'])
df_loess_15 = pd.DataFrame(lowess(dfbyday_khohanoi, np.arange(len(dfbyday_khohanoi)), frac=0.15)[:, 1], index=dfbyday_khohanoi.index, columns=['value'])

# Plot the original series and the three smoothed versions on a shared x axis.
fig, axes = plt.subplots(4,1, figsize=(7, 7), sharex=True, dpi=120)
dfbyday_khohanoi.plot(ax=axes[0], color='k', title='Original Series')
df_loess_5['value'].plot(ax=axes[1], title='Loess Smoothed 5%')
df_loess_15['value'].plot(ax=axes[2], title='Loess Smoothed 15%')
# The panel title states the rolling window length used above (7).
df_ma.plot(ax=axes[3], title='Moving Average (7)')
fig.suptitle('How to Smoothen a Time Series', y=0.95, fontsize=14)
plt.show()