In [1]:
import numpy as np
import pandas as pd
import math
import os
import sys


# Put the parent of the current working directory on the import path so that
# packages living next to the notebook folder can be imported.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

In [2]:
from time_series.hdfs_handle import HDFSHandler

ModuleNotFoundError: No module named 'datalabframework'

In [None]:
# Create the handler imported from time_series.hdfs_handle; the next cell uses
# it to fetch the transaction data.
hdfs_handler = HDFSHandler()

In [None]:
# Fetch the transaction data through the handler and show it as the cell output.
# NOTE(review): presumably a pandas DataFrame with at least the 'created_at',
# 'Kho' and 'quantity' columns that later cells read — confirm against the handler.
df = hdfs_handler.get_transaction_data()
df

In [None]:
# Order the rows by 'created_at' and make that column the index; later cells
# resample on this index.
# NOTE(review): resampling and the .index.month / .index.year accessors need a
# DatetimeIndex — confirm 'created_at' is already a datetime column.
df_sortdate = df.sort_values('created_at').set_index('created_at')
df_sortdate

In [None]:
# Calendar features derived from the datetime index.
# `DatetimeIndex.week` and `DatetimeIndex.weekday_name` no longer exist in
# current pandas (removed in 2.0 and 1.0 respectively); `isocalendar().week`
# and `day_name()` are their documented replacements.
# `.array` assigns the week numbers positionally, so repeated timestamps in the
# index cannot trigger an index-alignment error.
df_sortdate['week'] = df_sortdate.index.isocalendar().week.array
df_sortdate['month'] = df_sortdate.index.month
df_sortdate['year'] = df_sortdate.index.year
df_sortdate['weekday'] = df_sortdate.index.day_name()

# Visualize data

In [None]:
def sort_by_warehouse(df, warehouse):
    """Select one warehouse's rows and total its quantity per month, week and day.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions with a 'Kho' column and a 'quantity' column. The frame is
        resampled, so it must be indexed by a DatetimeIndex.
    warehouse : str
        Exact value to match in the 'Kho' column, e.g. 'Kho Hà Nội'.

    Returns
    -------
    tuple
        ``(df_warehouse, dfbymonth, dfbyweek, dfbyday)``: the filtered rows
        followed by three Series holding the summed 'quantity' per month, week
        and day. Each Series is named after the warehouse with the leading
        'Kho ' removed.
    """
    # Filter the frame that was passed in. The previous version ignored `df`
    # and always read the notebook-level `df_sortdate`.
    df_warehouse = df[df['Kho'] == warehouse]
    warehouse_name = warehouse.replace('Kho ', '')
    quantity = df_warehouse['quantity']

    # The month-end alias is 'ME' in recent pandas and 'M' in older releases;
    # an unknown alias raises ValueError, so try the new spelling first.
    try:
        monthly = quantity.resample('ME').sum()
    except ValueError:
        monthly = quantity.resample('M').sum()

    dfbymonth = monthly.rename(warehouse_name)
    dfbyweek = quantity.resample('W').sum().rename(warehouse_name)
    dfbyday = quantity.resample('D').sum().rename(warehouse_name)

    return df_warehouse, dfbymonth, dfbyweek, dfbyday

def aggregate_data(df, warehouses=('Kho Hà Nội', 'Kho Đà Nẵng', 'Kho Bình Dương')):
    """Build side-by-side daily, weekly and monthly quantity tables per warehouse.

    Parameters
    ----------
    df : pandas.DataFrame
        Transactions frame handed to ``sort_by_warehouse`` for each warehouse.
    warehouses : sequence of str, optional
        Values of the 'Kho' column to include, in output column order.
        Defaults to the three warehouses this function was originally
        hard-coded for.

    Returns
    -------
    tuple
        ``(dfbyday, dfbyweek, dfbymonth, dfbyweekday)``. The first three frames
        have one column per warehouse and keep only the periods present for
        every warehouse (inner join). ``dfbyweekday`` is a copy of ``dfbyday``
        with an extra 'weekday' column holding the day name of each index entry.
    """
    # Pass the caller's frame on; the previous version passed the notebook-level
    # `df_sortdate` regardless of the argument.
    per_warehouse = [sort_by_warehouse(df, name) for name in warehouses]

    # Each result is (rows, monthly, weekly, daily).
    monthly = [result[1] for result in per_warehouse]
    weekly = [result[2] for result in per_warehouse]
    daily = [result[3] for result in per_warehouse]

    dfbyday = pd.concat(daily, axis=1, join='inner')
    dfbyweek = pd.concat(weekly, axis=1, join='inner')
    dfbymonth = pd.concat(monthly, axis=1, join='inner')

    dfbyweekday = dfbyday.copy(deep=True)
    # `DatetimeIndex.weekday_name` was removed in pandas 1.0; `day_name()` is
    # the replacement.
    dfbyweekday['weekday'] = dfbyweekday.index.day_name()

    return dfbyday, dfbyweek, dfbymonth, dfbyweekday

In [None]:
# Per-warehouse rows plus monthly / weekly / daily quantity totals, kept as
# separate notebook-level variables for each of the three warehouses.
# NOTE(review): aggregate_data repeats these same calls internally, and none of
# these names is read again in the visible part of the notebook.
df_hanoi, dfbymonth_hanoi, dfbyweek_hanoi, dfbyday_hanoi = sort_by_warehouse(df_sortdate, 'Kho Hà Nội')
df_danang, dfbymonth_danang, dfbyweek_danang, dfbyday_danang = sort_by_warehouse(df_sortdate, 'Kho Đà Nẵng')
df_binhduong, dfbymonth_binhduong, dfbyweek_binhduong, dfbyday_binhduong = sort_by_warehouse(df_sortdate, 'Kho Bình Dương')

In [None]:
# Combined tables with one column per warehouse; the plots and ADF tests below
# read dfbyday and dfbyweek.
dfbyday, dfbyweek, dfbymonth, dfbyweekday = aggregate_data(df_sortdate)

In [None]:
# Display the daily per-warehouse totals as the cell output.
dfbyday

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Use a wide 20x6 figure for the time-series plots.
sns.set(rc={'figure.figsize': (20, 6)})

# One line per warehouse column; plt.title applies to the axes just created.
dfbyday.plot()
plt.title('Daily sale')

In [None]:
# Same figure size as the daily plot (repeated so this cell can run on its own).
sns.set(rc={'figure.figsize': (20, 6)})

# One line per warehouse column; plt.title applies to the axes just created.
dfbyweek.plot()
plt.title('Weekly sale')

In [None]:
# Daily totals from October 2019 onward, all warehouse columns (label-based
# slice on the datetime index).
dfbyday.loc['2019-10':, :]

In [None]:
def remove_outlier(df):
    """Replace high outliers and missing values with the mean of the remaining data.

    A value is treated as an outlier when it exceeds ``Q3 + 1.5 * IQR``. Only
    the upper tail is trimmed; low values are left untouched. Values that were
    already missing are filled with the same mean.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Numeric data. For a DataFrame the limit and the mean are computed per
        column.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        A new object of the same shape; the input is not modified.
    """
    q25 = df.quantile(0.25)
    q75 = df.quantile(0.75)
    upper_limit = q75 + 1.5 * (q75 - q25)

    # `mask` returns a new object with outliers set to NaN and upcasts integer
    # data as needed, instead of assigning NaN in place into a possibly
    # integer-typed copy.
    trimmed = df.mask(df > upper_limit)

    # The mean is taken after trimming, so the outliers do not influence it.
    return trimmed.fillna(trimmed.mean())

In [None]:
# Outlier-replaced version of the daily table: start from a copy so dfbyday
# stays intact, then process each warehouse column independently.
dfbyday_remove_outlier = dfbyday.copy(deep=True)
for warehouse in dfbyday.columns:
    dfbyday_remove_outlier[warehouse] = remove_outlier(dfbyday[warehouse])

In [None]:
from statsmodels.tsa.stattools import adfuller 


def ADFtest(df):
    """Run an Augmented Dickey-Fuller test on every column of ``df`` and print it.

    For each column, ``adfuller`` is called with ``autolag='AIC'``. The first
    four entries of its result are labelled and printed as a pandas Series,
    followed by one entry per critical value taken from the fifth entry.

    Parameters
    ----------
    df : pandas.DataFrame
        One series per column.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    labels = ['Test Statistic', 'p-value', 'Number of lag used', 'Number of observation used']

    for column in df.columns:
        print('Result for ADF test {}:'.format(column))
        result = adfuller(df[column], autolag='AIC')

        summary = pd.Series(result[0:4], index=labels)
        # result[4] maps each significance level to its critical value.
        for level, critical_value in result[4].items():
            summary['Critical value (%s)' % level] = critical_value

        print(summary)

In [None]:
# ADF test on the raw daily totals, one result per warehouse column.
ADFtest(dfbyday)

In [None]:
# ADF test on the daily totals after outlier replacement.
ADFtest(dfbyday_remove_outlier)

In [None]:
# ADF test on the first difference of the raw daily totals; dropna removes the
# leading NaN row that diff() produces.
ADFtest(dfbyday.diff().dropna())

In [None]:
# ADF test on the first difference of the outlier-replaced daily totals.
# This cell used to repeat the raw-series differenced test from the cell
# before it; the level-series tests come as a raw / outlier-replaced pair, so
# the differenced tests now do too.
# NOTE(review): confirm this was the intended comparison.
ADFtest(dfbyday_remove_outlier.diff().dropna())