In [None]:
import pandas as pd
from dateutil.relativedelta import relativedelta
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import plotly.graph_objects as go
from plotly.subplots import make_subplots
plt.style.use('fivethirtyeight')
from statsmodels.graphics.tsaplots import plot_acf,plot_pacf
from statsmodels.tsa.stattools import adfuller
from datetime import datetime
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.seasonal import STL
import itertools
from pytrends.request import TrendReq
import warnings
warnings.filterwarnings("ignore")

def import_from_ggtrend(kw_list,start,end=datetime.now().year,language='vi-VN',geo='VN',tz=480):
    """Download Google Trends interest-over-time data and keep only complete rows.

    Parameters
    ----------
    kw_list : list of str
        Keywords forwarded to ``TrendReq.build_payload``.
    start : int
        First year of the window; the timeframe starts on January 1st of it.
    end : int, optional
        Last year of the window; the timeframe ends on January 1st of it.
        The default is evaluated once, when this function is defined.
    language : str, optional
        Passed to ``TrendReq`` as ``hl``.
    geo : str, optional
        Passed to ``build_payload`` as ``geo``.
    tz : int, optional
        Passed to ``TrendReq`` as ``tz``.

    Returns
    -------
    pandas.DataFrame
        The frame returned by ``interest_over_time()`` without the
        ``isPartial`` column, restricted to rows whose flag reads as false,
        and re-indexed with a ``DatetimeIndex`` carrying the inferred
        frequency. An empty response is returned unchanged.
    """
    pytrends = TrendReq(hl=language,tz=tz)
    timeframe = f'{datetime(start,1,1).date()} {datetime(end,1,1).date()}'
    pytrends.build_payload(kw_list, geo=geo, timeframe=timeframe)
    df = pytrends.interest_over_time()
    if df.empty:
        # Nothing to filter or re-index; hand the empty frame back as-is.
        return df
    if 'isPartial' in df.columns:
        # The flag may arrive as real booleans or as the strings 'True'/'False'
        # depending on the pytrends version; comparing the text form handles both.
        complete = df['isPartial'].astype(str).str.strip().str.lower() == 'false'
        df = df.loc[complete].drop(columns=['isPartial'])
    # Rebuild the index so that it carries an explicit frequency (None if it cannot be inferred).
    df.index = pd.DatetimeIndex(df.index.values,freq=df.index.inferred_freq)
    return df
def replace_outlier(df,indx):
    """Overwrite the rows at the given dates with the mean of their calendar month.

    Parameters
    ----------
    df : pandas.Series or pandas.DataFrame
        Data indexed by dates (the index must expose ``.month``).
        It is modified in place.
    indx : date-like or iterable of date-likes
        Date(s) of the observations to replace; parsed with ``pd.to_datetime``.
        A single date is accepted as well as a list.

    Returns
    -------
    The same object that was passed in as ``df``.

    Notes
    -----
    The monthly means are computed once, before any replacement, over all
    rows of ``df`` - including the rows that are about to be replaced.
    """
    date_indx = pd.to_datetime(indx)
    if isinstance(date_indx, pd.Timestamp):
        # pd.to_datetime returns a bare Timestamp for scalar input, which cannot be iterated.
        date_indx = [date_indx]
    monthly_mean = df.groupby(df.index.month).mean()
    for i in date_indx:
        df.loc[i] = monthly_mean.loc[i.month]
    return df

def import_ts(filename,time_cols,freq='MS',header=0,cols_to_drop=None):
    """Read a CSV time series and aggregate it to a regular frequency by summing.

    Parameters
    ----------
    filename : str or path-like
        CSV file handed to ``pd.read_csv``.
    time_cols : str or int
        Column parsed as dates and used as the index.
    freq : str, optional
        Resampling rule passed to ``DataFrame.resample``.
    header : int, optional
        Header row, passed to ``pd.read_csv``.
    cols_to_drop : list-like, optional
        Columns to remove before resampling. Names that are not present in
        the file are ignored; the ones that are present are still dropped.

    Returns
    -------
    pandas.DataFrame
        The remaining columns, summed within each ``freq`` bin.
    """
    # `infer_datetime_format` is deprecated in current pandas and is not needed for parsing.
    df = pd.read_csv(filename,parse_dates=[time_cols],header=header,index_col=time_cols)
    if cols_to_drop is not None:
        # errors='ignore' skips missing names one by one, instead of abandoning
        # the whole drop as soon as a single name is unknown.
        df = df.drop(columns=cols_to_drop,errors='ignore')
    return df.resample(freq).sum()

def ts_lines(sr):
    """Show an STL decomposition of ``sr`` plus the ACF/PACF of its residuals.

    A four-row plotly figure is drawn (observed data, trend, seasonality,
    residuals). The residual panel gets a shaded band at +/-1.96 residual
    standard deviations. The title reports the outcome of an Augmented
    Dickey-Fuller test run on the STL residuals (p-value below 0.05 is shown
    as ``True``). A separate matplotlib figure shows the residual ACF and
    PACF side by side. Nothing is returned.
    """
    stl_fit = STL(sr).fit()
    residuals = stl_fit.resid
    p_value = adfuller(residuals,regression='c',autolag='AIC')[1]
    if p_value < 0.05:
        stationarity = 'True'
    else:
        stationarity = 'False'

    # Interactive decomposition figure: one row per component.
    f = make_subplots(rows=4,cols=1,shared_xaxes=True)
    f.update_layout(width=1600,height=1000)
    panels = [
        (sr.values,'lines','Observed Data'),
        (stl_fit.trend,'lines','Trend'),
        (stl_fit.seasonal,'lines','Seasonality'),
        (residuals,'markers','Residuals'),
    ]
    for row_no,(y_values,draw_mode,label) in enumerate(panels,start=1):
        f.add_trace(go.Scatter(x=sr.index,y=y_values,mode=draw_mode,name=label),row=row_no,col=1)
    f.update_layout(
        title=f'Plot of {sr.name} - Stationary = {stationarity}',
        font=dict(size=12))
    resid_std = residuals.std()
    f.add_hrect(y0=-1.96*resid_std,y1=1.96*resid_std,line=dict(dash="dot",color='orangered'),fillcolor='rgba(255,204,204,0.2)',row=4,col=1)

    # Static correlograms of the residuals: ACF on the left, PACF on the right.
    fig = plt.figure(figsize=(20,5))
    gs = GridSpec(1,2)
    for col_no,correlogram in enumerate((plot_acf,plot_pacf)):
        axis = fig.add_subplot(gs[0,col_no])
        correlogram(residuals,ax=axis)
    fig.tight_layout(pad=3)

    f.show()
    plt.show()
def grid_search(sr,p_max,d_max,q_max,freq):
    """Pick the SARIMAX orders with the lowest AIC by exhaustive search.

    Every ``(p, d, q)`` with ``p < p_max``, ``d < d_max`` and ``q < q_max`` is
    combined with every seasonal ``(P, D, Q, freq)`` drawn from the same
    ranges. Each combination is fitted with stationarity and invertibility
    constraints disabled; combinations that cannot be built or fitted are
    skipped.

    Parameters
    ----------
    sr : array-like
        Series handed to ``SARIMAX`` as the endogenous variable.
    p_max, d_max, q_max : int
        Exclusive upper bounds of the searched orders.
    freq : int
        Seasonal period used as the fourth element of the seasonal order.

    Returns
    -------
    tuple
        ``(order, seasonal_order)`` of the fit with the smallest AIC.

    Raises
    ------
    ValueError
        If no combination could be fitted.
    """
    pdq = list(itertools.product(range(p_max),range(d_max),range(q_max)))
    seasonal_pdq = [(p,d,q,freq) for p,d,q in pdq]

    # Collect plain records and build the frame once, rather than growing a
    # DataFrame row by row (DataFrame.append no longer exists in pandas 2).
    records = []
    for params in pdq:
        for params_seasonal in seasonal_pdq:
            try:
                model = SARIMAX(sr,order=params,seasonal_order=params_seasonal,enforce_invertibility=False,enforce_stationarity=False)
                result = model.fit()
            except Exception:
                # Best-effort search: an unusable combination is skipped.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                continue
            records.append(dict(params=params,params_seasonal=params_seasonal,aic=result.aic))

    if not records:
        raise ValueError('grid_search: no SARIMAX specification could be fitted')

    aic_result = pd.DataFrame(records).sort_values(by='aic',ascending=True,ignore_index=True)
    best = aic_result.iloc[0]
    return best['params'],best['params_seasonal']
def sarimax_fitting(sr,order,seasonal_order,pred_range):
    """Fit a SARIMAX model to ``sr`` and plot/print an evaluation and a forecast.

    The series is extended with ``pred_range`` NaN observations so that the
    model's predictions over those steps act as the forecast. The function
    prints the model label and the coefficient table, draws the diagnostic
    plots, plots non-dynamic predictions over the tail of the observed data
    together with their RMSE, then plots and prints the predictions for the
    appended steps with the bounds returned by ``conf_int()``.

    Parameters
    ----------
    sr : pandas.Series
        Observed series with a date index.
    order : tuple
        ``order`` argument for ``SARIMAX``.
    seasonal_order : tuple
        ``seasonal_order`` argument for ``SARIMAX``.
    pred_range : int
        Number of future steps to append and forecast.

    Returns
    -------
    None
    """
    print(f'SARIMAX{order}x{seasonal_order}')

    # Future time stamps: follow the frequency of the index when it has one,
    # otherwise fall back to calendar-month steps.
    index_freq = getattr(sr.index,'freq',None)
    if index_freq is not None:
        future_index = pd.date_range(start=sr.index[-1],periods=pred_range+1,freq=index_freq)[1:]
    else:
        future_index = [sr.index[-1] + relativedelta(months=x) for x in range(1,pred_range+1)]
    # Explicit float NaNs keep the dtype of the padding independent of the pandas version.
    padding = pd.Series(np.nan,index=future_index,dtype='float64')
    # pd.concat replaces Series.append, which no longer exists in pandas 2.
    data = pd.concat([sr,padding])

    model = SARIMAX(data,order=order,seasonal_order=seasonal_order,enforce_invertibility=True,enforce_stationarity=True)
    result = model.fit()
    print(result.summary().tables[1])
    result.plot_diagnostics(figsize=(30,15))

    # NOTE(review): the evaluation window starts at 80% of the *padded* length,
    # so its size depends on pred_range - confirm whether len(sr) was intended.
    eval_start = int(len(data)*0.8)
    test = result.get_prediction(start=eval_start,end=len(sr)-1,dynamic=False,full_results=False)
    test_ci = test.conf_int()

    f = plt.figure(figsize=(30,10))
    ax1 = f.add_subplot()
    ax1.plot(data)
    test.predicted_mean.plot(ax=ax1, label='One-step ahead Forecast', alpha=.7,color='r',linestyle='--')
    ax1.fill_between(test_ci.index,
                test_ci.iloc[:, 0],
                test_ci.iloc[:, 1], color='k', alpha=.2)

    y_forecasted = test.predicted_mean
    y_truth = data.iloc[eval_start:len(sr)]

    rmse = np.sqrt(((y_forecasted - y_truth) ** 2).mean())
    print(f'The Root Mean Squared Error of the forecast is {round(rmse, 2)}')

    # Predictions over the appended NaN steps are the forecast.
    forecast = result.get_prediction(start=len(data)-pred_range,dynamic=False)
    forecast_ci = forecast.conf_int()
    forecast.predicted_mean.plot(ax=ax1,color='green')
    ax1.fill_between(forecast_ci.index,
                forecast_ci.iloc[:, 0],
                forecast_ci.iloc[:, 1], alpha=.2)
    plt.legend()
    plt.show()

    # One line per forecast step: [first conf_int column, prediction, second conf_int column].
    # .items() replaces Series.iteritems; .iloc[i, j] avoids positional [] on a labelled row.
    for i,(indx,val) in enumerate(forecast.predicted_mean.items()):
        print(f'{indx.strftime("%Y-%m-%d")} : [{forecast_ci.iloc[i,0]:.2f}  {val:.2f}   {forecast_ci.iloc[i,1]:.2f}]')

In [None]:
# Keywords to pull from Google Trends; only the first one is modelled below.
kw = ['cat']
predict_range = 12
df = import_from_ggtrend(kw,start=2010,end=2022)

# Decomposition and residual correlograms for every keyword.
for keyword in kw:
    ts_lines(df[keyword])

# AIC grid search on the first keyword, then fit and forecast with the selected orders.
target_series = df[kw[0]]
optimal_order = grid_search(target_series,2,2,2,12)
best_order,best_seasonal_order = optimal_order
sarimax_fitting(target_series,best_order,best_seasonal_order,pred_range=predict_range)