## Vector AutoRegression

This class inherits from Base_Analysis_Class and provides some simple functionality for: 
    1. Checking stationarity of (multiple) time-series
    
    2. Plotting timeline, lag-plot, autocorrelation plot (you need to be in an IPython environment)
    
    3. Creating and fitting a VAR model based on the dataframe (index=time variable) provided from the 'timeline' class. Here, you can specify:
        - N-lags (by default, statsmodels chooses the most appropriate lag for you based on the Ljung-Box Q-score).
        
    

In [177]:
# Blueprint of the class VAR 
#requirements: 
from matplotlib import pyplot
import pandas as pd
from pandas.plotting import lag_plot
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.api import VAR as var
from statsmodels.tsa.stattools import adfuller, kpss

from core.analysis_base_class import Analysis


class VAR(Analysis):
    """
    Thin wrapper around statsmodels' VAR for a time-indexed dataframe.

    Provides stationarity checks (ADF and KPSS), model fitting, forecasting,
    residual diagnostics and a few exploratory plots. Every method reads the
    series from `self.df`, one column per series.

    NOTE(review): Analysis.__init__ is not called here; confirm the base
    class does not need initialising.
    """

    # Columns that follow the test-specific statistic in both summary tables.
    _SUMMARY_TAIL = ['p-value', 'Critical_val_1%', 'Critical_val_5%', 'Critical_val_10%']

    # Order in which plot() draws everything when no plot_type is given.
    _PLOT_TYPES = ('lag', 'line', 'autocorrelation')

    def __init__(self, df):
        """
        @df - dataframe with columns representing queries (eg count number of documents
               which mention a word or phrase); the index is the time variable
        """
        self.df = df
        self.temp = 0.0   ##

    def _summarise_test(self, test, stat_name, critical_pos):
        """
        Runs `test` on every column of self.df and tabulates the outcome.
        @test - callable taking one series and returning a tuple whose first two
                items are the statistic and the p-value
        @stat_name - name of the column holding the test statistic
        @critical_pos - position of the critical-values dict in the result tuple
        Returns: dataframe indexed by the column names of self.df
        """
        rows = []
        for name in self.df.columns:
            result = test(self.df[name])
            critical = result[critical_pos]
            rows.append({
                stat_name: result[0],
                'p-value': result[1],
                'Critical_val_1%': critical['1%'],
                'Critical_val_5%': critical['5%'],
                'Critical_val_10%': critical['10%'],
            })
        # Index by this instance's own columns (not by a module-level dataframe).
        return pd.DataFrame(rows, columns=[stat_name] + self._SUMMARY_TAIL,
                            index=self.df.columns)

    def _adf_test(self):
        """
        Augmented Dickey-Fuller test for every column.
        H_0: there is a unit root in the time series, hence it is non-stationary
        Returns: dataframe of summary of the test
        """
        return self._summarise_test(adfuller, 'ADF_Stat', 4)

    def _kpss_test(self):
        """
        KPSS test for every column.
        H_0: the observed time series is stationary
        Returns: dataframe of summary of the test
        """
        return self._summarise_test(kpss, 'KPSS_Stat', 3)

    def test_assumptions(self, level):
        """
        Checks stationarity of every series with the ADF and KPSS tests and prints,
        per series, whether each test considers it stationary.
        The test summaries are stored on self.summary_adf and self.summary_kpss.
        @level -  this is the level you are testing your assumptions on, as a percentage:
                  either a string such as '1%', '5%', '10%' or the bare number (1, 5, 10)
        Returns: boolean dataframe (index = series names, columns 'ADF' and 'KPSS'),
                 True where the test indicates stationarity
        """
        self.level = level
        lvl = float(str(level).strip().rstrip('%')) / 100

        self.summary_adf = self._adf_test()
        self.summary_kpss = self._kpss_test()

        flags = []
        for name in self.df.columns:
            # ADF: rejecting H_0 (unit root) means stationary.
            adf_flag = bool(lvl > self.summary_adf.loc[name, 'p-value'])
            # KPSS: failing to reject H_0 (stationarity) means stationary.
            kpss_flag = bool(lvl < self.summary_kpss.loc[name, 'p-value'])
            print("For {} stationarity is satisfied: ADF - {} | KPSS - {} ".format(name, adf_flag, kpss_flag))
            flags.append({'ADF': adf_flag, 'KPSS': kpss_flag})

        return pd.DataFrame(flags, columns=['ADF', 'KPSS'], index=self.df.columns)

    def fit(self, nlags=None):
        """
        This method fits a Vector AutoRegressive model to the timeline dataframe and
        prints the lag order preferred by the BIC.
        The model, fitted result and order selection are stored on self.model,
        self.result and self.order.
        @nlags -  number of lags to consider; forwarded as the first argument of
                  statsmodels' VAR.fit
        Returns: the fitted statsmodels result (same object as self.result)
        """
        self.model = var(self.df)
        self.result = self.model.fit(nlags)

        # Called without keywords so that it works on both statsmodels APIs:
        # older versions print the table themselves and return a dict, newer
        # versions return an object exposing `selected_orders` and `summary()`.
        self.order = self.model.select_order()
        if hasattr(self.order, 'selected_orders'):
            print(self.order.summary())
            bic_order = self.order.selected_orders['bic']
        else:
            bic_order = self.order['bic']
        print(bic_order)   ##selects lag based on information criteria
        return self.result

    def _fitted_result(self, action):
        """
        Returns self.result, raising a descriptive AttributeError if fit() has not run.
        @action - name of the calling method, used in the error message
        """
        result = getattr(self, 'result', None)
        if result is None:
            raise AttributeError("call fit() before {}()".format(action))
        return result

    def predict(self, d=5):
        """
        Makes forecasts based on the parameters of fitted model
        @ d - how many steps into the future you want to forecast
        Returns: dataframe with one row per forecast step and the columns of self.df
        Raises: AttributeError if fit() has not been called
        """
        result = self._fitted_result('predict')
        prediction_array = result.forecast(self.df.values, d)

        predictions = pd.DataFrame(prediction_array, columns=self.df.columns)
        print(predictions)     ###
        return predictions

    def interpretation(self, **kwargs):
        """
        This method should have the functionality to interpret the status of the model after
        being trained and also document the various design choices (i.e. parameters settings,
        assumptions, model selection, test method, dataset used). For example it can return a
        report-like looking formatted string.

        Please consider the following as possible model state interpretation:
           * For classification tasks depending on the underlying model: coefficient/feature
             weights, feature selection (random forest)
           * For clustering tasks: clusterings members/structure, distributions

        Raises: NotImplementedError - always; not implemented for VAR yet
        """
        raise NotImplementedError

    def diagnostics(self):
        """
        This method should have the functionality to report on the quality of the underlying
        (trained) model used for the analysis (on a dataset)

        Common diagnostics for VAR: check normality of residuals.
        Currently this only prints the residuals of the fitted model.
        Returns: the residuals of the fitted result
        Raises: AttributeError if fit() has not been called
        """
        residuals = self._fitted_result('diagnostics').resid
        print(residuals)
        return residuals

    def _lag_scatter(self, lag):
        """ Shows one lag plot per series, each in its own figure """
        for name in self.df.columns:
            print("Lag plot where y is {}".format(name))
            lag_plot(self.df[name], lag)
            pyplot.show()

    def _line_plot(self):
        """ Draws every series as a line and shows the figure once at the end """
        for name in self.df.columns:
            self.df[name].plot(legend=True)
        pyplot.show()

    def _autocorrelation_plot(self, lag):
        """ Shows one autocorrelation plot per series, each in its own figure """
        for name in self.df.columns:
            print("Autocorelation plot for {}".format(name))
            plot_acf(self.df[name], lags=lag)
            pyplot.show()

    def plot(self, plot_type=None, lag=1):
        """
        To be able to see the results this method requires an ipython environment run
        @plot_type - one of 'lag', 'line', 'autocorrelation'; None draws all three
        @lag - lag passed to the lag plot and, as `lags`, to the autocorrelation plot
        Raises: ValueError if plot_type is not one of the supported names
        """
        plotters = {
            'lag': lambda: self._lag_scatter(lag),
            'line': self._line_plot,
            'autocorrelation': lambda: self._autocorrelation_plot(lag),
        }

        if plot_type is None:
            for name in self._PLOT_TYPES:
                plotters[name]()
            return

        if plot_type not in plotters:
            # An unrecognised name used to draw nothing without any feedback.
            raise ValueError("unknown plot_type {!r}; expected one of {} or None".format(
                plot_type, ', '.join(repr(name) for name in self._PLOT_TYPES)))
        plotters[plot_type]()

## Testing the class

In [5]:
import numpy as np 
import pandas as pd

In [164]:
# Fit statsmodels' VAR directly (2 lags) on the sample dataframe `df` and
# display the fitted results object as the cell output.
m2 = var(df)
res = m2.fit(2)
res

<statsmodels.tsa.vector_ar.var_model.VARResultsWrapper at 0x290d7d77b00>

In [161]:
# Generate two arbitrary samples to work with and put them in a dataframe
# indexed by date.
import datetime

n_obs = 50  # number of observations in each sample

sample1 = np.random.gamma(4, 3, n_obs)
sample2 = np.random.binomial(10, 0.7, size=n_obs)

# Dataframe with counts of mentions of a particular word (two arrays); the
# gamma draws are truncated to integers so both columns look like counts.
df = pd.DataFrame({'x': sample1.astype('int'), 'y': sample2})

# Make the index a date object: one row per day, counting back from now, so
# the first row is the most recent day.
# NOTE(review): the index is therefore in reverse chronological order;
# confirm this is intended for the time-series model.
base = datetime.datetime.today()
date_list = [base - datetime.timedelta(days=x) for x in range(n_obs)]

df.index = date_list

In [179]:
# End-to-end run of the VAR wrapper class on the generated sample dataframe.
m1 = VAR(df)
m1.test_assumptions('1%')  # ADF / KPSS stationarity checks at the 1% level
m1.fit()                   # nlags left at its default (None)
#m1.predict()
#m1.plot(plot_type = 'autocorrelation' ,lag = 2)
m1.diagnostics()           # prints the residuals of the fitted model

For x stationarity is satisfied: ADF - True | KPSS - True 
For y stationarity is satisfied: ADF - True | KPSS - True 
                 VAR Order Selection                  
            aic          bic          fpe         hqic
------------------------------------------------------
0         4.461       4.545*       86.55*       4.491*
1         4.533        4.786        93.09        4.625
2         4.597        5.019        99.47        4.750
3         4.740        5.331        115.2        4.953
4         4.735        5.495        115.7        5.010
5         4.897        5.826        137.8        5.233
6         4.933        6.031        145.8        5.330
7         4.910        6.177        146.6        5.368
8         4.791        6.226        135.0        5.310
9         4.718        6.323        132.2        5.298
10       4.410*        6.184        103.9        5.051
* Minimum

0
                                    x         y
2017-11-20 23:07:43.261708  -4.702464  0.767851
201



Unnamed: 0,x,y
2017-11-15 12:57:19.273678,3,9
2017-11-14 12:57:19.273678,13,6
2017-11-13 12:57:19.273678,8,9
2017-11-12 12:57:19.273678,8,10
2017-11-11 12:57:19.273678,11,7
2017-11-10 12:57:19.273678,11,6
2017-11-09 12:57:19.273678,6,7
2017-11-08 12:57:19.273678,20,7
2017-11-07 12:57:19.273678,10,8
2017-11-06 12:57:19.273678,7,10
