In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [32]:
feature_month = pd.read_csv("../Features/" + str(4436) +"/priority2_month.csv")
feature_month.drop(['Unnamed: 0'], axis = 1, inplace=True)

target_month = pd.read_csv("../Features/" + str(4436) +"/priority1_month.csv")
target_month = target_month['net_revenue']

feature_month['target'] = target_month

feature_month.head()

Unnamed: 0,date,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
0,2012-09-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2012-10-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2012-11-30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2012-12-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2013-01-31,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
feature_year = pd.read_csv("../Features/" + str(4436) +"/priority2_year.csv")
feature_year.drop(['Unnamed: 0'], axis = 1, inplace=True)

target_year = pd.read_csv("../Features/" + str(4436) +"/priority1_year.csv")
target_year = target_year['net_revenue']

feature_year['target'] = target_year

feature_year.head()

Unnamed: 0,date,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
0,2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2014,0.0,0.0,0.0,106.416813,1.0,0.0,1.0,0.0
3,2015,0.294107,-1.128035,0.105472,-34.016982,0.036829,14.070895,0.193394,967309.1
4,2016,0.743492,-1.32956,0.347589,95.527727,-0.373424,4.561732,0.279759,2082686.0


# Introduction to Time Series Forcasting

Time series forecasting is a way to analyze patterns in time-dependent data. One example of time series forcasting is the ARIMA model. Since we are using features to predict revenue (exogenous features), we want to focus on multi-variate time series forecasting. 

Two primary models that we will be focusing on :

1. Traditional ARIMA --> Combines autoregressive models and moving average models to develop time-series forecasting.

2.  VAR --> Another Autoregressive model for predicting multivariate data

In [153]:
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error

from statsmodels.tsa.arima_model import ARIMA
import pmdarima as pm

In [45]:
def adf_test(series,title=''):
    """
    Pass in a time series and an optional title, returns an ADF report
    """
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC') # .dropna() handles differenced data
    labels = ['ADF test statistic','p-value','# lags used','# observations']
    out = pd.Series(result[0:4],index=labels)
    for key,val in result[4].items():
        out[f'critical value ({key})']=val
    print(out.to_string())          # .to_string() removes the line "dtype: float64"
    if result[1] <= 0.05:
        print("Strong evidence against the null hypothesis")
        print("Reject the null hypothesis")
        print("Data has no unit root and is stationary")
    else:
        print("Weak evidence against the null hypothesis")
        print("Fail to reject the null hypothesis")
        print("Data has a unit root and is non-stationary")

## Preprocess Data

In [87]:
feature_month['date'] = pd.to_datetime(feature_month.date , format = '%Y-%m-%d')
data_month = feature_month.drop(['date'], axis=1)
data_month.index = feature_month.date
data_month.index = pd.DatetimeIndex(data_month.index).to_period('M')


data_month.head()

Unnamed: 0_level_0,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [88]:
feature_year['date'] = pd.to_datetime(feature_year.date , format = '%Y')
data_year = feature_year.drop(['date'], axis=1)
data_year.index = feature_year.date
data_year.index = pd.DatetimeIndex(data_year.index).to_period('Y')

data_year.head()

Unnamed: 0_level_0,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014,0.0,0.0,0.0,106.416813,1.0,0.0,1.0,0.0
2015,0.294107,-1.128035,0.105472,-34.016982,0.036829,14.070895,0.193394,967309.1
2016,0.743492,-1.32956,0.347589,95.527727,-0.373424,4.561732,0.279759,2082686.0


In [89]:
for column in data_month.columns:
    print(f"{column}")
    adc = adf_test(data_month[column])
    print("")

current_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic       -2.909999
p-value                   0.044198
# lags used               2.000000
# observations          112.000000
critical value (1%)      -3.490131
critical value (5%)      -2.887712
critical value (10%)     -2.580730
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary

debt_equity_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic     -1.059868e+01
p-value                 6.252739e-19
# lags used             0.000000e+00
# observations          1.140000e+02
critical value (1%)    -3.489058e+00
critical value (5%)    -2.887246e+00
critical value (10%)   -2.580481e+00
Strong evidence against the null hypothesis
Reject the null hypothesis
Data has no unit root and is stationary

acid_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic       -3.648477
p-value                   0.004899
# lags used               2.000000
# observations          112.000

In [90]:
data_month['target'] = data_month['target'].diff()
data_month['debt_asset_ratio'] = data_month['debt_asset_ratio'].diff()

data_month = data_month.fillna(0.0)
data_month.head()

Unnamed: 0_level_0,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012-09,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2012-12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [91]:
for column in data_year.columns:
    print(f"{column}")
    adc = adf_test(data_year[column])
    print("")

current_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic     -1.368281
p-value                 0.597385
# lags used             0.000000
# observations          9.000000
critical value (1%)    -4.473135
critical value (5%)    -3.289881
critical value (10%)   -2.772382
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary

debt_equity_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic     -1.205516
p-value                 0.671219
# lags used             0.000000
# observations          9.000000
critical value (1%)    -4.473135
critical value (5%)    -3.289881
critical value (10%)   -2.772382
Weak evidence against the null hypothesis
Fail to reject the null hypothesis
Data has a unit root and is non-stationary

acid_ratio
Augmented Dickey-Fuller Test: 
ADF test statistic     -1.282501
p-value                 0.637140
# lags used             0.000000
# observations          9.000000
critical value (1%)    -4.47

In [92]:
data_year['target'] = data_year['target'].diff()
data_year['debt_asset_ratio'] = data_year['debt_asset_ratio'].diff()
data_year['net_profit_margin'] = data_year['net_profit_margin'].diff()
data_year['acid_ratio'] = data_year['acid_ratio'].diff()
data_year['debt_equity_ratio'] = data_year['debt_equity_ratio'].diff()
data_year['current_ratio'] = data_year['current_ratio'].diff()

data_year = data_year.fillna(0.0)
data_year.head()

Unnamed: 0_level_0,current_ratio,debt_equity_ratio,acid_ratio,net_profit_margin,return_on_equity,debt_asset_ratio,gross_profit_margin,target
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2012,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2013,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2014,0.0,0.0,0.0,106.416813,1.0,0.0,1.0,0.0
2015,0.294107,-1.128035,0.105472,-140.433796,0.036829,14.070895,0.193394,967309.1
2016,0.449385,-0.201525,0.242117,129.544709,-0.373424,-9.509163,0.279759,1115376.9


In [150]:
train_m = data_month[:int(0.8*(len(data_month)))]
valid_m = data_month[int(0.8*(len(data_month))):]

train_y = data_year[:int(0.8*(len(data_year)))]
valid_y = data_year[int(0.8*(len(data_year))):]

## VAR Model

In [125]:
model_m = VAR(train_m)
model_y = VAR(train_y)

results_m = model_m.fit()
results_y = model_y.fit()

In [139]:
prediction_m = results_m.forecast(results_m.y, steps=len(valid_m))
prediction_y = results_y.forecast(results_y.y, steps=len(valid_y))

  obj = getattr(results, attr)


In [149]:
#converting predictions to dataframe
cols = data_month.columns
pred_m = pd.DataFrame(index=range(0,len(prediction_m)),columns=[cols])
for j in range(0,len(cols)):
    for i in range(0, len(prediction_m)):
       pred_m.iloc[i][j] = prediction_m[i][j]

pred_temp_m = pred_m['target']

cols = data_year.columns
pred_y = pd.DataFrame(index=range(0,len(prediction_y)),columns=[cols])
for j in range(0,len(cols)):
    for i in range(0, len(prediction_y)):
       pred_y.iloc[i][j] = prediction_y[i][j]

pred_temp_y = pred_y['target']

print('[month] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_temp_m, valid_m['target'])))
print('[year] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_temp_y, valid_y['target'])))

[month] rmse value for Revenue is :  7623.232643931527
[year] rmse value for Revenue is :  435107.08833333856


## ARIMA Model

In [161]:
other = train_m.drop(['target'], axis = 1)

model_m = pm.auto_arima(train_m.target, exogenous = other, start_p=1, start_q=1,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series 
                      d=None,           # let model determine 'd'
                      seasonal=False,   # No Seasonality
                      start_P=0, 
                      D=0, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)

print(model_m.summary())

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=2310.137, Time=0.18 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=2656.174, Time=0.01 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=2311.866, Time=0.10 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=2308.437, Time=0.05 sec
 ARIMA(0,0,2)(0,0,0)[0]             : AIC=2310.460, Time=0.13 sec
 ARIMA(1,0,2)(0,0,0)[0]             : AIC=2306.353, Time=0.24 sec
 ARIMA(2,0,2)(0,0,0)[0]             : AIC=2314.011, Time=0.23 sec
 ARIMA(1,0,3)(0,0,0)[0]             : AIC=inf, Time=0.17 sec
 ARIMA(0,0,3)(0,0,0)[0]             : AIC=2312.762, Time=0.10 sec
 ARIMA(2,0,1)(0,0,0)[0]             : AIC=2311.879, Time=0.17 sec
 ARIMA(2,0,3)(0,0,0)[0]             : AIC=inf, Time=0.28 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.26 sec

Best model:  ARIMA(1,0,2)(0,0,0)[0]          
Total fit time: 1.910 seconds
                               SARIMAX Results                                
Dep. Variable:          

In [166]:
other = train_y.drop(['target'], axis = 1)

model_y = pm.auto_arima(train_y.target, exogenous = other, start_p=1, start_q=1,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series
                      d=None,           # let model determine 'd'
                      seasonal=False,   # No Seasonality
                      start_P=0, 
                      D=0, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)

print(model_y.summary())

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0]             : AIC=-148.500, Time=0.07 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=-153.503, Time=0.04 sec
 ARIMA(1,0,0)(0,0,0)[0]             : AIC=-151.030, Time=0.04 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=-149.578, Time=0.04 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=-151.503, Time=0.05 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0]          
Total fit time: 0.238 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                    8
Model:                        SARIMAX   Log Likelihood                  84.752
Date:                Tue, 23 Nov 2021   AIC                           -153.503
Time:                        15:31:17   BIC                           -152.868
Sample:                    12-31-2012   HQIC                          -157.790
                         - 12-31-2019                                       

In [168]:
other = valid_m.drop(['target'], axis = 1)
pred_m = model_m.predict(n_periods = len(valid_m), exogenous=other)

other = valid_y.drop(['target'], axis = 1)
pred_y = model_y.predict(n_periods = len(valid_y), exogenous=other)

print('[month] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_m, valid_m['target'])))
print('[year] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_y, valid_y['target'])))

[month] rmse value for Revenue is :  34039.083516641644
[year] rmse value for Revenue is :  325865.1392977933


## SARIMA Model

In [175]:
other = train_m.drop(['target'], axis = 1)

smodel_m = pm.auto_arima(train_m.target, exogenous = other, start_p=1, start_q=1,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series
                      d=None,           # let model determine 'd'
                      seasonal=True,   # Yes Seasonality
                      start_P=0, 
                      D=0, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)

print(model_m.summary())

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=2308.279, Time=0.19 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=2332.940, Time=0.02 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=2312.903, Time=0.10 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=2306.428, Time=0.09 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=2656.174, Time=0.01 sec
 ARIMA(0,0,2)(0,0,0)[0] intercept   : AIC=2309.037, Time=0.17 sec
 ARIMA(1,0,2)(0,0,0)[0] intercept   : AIC=inf, Time=0.29 sec
 ARIMA(0,0,1)(0,0,0)[0]             : AIC=2308.437, Time=0.06 sec

Best model:  ARIMA(0,0,1)(0,0,0)[0] intercept
Total fit time: 0.942 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                   92
Model:               SARIMAX(1, 0, 2)   Log Likelihood               -1142.176
Date:                Tue, 23 Nov 2021   AIC                           2306.353
Time:                        15:35:07   B

In [176]:
other = train_y.drop(['target'], axis = 1)

smodel_y = pm.auto_arima(train_y.target, exogenous = other, start_p=1, start_q=1,
                      test='adf',       # use adftest to find optimal 'd'
                      max_p=3, max_q=3, # maximum p and q
                      m=1,              # frequency of series
                      d=None,           # let model determine 'd'
                      seasonal=True,   # Yes Seasonality
                      start_P=0, 
                      D=0, 
                      trace=True,
                      error_action='ignore',  
                      suppress_warnings=True, 
                      stepwise=True)

print(model_y.summary())

Performing stepwise search to minimize aic
 ARIMA(1,0,1)(0,0,0)[0] intercept   : AIC=-146.366, Time=0.07 sec
 ARIMA(0,0,0)(0,0,0)[0] intercept   : AIC=-151.503, Time=0.04 sec
 ARIMA(1,0,0)(0,0,0)[0] intercept   : AIC=-148.981, Time=0.05 sec
 ARIMA(0,0,1)(0,0,0)[0] intercept   : AIC=-149.503, Time=0.05 sec
 ARIMA(0,0,0)(0,0,0)[0]             : AIC=-153.503, Time=0.04 sec

Best model:  ARIMA(0,0,0)(0,0,0)[0]          
Total fit time: 0.257 seconds
                               SARIMAX Results                                
Dep. Variable:                      y   No. Observations:                    8
Model:                        SARIMAX   Log Likelihood                  84.752
Date:                Tue, 23 Nov 2021   AIC                           -153.503
Time:                        15:35:19   BIC                           -152.868
Sample:                    12-31-2012   HQIC                          -157.790
                         - 12-31-2019                                       

  return np.roots(self.polynomial_reduced_ma)**-1


In [177]:
other = valid_m.drop(['target'], axis = 1)
pred_m = smodel_m.predict(n_periods = len(valid_m), exogenous=other)

other = valid_y.drop(['target'], axis = 1)
pred_y = smodel_y.predict(n_periods = len(valid_y), exogenous=other)

print('[month] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_m, valid_m['target'])))
print('[year] rmse value for Revenue is : ', np.sqrt(mean_squared_error(pred_y, valid_y['target'])))

[month] rmse value for Revenue is :  27402.924256223556
[year] rmse value for Revenue is :  325865.1392977933


# Why Values are So High

In [173]:
valid_m['target']

date
2019-08    0.0
2019-09    0.0
2019-10    0.0
2019-11    0.0
2019-12    0.0
2020-01    0.0
2020-02    0.0
2020-03    0.0
2020-04    0.0
2020-05    0.0
2020-06    0.0
2020-07    0.0
2020-08    0.0
2020-09    0.0
2020-10    0.0
2020-11    0.0
2020-12    0.0
2021-01    0.0
2021-02    0.0
2021-03    0.0
2021-04    0.0
2021-06    0.0
2021-07    0.0
Freq: M, Name: target, dtype: float64

In [174]:
valid_y['target']

date
2020    0.0
2021    0.0
Freq: A-DEC, Name: target, dtype: float64