In [1]:
import warnings
warnings.filterwarnings('ignore')

In [51]:
%matplotlib inline

import pandas as pd
import numpy as np

from statsmodels.api import OLS, add_constant, graphics
from statsmodels.graphics.tsaplots import plot_acf
from scipy.stats import norm

import seaborn as sns
import matplotlib.pyplot as plt
from typing import Tuple, List, Dict, Any

In [3]:
sns.set_style('whitegrid')
idx = pd.IndexSlice

In [5]:
DATA_STORE = "../data/linear.h5"

In [104]:
with pd.HDFStore(DATA_STORE) as store:
    data = (store['model_data']
            .dropna()
            .drop(['open', 'close', 'low', 'high'], axis=1))

In [105]:
def get_X_y(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    universe = data[data.dollar_vol_rank < 100]
    y = universe.filter(like='target')
    X = universe.drop(y.columns, axis=1)
    return X, y

X, y = get_X_y(data)

In [106]:
exog_max = np.max(np.asarray(X), axis=0)

In [107]:
data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 2559077 entries, ('FLWS', Timestamp('2013-07-03 00:00:00')) to ('ZTS', Timestamp('2017-11-29 00:00:00'))
Data columns (total 65 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   volume                 float64
 1   dollar_vol_raw         float64
 2   dollar_vol             float64
 3   dollar_vol_rank        float64
 4   rsi                    float64
 5   bb_high                float64
 6   bb_low                 float64
 7   atr                    float64
 8   macd                   float64
 9   return_1d              float64
 10  return_5d              float64
 11  return_10d             float64
 12  return_21d             float64
 13  return_42d             float64
 14  return_63d             float64
 15  return_1d_lag1         float64
 16  return_5d_lag1         float64
 17  return_10d_lag1        float64
 18  return_21d_lag1        float64
 19  return_1d_lag2         float64
 20  return_5d_lag2 

In [108]:
target = 'target_5d'
model = OLS(endog=np.asarray(y[target]), exog=add_constant(np.asarray(X)))
trained_model = model.fit()
trained_model.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,51.0
Date:,"Thu, 05 Oct 2023",Prob (F-statistic):,0.0
Time:,19:11:56,Log-Likelihood:,378610.0
No. Observations:,109675,AIC:,-757100.0
Df Residuals:,109614,BIC:,-756500.0
Df Model:,60,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-0.0038,0.000,-11.162,0.000,-0.005,-0.003
x1,-1.765e-12,1.86e-12,-0.949,0.343,-5.41e-12,1.88e-12
x2,1.738e-14,6.23e-14,0.279,0.780,-1.05e-13,1.39e-13
x3,1.399e-13,7.34e-14,1.905,0.057,-4.03e-15,2.84e-13
x4,2.632e-06,1.07e-06,2.470,0.014,5.44e-07,4.72e-06
x5,4.53e-05,5.26e-06,8.613,0.000,3.5e-05,5.56e-05
x6,0.0243,0.006,4.084,0.000,0.013,0.036
x7,-0.0091,0.006,-1.662,0.096,-0.020,0.002
x8,-0.0002,2.82e-05,-8.326,0.000,-0.000,-0.000

0,1,2,3
Omnibus:,43262.554,Durbin-Watson:,0.436
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2331651.874
Skew:,-1.122,Prob(JB):,0.0
Kurtosis:,25.477,Cond. No.,202000000000000.0


In [65]:
nsample = 100
x = np.linspace(0, 10, 100)
X = np.column_stack((x, x ** 2))
beta = np.array([1, 0.1, 10])
e = np.random.normal(size=nsample)


In [77]:
X = add_constant(X)
y = np.dot(X, beta) + e


In [81]:
X.shape, y.shape

((100, 3), (100,))

In [78]:
model = OLS(y, X)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 5.035e+06
Date:                Thu, 05 Oct 2023   Prob (F-statistic):          5.12e-244
Time:                        18:02:56   Log-Likelihood:                -135.25
No. Observations:                 100   AIC:                             276.5
Df Residuals:                      97   BIC:                             284.3
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.2470      0.279      4.463      0.0

In [93]:
results.params

array([ 1.24703772,  0.04839221, 10.00137247])

In [89]:
list(np.dot(results.params, X[i]) for i in range(100))

[1.247037715152377,
 1.3539702260401243,
 1.6649915538605649,
 2.180101698613699,
 2.8993006602995264,
 3.8225884389180473,
 4.949965034469262,
 6.281430446953168,
 7.816984676369769,
 9.556627722719062,
 11.500359586001053,
 13.648180266215734,
 16.000089763363107,
 18.556088077443174,
 21.316175208455935,
 24.280351156401387,
 27.448615921279536,
 30.820969503090375,
 34.39741190183391,
 38.17794311751014,
 42.16256315011907,
 46.35127199966067,
 50.74406966613499,
 55.34095614954198,
 60.14193144988168,
 65.14699556715406,
 70.35614850135916,
 75.76939025249692,
 81.38672082056739,
 87.20814020557054,
 93.2336484075064,
 99.46324542637494,
 105.8969312621762,
 112.53470591491015,
 119.37656938457675,
 126.42252167117609,
 133.6725627747081,
 141.1266926951728,
 148.7849114325702,
 156.6472189869003,
 164.7136153581631,
 172.98410054635852,
 181.45867455148672,
 190.1373373735476,
 199.0200890125412,
 208.10692946846737,
 217.39785874132636,
 226.89287683111803,
 236.59198373784238,


In [85]:
results.predict()

array([   1.24703772,    1.35397023,    1.66499155,    2.1801017 ,
          2.89930066,    3.82258844,    4.94996503,    6.28143045,
          7.81698468,    9.55662772,   11.50035959,   13.64818027,
         16.00008976,   18.55608808,   21.31617521,   24.28035116,
         27.44861592,   30.8209695 ,   34.3974119 ,   38.17794312,
         42.16256315,   46.351272  ,   50.74406967,   55.34095615,
         60.14193145,   65.14699557,   70.3561485 ,   75.76939025,
         81.38672082,   87.20814021,   93.23364841,   99.46324543,
        105.89693126,  112.53470591,  119.37656938,  126.42252167,
        133.67256277,  141.1266927 ,  148.78491143,  156.64721899,
        164.71361536,  172.98410055,  181.45867455,  190.13733737,
        199.02008901,  208.10692947,  217.39785874,  226.89287683,
        236.59198374,  246.49517946,  256.602464  ,  266.91383736,
        277.42929953,  288.14885053,  299.07249033,  310.20021896,
        321.5320364 ,  333.06794266,  344.80793774,  356.75202