In [1]:
# Load Kedro's IPython extension, then (re)load the Kedro project so that the
# globals `context`, `session`, `catalog` and `pipelines` are defined for the cells below.
# NOTE(review): the project path is an absolute, machine-specific location; the notebook
# will not run elsewhere without editing it — consider a path relative to the repo root.
%load_ext kedro.extras.extensions.ipython
%reload_kedro /Users/yeungadrian/Documents/repo/TimeSeries

2022-06-06 17:25:48,970 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-06-06 17:25:50,335 - root - INFO - ** Kedro project timeseries
2022-06-06 17:25:50,339 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-06-06 17:25:50,993 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-06-06 17:25:51,208 - root - INFO - ** Kedro project timeseries
2022-06-06 17:25:51,214 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`


## To Do:
- Individual stocks vs portfolio
- Individual stock
  - Choose factors
  - Choose frequency
  - Choose timescale
- Portfolio

## Libraries

In [2]:
import numpy as np
import pandas as pd
from pydantic import BaseModel
import statsmodels.formula.api as smf
from typing import List, TypeVar, Optional, Dict

In [3]:
# Placeholder type used to annotate the DataFrame-valued fields of the pydantic model below.
# NOTE(review): this is an unconstrained TypeVar whose string name is only a label, so it
# presumably triggers no DataFrame validation in pydantic — confirm for the installed version.
PandasDataFrame = TypeVar("pandas.core.frame.DataFrame")

## Data

In [4]:
# Load the `fund_prices` dataset through the Kedro data catalog (`catalog` is the global
# defined by the Kedro IPython extension) and display it.
price_df = catalog.load("fund_prices")
price_df

2022-06-06 17:26:19,007 - kedro.io.data_catalog - INFO - Loading data from `fund_prices` (ParquetDataSet)...


Unnamed: 0,date,ABMD,ATVI,AMD,AMZN,AAPL,DTE,EBAY
0,2020-12-31,324.200,92.850000,91.710,3256.93,132.690000,121.410000,50.250000
1,2020-12-30,323.920,91.580000,92.290,3285.85,133.720000,119.660000,50.550000
2,2020-12-29,320.930,91.370000,90.620,3322.00,134.870000,119.240000,50.860000
3,2020-12-28,312.910,91.430000,91.600,3283.96,136.690000,119.230000,50.240000
4,2020-12-24,303.410,90.960000,91.810,3172.69,131.970000,119.720000,50.120000
...,...,...,...,...,...,...,...,...
5322,1999-11-05,11.315,1.148739,10.625,64.94,0.680100,13.626682,6.904808
5323,1999-11-04,10.470,1.091975,10.315,63.06,0.643981,13.360490,6.733871
5324,1999-11-03,10.250,1.083012,10.655,65.81,0.627654,13.626682,6.583654
5325,1999-11-02,10.315,1.064339,10.280,66.44,0.618028,13.626682,6.801210


In [5]:
# Load the `ff_5_factor` dataset (factor columns plus `RF`) through the Kedro data catalog
# and display it.
ff_factors = catalog.load("ff_5_factor")
ff_factors

2022-06-06 17:26:19,449 - kedro.io.data_catalog - INFO - Loading data from `ff_5_factor` (CSVDataSet)...


Unnamed: 0,date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,19630701,-0.67,0.01,-0.35,0.03,0.11,0.012
1,19630702,0.79,-0.31,0.24,-0.08,-0.25,0.012
2,19630703,0.63,-0.16,-0.09,0.13,-0.24,0.012
3,19630705,0.40,0.09,-0.26,0.07,-0.28,0.012
4,19630708,-0.63,0.07,-0.19,-0.27,0.06,0.012
...,...,...,...,...,...,...,...
14805,20220425,0.71,0.12,-1.84,0.10,-1.07,0.000
14806,20220426,-3.00,-0.34,1.79,0.21,1.21,0.000
14807,20220427,0.12,-0.56,0.33,0.15,0.81,0.000
14808,20220428,2.44,-0.50,-0.45,0.47,0.09,0.000


In [10]:
# Preview only: parse the YYYYMMDD `date` column into datetimes. The result is displayed
# but not assigned, so `ff_factors` itself is left unchanged by this cell.
pd.to_datetime(ff_factors.date,format='%Y%m%d')

0       1963-07-01
1       1963-07-02
2       1963-07-03
3       1963-07-05
4       1963-07-08
           ...    
14805   2022-04-25
14806   2022-04-26
14807   2022-04-27
14808   2022-04-28
14809   2022-04-29
Name: date, Length: 14810, dtype: datetime64[ns]

## Factor Regression

In [286]:
class Portfolio(BaseModel):
    """A set of funds together with the price and factor data used to analyse them.

    Typical use: construct the model, call ``prepare_data()`` once, then call
    ``calculate_factor_regression()`` for each fund of interest.
    """

    # Column names in ``price_df`` identifying the funds held.
    fund_codes: List[str]
    # Amount held per fund; not read by any method defined on this class.
    # NOTE(review): presumably aligned position-by-position with ``fund_codes`` — confirm.
    fund_amounts: List[float]
    # Inclusive bounds of the analysis window, as date strings pandas can parse
    # (e.g. "2010-12-27").
    start_date: str
    end_date: str
    # Price table with a ``date`` column and one price column per fund code.
    price_df: PandasDataFrame
    # Factor table with a ``date`` column (YYYYMMDD or datetime), factor columns and ``RF``.
    french_fama_df: PandasDataFrame

    def _with_parsed_dates(self, factors):
        """Return ``factors`` with its ``date`` column as datetimes.

        Values that are not already datetimes are parsed with the ``%Y%m%d`` format.
        The input frame is never modified; a new frame is returned when parsing is needed.
        """
        dates = factors["date"]
        if pd.api.types.is_datetime64_any_dtype(dates):
            return factors
        return factors.assign(date=pd.to_datetime(dates, format="%Y%m%d"))

    def prepare_data(self):
        """Normalise the stored frames so they can be joined on ``date``.

        * ``price_df``: ``date`` is converted to datetimes, rows outside
          ``[start_date, end_date]`` (inclusive) are dropped, and the remaining rows are
          sorted by ascending date with a fresh 0..n-1 index.
        * ``french_fama_df``: ``date`` is converted from YYYYMMDD to datetimes.

        The frames passed to the constructor are not mutated (new frames are stored on
        the model), and calling this method more than once is safe.
        """
        prices = self.price_df.assign(date=pd.to_datetime(self.price_df["date"]))
        in_window = prices["date"].between(
            pd.Timestamp(self.start_date), pd.Timestamp(self.end_date)
        )
        self.price_df = (
            prices.loc[in_window].sort_values(by="date").reset_index(drop=True)
        )
        self.french_fama_df = self._with_parsed_dates(self.french_fama_df)

    def calculate_returns(self, fund_codes=None):
        """Compute simple period-over-period returns for the given funds.

        Args:
            fund_codes: Fund code or iterable of fund codes (columns of ``price_df``).
                Defaults to ``self.fund_codes``.

        Returns:
            DataFrame with a ``date`` column followed by one column per fund code
            holding ``price_t / price_{t-1} - 1``. Rows containing a missing value
            (including the first row, which has no previous price) are dropped.
        """
        if fund_codes is None:
            codes = list(self.fund_codes)
        elif isinstance(fund_codes, str):
            # A bare string would otherwise be iterated character by character.
            codes = [fund_codes]
        else:
            codes = list(fund_codes)

        # Returns are only meaningful on chronologically ordered prices, so sort here
        # rather than relying on ``prepare_data`` having been called.
        prices = self.price_df.sort_values(by="date")[["date", *codes]]
        returns = prices[codes] / prices[codes].shift() - 1
        returns.insert(0, "date", prices["date"])

        return returns.dropna().reset_index(drop=True)

    def get_summary_results(self, results, fund_code):
        """Flatten a fitted statsmodels regression result into a plain dict.

        See
        https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.html

        Args:
            results: Fitted regression results object (e.g. from ``smf.ols(...).fit()``).
            fund_code: Identifier stored under ``"fundCode"`` in the output.

        Returns:
            Dict with keys ``fundCode``, ``numberObservations``, ``rSquared``,
            ``rSquaredAdjusted``, ``fValue``, ``coefficient``, ``standardErrors``,
            ``pValues``, ``confidenceIntervalLower``, ``confidenceIntervalHigher`` and
            ``residuals``; values are whatever statsmodels returns for each attribute.
        """
        # Compute the confidence intervals once; column 0 is the lower bound, 1 the upper.
        confidence_intervals = results.conf_int()

        return {
            "fundCode": fund_code,
            "numberObservations": results.nobs,
            "rSquared": results.rsquared,
            "rSquaredAdjusted": results.rsquared_adj,
            "fValue": results.fvalue,
            "coefficient": results.params,
            "standardErrors": results.bse,
            "pValues": results.pvalues,
            "confidenceIntervalLower": confidence_intervals[0],
            "confidenceIntervalHigher": confidence_intervals[1],
            "residuals": results.resid,
        }

    def calculate_factor_regression(
        self,
        fund_code,
        regression_factors,
        frenchfama_Factors=None,
    ):
        """Regress one fund's excess returns on the chosen factors with OLS.

        Args:
            fund_code: Column of ``price_df`` to analyse. It is interpolated into the
                regression formula, so it must be usable as a formula term.
            regression_factors: Factor terms joined with ``" + "`` to form the
                right-hand side of the formula.
                NOTE(review): column names that are not plain identifiers (for example
                one containing ``-``) presumably need quoting in the formula — confirm
                against the patsy/statsmodels formula documentation.
            frenchfama_Factors: Factor table with ``date`` and ``RF`` columns plus the
                requested factors. Defaults to ``self.french_fama_df``.

        Returns:
            The dict produced by ``get_summary_results`` for the fitted model.

        Raises:
            ValueError: If the fund returns and the factor table share no dates.
        """
        # Retained from the original implementation; note that this resets NumPy's
        # global random state on every call.
        np.random.seed(1000)

        if frenchfama_Factors is None:
            frenchfama_Factors = self.french_fama_df

        regression_equation = " + ".join(regression_factors)

        historical_returns = self.calculate_returns([fund_code]).set_index("date")
        historical_returns.index.name = None

        # Parse dates here as well so a raw (YYYYMMDD) factor table can be passed in.
        factors = self._with_parsed_dates(frenchfama_Factors).set_index("date")
        factors.index.name = None

        # Inner join keeps only the dates present in both tables.
        regression_data = pd.concat(
            [historical_returns, factors], axis=1, join="inner"
        )
        if regression_data.empty:
            raise ValueError(
                f"No overlapping dates between returns for {fund_code!r} and the "
                "factor data; was prepare_data() called?"
            )

        # Excess return over the risk-free rate.
        # NOTE(review): the factor table displayed in this notebook looks like it is in
        # percent while the returns above are fractions — confirm units and rescale.
        regression_data[fund_code] = regression_data[fund_code] - regression_data["RF"]

        model = smf.ols(
            formula=f"{fund_code} ~ {regression_equation}", data=regression_data
        )
        results = model.fit()

        return self.get_summary_results(results, fund_code)







In [287]:
# Constructor arguments for the example portfolio: two funds with equal amounts,
# the analysis window, and the price / factor frames loaded earlier in the notebook.
external_data = dict(
    fund_codes=["EBAY", "AAPL"],
    fund_amounts=[30, 30],
    start_date="2010-12-27",
    end_date="2019-12-31",
    price_df=price_df,
    french_fama_df=ff_factors,
)

In [288]:
# Build the example portfolio from the keyword arguments assembled in `external_data`.
mock_portfolio = Portfolio(**external_data)

In [289]:
# Prepare the stored frames, then run the backtest and display its result.
# NOTE(review): `backtest_strategy` is not defined on the `Portfolio` class shown in this
# notebook, so this cell cannot run from a fresh kernel; the output below also starts at
# 2012-12-27 although `start_date` is "2010-12-27" — it looks stale, confirm and re-run.
mock_portfolio.prepare_data()
mock_portfolio.backtest_strategy()

Unnamed: 0,EBAY,AAPL,portfolio,date
0,30.000000,30.000000,60.000000,2012-12-27
1,29.713661,29.681350,59.395011,2012-12-28
2,30.422171,30.996752,61.418923,2012-12-31
0,30.709462,30.709462,61.418923,2012-12-31
1,32.270476,31.682214,63.952690,2013-01-02
...,...,...,...,...
17,88.028200,91.850086,179.878286,2019-12-24
18,88.125442,93.672419,181.797861,2019-12-26
19,87.687853,93.636877,181.324730,2019-12-27
20,87.031471,94.192624,181.224095,2019-12-30
