In [2]:
%load_ext kedro.extras.extensions.ipython
%reload_kedro /Users/yeungadrian/Documents/repo/TimeSeries

2022-06-07 22:51:40,053 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-06-07 22:51:42,065 - root - INFO - ** Kedro project timeseries
2022-06-07 22:51:42,067 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`
2022-06-07 22:51:42,352 - kedro.framework.session.store - INFO - `read()` not implemented for `BaseSessionStore`. Assuming empty store.
2022-06-07 22:51:42,441 - root - INFO - ** Kedro project timeseries
2022-06-07 22:51:42,443 - root - INFO - Defined global variable `context`, `session`, `catalog` and `pipelines`


## To Do:
- Individual stocks vs portfolio
- Individual stock
  - Choose factors
  - Choose frequency
  - Choose timescale
- Portfolio

## Libraries

In [3]:
import numpy as np
import pandas as pd
from pydantic import BaseModel
import statsmodels.formula.api as smf
from typing import List, TypeVar, Optional, Dict

In [4]:
PandasDataFrame = TypeVar("pandas.core.frame.DataFrame")

## Data

In [5]:
price_df = catalog.load("fund_prices")
price_df

2022-06-07 22:51:45,312 - kedro.io.data_catalog - INFO - Loading data from `fund_prices` (ParquetDataSet)...


Unnamed: 0,date,ABMD,ATVI,AMD,AMZN,AAPL,DTE,EBAY
0,2020-12-31,324.200,92.850000,91.710,3256.93,132.690000,121.410000,50.250000
1,2020-12-30,323.920,91.580000,92.290,3285.85,133.720000,119.660000,50.550000
2,2020-12-29,320.930,91.370000,90.620,3322.00,134.870000,119.240000,50.860000
3,2020-12-28,312.910,91.430000,91.600,3283.96,136.690000,119.230000,50.240000
4,2020-12-24,303.410,90.960000,91.810,3172.69,131.970000,119.720000,50.120000
...,...,...,...,...,...,...,...,...
5322,1999-11-05,11.315,1.148739,10.625,64.94,0.680100,13.626682,6.904808
5323,1999-11-04,10.470,1.091975,10.315,63.06,0.643981,13.360490,6.733871
5324,1999-11-03,10.250,1.083012,10.655,65.81,0.627654,13.626682,6.583654
5325,1999-11-02,10.315,1.064339,10.280,66.44,0.618028,13.626682,6.801210


In [6]:
ff_factors = catalog.load("ff_5_factor")
ff_factors

2022-06-07 22:51:45,713 - kedro.io.data_catalog - INFO - Loading data from `ff_5_factor` (CSVDataSet)...


Unnamed: 0,date,Mkt-RF,SMB,HML,RMW,CMA,RF
0,19630701,-0.67,0.01,-0.35,0.03,0.11,0.012
1,19630702,0.79,-0.31,0.24,-0.08,-0.25,0.012
2,19630703,0.63,-0.16,-0.09,0.13,-0.24,0.012
3,19630705,0.40,0.09,-0.26,0.07,-0.28,0.012
4,19630708,-0.63,0.07,-0.19,-0.27,0.06,0.012
...,...,...,...,...,...,...,...
14805,20220425,0.71,0.12,-1.84,0.10,-1.07,0.000
14806,20220426,-3.00,-0.34,1.79,0.21,1.21,0.000
14807,20220427,0.12,-0.56,0.33,0.15,0.81,0.000
14808,20220428,2.44,-0.50,-0.45,0.47,0.09,0.000


In [7]:
pd.to_datetime(ff_factors.date,format='%Y%m%d')

0       1963-07-01
1       1963-07-02
2       1963-07-03
3       1963-07-05
4       1963-07-08
           ...    
14805   2022-04-25
14806   2022-04-26
14807   2022-04-27
14808   2022-04-28
14809   2022-04-29
Name: date, Length: 14810, dtype: datetime64[ns]

## Factor Regression

In [32]:
class Portfolio(BaseModel):
    fund_codes: List[str]
    start_date: str
    end_date: str
    price_df: PandasDataFrame
    french_fama_df: PandasDataFrame

    def prepare_data(self):
        self.price_df["date"] = pd.to_datetime(self.price_df["date"])
        self.price_df = self.price_df.sort_values(by="date").reset_index(drop=True)
        self.price_df = self.price_df.loc[
            (self.price_df.date >= self.start_date) & (self.price_df.date <= self.end_date)
        ].reset_index(drop=True)
        self.french_fama_df.date = pd.to_datetime(ff_factors.date,format='%Y%m%d')
        self.french_fama_df = self.french_fama_df.rename(columns={'Mkt-RF':'Mkt'})

    def calculate_returns(self):

        columns = ['date'] + self.fund_codes
        subset_data = self.price_df[columns]
        for i in self.fund_codes:
            subset_data[i] = (subset_data[i] / subset_data[i].shift()) - 1

        subset_data = subset_data.dropna()


        return subset_data

    def get_summary_results(self,results, fund_code):
        """take the result of an statsmodel results table and transforms it into a dataframe
        https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.html"""
        pvals = results.pvalues
        coefficient = results.params
        conf_lower = results.conf_int()[0]
        conf_higher = results.conf_int()[1]
        standard_errors = results.bse
        residuals = results.resid
        num_obs = results.nobs
        rsquared = results.rsquared
        rsquard_adj = results.rsquared_adj
        fvalue = results.fvalue

        output_result = {
            "fundCode": fund_code,
            "numberObservations": num_obs,
            "rSquared": rsquared,
            "rSquaredAdjusted": rsquard_adj,
            "fValue": fvalue,
            "coefficient": coefficient,
            "standardErrors": standard_errors,
            "pValues": pvals,
            "confidenceIntervalLower": conf_lower,
            "confidenceIntervalHigher": conf_higher,
            "residuals": residuals,
        }

        return output_result

    def calculate_factor_regression(
        self,
        fund_code,
        regression_factors,
        frenchfama_Factors,
        historical_returns
    ):

        np.random.seed(1000)

        regression_equation = " + ".join(regression_factors)

        historical_returns = historical_returns.set_index("date")
        historical_returns.index.name = None

        frenchfama_Factors = frenchfama_Factors.set_index("date")
        frenchfama_Factors.index.name = None

        regression_data = pd.concat(
            [historical_returns, frenchfama_Factors], axis=1, join="inner"
        )

        regression_data[fund_code] = regression_data[fund_code] - regression_data["RF"]

        model = smf.ols(
            formula=f"{fund_code} ~ {regression_equation}", data=regression_data
        )

        results = model.fit()

        output = self.get_summary_results(results, fund_code)

        return output

    def regress_funds(self):
        fund_returns = self.calculate_returns()
        output = []

        for i in self.fund_codes:
            output.append(self.calculate_factor_regression(
                i,
                ['Mkt','SMB','HML','RMW'],
                self.french_fama_df,
                fund_returns
            ))
        return output



In [33]:
external_data = {
    "fund_codes": ["EBAY", "AAPL"],
    "start_date": "2010-12-27",
    "end_date": "2019-12-31",
    "price_df": price_df,
    "french_fama_df": ff_factors

}

In [34]:
mock_portfolio = Portfolio(**external_data)

In [35]:
mock_portfolio.prepare_data()
mock_portfolio.regress_funds()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_data[i] = (subset_data[i] / subset_data[i].shift()) - 1


[{'fundCode': 'EBAY',
  'numberObservations': 2268.0,
  'rSquared': 0.3269123661302993,
  'rSquaredAdjusted': 0.3257226398662786,
  'fValue': 274.7794816477351,
  'coefficient': Intercept   -0.002117
  Mkt          0.010811
  SMB          0.000184
  HML         -0.002710
  RMW         -0.001194
  dtype: float64,
  'standardErrors': Intercept    0.000312
  Mkt          0.000375
  SMB          0.000664
  HML          0.000630
  RMW          0.001041
  dtype: float64,
  'pValues': Intercept     1.509981e-11
  Mkt          4.880540e-156
  SMB           7.813795e-01
  HML           1.753485e-05
  RMW           2.513008e-01
  dtype: float64,
  'confidenceIntervalLower': Intercept   -0.002729
  Mkt          0.010076
  SMB         -0.001118
  HML         -0.003944
  RMW         -0.003235
  Name: 0, dtype: float64,
  'confidenceIntervalHigher': Intercept   -0.001505
  Mkt          0.011546
  SMB          0.001487
  HML         -0.001475
  RMW          0.000847
  Name: 1, dtype: float64,
  'resi

In [37]:
a = mock_portfolio.regress_funds()[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  subset_data[i] = (subset_data[i] / subset_data[i].shift()) - 1
