# Compute FF 3-Factors and Factor Exposures

In [2]:
import IPython
IPython.auto_scroll_threshold = 9999

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
from IPython.display import display, HTML

%load_ext autoreload
%autoreload 2
%load_ext autotime

In [3]:
from pyspark.sql.functions import *
from pyspark.sql import Window

time: 224 ms


In [4]:
from spark import init_spark_context, load_and_get_table_df

sc, sql_context = init_spark_context("Factor Analysis Job - FF Risk Factors")

time: 4.04 s


## Dates of the analysis

The following are the dates we will use to conduct our analysis

In [5]:
start_analysis_date = "2005-01-01"
end_analysis_date = "2015-12-31"

last_stock_exchange_date = "2015-12-30"
last_fundamentals_date = "2015-12-31"

time: 1.54 ms


## Using the Quarterly calendar

We will work with a quarterly trading calendar. Computing the returns and factors using this periodicity.

In [6]:
# Ontain the list of companies in our universe
calendar = load_and_get_table_df(sql_context, "tfm_uoc_analysis", "quarterly_trading_dates"). \
    filter(col("calendar") == "BMF").select(col("first_date"), col("last_date")). \
    filter((col("last_date") >= start_analysis_date) & (col("last_date") <= end_analysis_date))
calendar.cache()
calendar.show()

DataFrame[first_date: date, last_date: date]

+----------+----------+
|first_date| last_date|
+----------+----------+
|2005-01-03|2005-03-31|
|2005-04-01|2005-06-30|
|2005-07-01|2005-09-30|
|2005-10-03|2005-12-29|
|2006-01-02|2006-03-31|
|2006-04-03|2006-06-30|
|2006-07-03|2006-09-29|
|2006-10-02|2006-12-29|
|2007-01-02|2007-03-30|
|2007-04-02|2007-06-29|
|2007-07-02|2007-09-28|
|2007-10-01|2007-12-28|
|2008-01-02|2008-03-31|
|2008-04-01|2008-06-30|
|2008-07-01|2008-09-30|
|2008-10-01|2008-12-30|
|2009-01-02|2009-03-31|
|2009-04-01|2009-06-30|
|2009-07-01|2009-09-30|
|2009-10-01|2009-12-30|
+----------+----------+
only showing top 20 rows

time: 5.81 s


## Loading the data from sources

We are going to load the following data:

1. Fundamentals.

    1.1. Account 1 – Total Assets (total_assets)
    
    1.2. Account 1.89.03 - Total Shares (total_shares). Used to compute marketcap.
    
    1.3. Account 2.03 - Shareholders' equity (book_equity). Used to compute Book to Market - BM.
    
    
2. Technical.

    1.1. Stock Adjusted Price (stock_price)
    
    1.2. Stock Traded Volume (volume_traded). Used to compute company liquidity.


With the previous information we will need to compute the following factors:

1. Marketcap: $mkt\_cap = total\_shares * stock\_price$
2. Book to Market. $BM = \frac{book\_equity}{mkt\_cap}$

We will use 10 years of data, from 2005-01-01 to 2015-12-31, and we will use monthly prices of the stocks in our univers. We computed our Universe using the `Universe Selection.ipynb` file. 

In [7]:
# Ontain the list of companies in our universe
universe = load_and_get_table_df(sql_context, "tfm_uoc_analysis", "portfolio_universe")
universe.cache()

# Build a matrix with all the trading dates between the period of analysis 
# for each ticker/symbol in the database (and ccvm - company id, and its type of security)
security_tickers = load_and_get_table_df(sql_context, "tfm_uoc_analysis", "security_prices").groupBy(
    col("ccvm"), col("ticker"), col("type")).count().select("ccvm", "ticker", "type"). \
    crossJoin(calendar.select(col("last_date").alias("date"))). \
    orderBy(["date", "ccvm", "type", "ticker"], ascending=[True, True, True, True])

# Obtain the prices history for all the stocks in bovespa, including
# the IBOV index and the Brazilian 10Year Bond and complete the gaps
# (missing trading date) doing a cross join with the expected trading
# dates by stock
security_prices = load_and_get_table_df(sql_context, "tfm_uoc_analysis", "security_prices"). \
    filter((col("date") >= start_analysis_date) & (col("date") <= end_analysis_date)). \
    join(security_tickers, ["date", "ccvm", "ticker", "type"], how="right"). \
    orderBy(["date", "ccvm", "ticker"], ascending=[True, True, True])

# Getting the stock prices for all the companies in Bovespa
stock_prices = security_prices.filter(col("type") == "EQUITY")

# Getting the prices for all the equity stocks in our universe
universe_stock_prices = load_and_get_table_df(sql_context, "tfm_uoc_analysis", "security_prices").join(
    universe.select(col("ccvm")),
    "ccvm", how='right')

# Get the index of reference that represents the market rate (Bovespa)
market_index_prices = security_prices.filter((col("type") == "INDEX") & (col("ticker") == "^BVSP") )

# Get the risk free rate as the Brazilian 10 Year Bond Yield
risk_free_rate_prices = security_prices.filter((col("type") == "DEBT") & (col("ticker") == "GEBR10Y") )

DataFrame[ccvm: string, ticker: string, EBIT: decimal(38,18), EarningsYield: double, ROC: decimal(38,18), astodate: date, cash: decimal(38,18), current_assets: decimal(38,18), current_liabilities: decimal(38,18), current_non_cash_assets: decimal(38,18), dividend: double, excess_cash: decimal(38,18), fixed_liabilities: decimal(38,18), liquidity120days: double, marketcap: double, price_share: float, quality_index: double, short_term_investments: decimal(38,18), solr_query: string, stock: decimal(38,18), total_cash: decimal(38,18), total_debt: decimal(38,18), total_shares: decimal(38,18)]

time: 373 ms


## We compute the stock returns

We compute the log returns of the universe stock prices, the market index, and the risk free rate.

Not we only compute the returns, but we also complete the missing returns applying the fast forward fill approach.

In [8]:
from financials import get_security_returns, group_by_period
from spark import fillna

stock_returns_Q = group_by_period(
    stock_prices,
    columns=["close", "high", "low", "open", "adjclose", "volume"],
    columns_aliases=["close", "high", "low", "open", "adjclose", "volume"],
    columns_agg=[last, last, last, last, last, last, last],
    date_field="date",
    group_columns=["ccvm", "ticker", "type"],
    frequency="quarterly"
)

stock_returns_Q = get_security_returns(stock_returns_Q, ct_price_field="adjclose")


# Fill nulls with the average value of the previous returns
stock_returns_Q_filled = fillna(stock_returns_Q, 
       partition_keys=["ccvm", "ticker", "type"], 
       order_field="date", 
       field_to_fill="adjclose_returns",
       fill_function=last)
stock_returns_Q_filled.cache()

DataFrame[ccvm: string, ticker: string, type: string, year: int, quarter: int, close: float, high: float, low: float, open: float, adjclose: float, volume: float, date: date, adjclose_returns: double, adjclose_returns_filled: double]

time: 515 ms


We calculate also the returns for the index of reference (Bovespa). That's the market's premium.

In [9]:
market_index_returns_Q = group_by_period(
    market_index_prices,
    columns=["close", "high", "low", "open", "adjclose", "volume"],
    columns_aliases=["close", "high", "low", "open", "adjclose", "volume"],
    columns_agg=[last, last, last, last, last, last, last],
    date_field="date",
    group_columns=["ccvm", "ticker", "type"],
    frequency="quarterly"
)

market_index_returns_Q = get_security_returns(market_index_returns_Q)


# Fill nulls with the average value of the previous returns
market_index_returns_Q_filled = fillna(market_index_returns_Q, 
       partition_keys=["ccvm", "ticker", "type"], 
       order_field="date", 
       field_to_fill="close_returns",
       fill_function=avg).select("ccvm", "ticker", "type", "date", "close_returns", "close_returns_filled")
market_index_returns_Q_filled.cache()

DataFrame[ccvm: string, ticker: string, type: string, date: date, close_returns: double, close_returns_filled: double]

time: 326 ms


And finally we compute the returns for the risk-free rate too. In our case the Brazilian 10-Year Bond Yield.

In [10]:
# Fill nulls with the average value of the previous returns
risk_free_rate_prices_filled = fillna(risk_free_rate_prices, 
       partition_keys=["ccvm", "ticker", "type"], 
       order_field="date", 
       field_to_fill="close",
       fill_function=avg).select(
    "ccvm", "ticker", "type", "date", "high", "low", "open", "close", "adjclose", "volume", "close_filled")

risk_free_rate_returns_Q = group_by_period(
    risk_free_rate_prices_filled,
    columns=["close", "high", "low", "open", "adjclose", "close_filled", "volume"],
    columns_aliases=["close", "high", "low", "open", "adjclose", "close_filled", "volume"],
    columns_agg=[last, last, last, last, last, last, last],
    date_field="date",
    group_columns=["ccvm", "ticker", "type"],
    frequency="quarterly"
)

risk_free_rate_returns_Q = get_security_returns(risk_free_rate_returns_Q, ct_price_field="close_filled")


# Fill nulls with the average value of the previous returns
risk_free_rate_returns_Q_filled = fillna(risk_free_rate_returns_Q, 
       partition_keys=["ccvm", "ticker", "type"], 
       order_field="date", 
       field_to_fill="close_filled_returns",
       fill_function=avg).select(
    "ccvm", "ticker", "type", "date", "close_filled_returns", "close_filled_returns_filled")
risk_free_rate_returns_Q_filled.cache()

DataFrame[ccvm: string, ticker: string, type: string, date: date, close_filled_returns: double, close_filled_returns_filled: double]

time: 377 ms


In [11]:
# We compute the log returns for the stocks that are part only of our
# universe of investment
universe_stock_prices = universe_stock_prices. \
    filter((col("date") >= start_analysis_date) & (col("date") <= end_analysis_date)). \
    orderBy(["date", "ccvm", "ticker"], ascending=[True, True, True])

universe_stock_returns_Q = group_by_period(
    universe_stock_prices,
    columns=["close", "high", "low", "open", "adjclose", "volume"],
    columns_aliases=["close", "high", "low", "open", "adjclose", "volume"],
    columns_agg=[last, last, last, last, last, last, last],
    date_field="date",
    group_columns=["ccvm", "ticker", "type"],
    frequency="quarterly"
)

universe_stock_returns_Q = get_security_returns(universe_stock_returns_Q, ct_price_field="adjclose")

universe_stock_returns_Q_filled = fillna(universe_stock_returns_Q, 
       partition_keys=["ccvm", "ticker", "type"], 
       order_field="date", 
       field_to_fill="adjclose_returns",
       fill_function=last)
universe_stock_returns_Q_filled.cache()

universe_stock_returns_Q_filled.show()

DataFrame[ccvm: string, ticker: string, type: string, year: int, quarter: int, close: float, high: float, low: float, open: float, adjclose: float, volume: float, date: date, adjclose_returns: double, adjclose_returns_filled: double]

+-----+------+------+----+-------+-------+-------+-------+-------+----------+------------+----------+----------------+-----------------------+
| ccvm|ticker|  type|year|quarter|  close|   high|    low|   open|  adjclose|      volume|      date|adjclose_returns|adjclose_returns_filled|
+-----+------+------+----+-------+-------+-------+-------+-------+----------+------------+----------+----------------+-----------------------+
| 1023| BBAS3|EQUITY|2005|      1|9.86667|9.86667|9.63333|9.63333|  5.594592|   2378700.0|2005-03-31|            null|                   null|
|11312| OIBR3|EQUITY|2005|      1|100.376|100.376|100.302|100.376| 97.147125|         1.0|2005-03-31|            null|                   null|
|11312| OIBR4|EQUITY|2005|      1|  108.5|  110.5|  107.0|  110.5| 104.87875|      2885.0|2005-03-31|            null|                   null|
|14320| USIM3|EQUITY|2005|      1|   11.4|   11.4|11.1333|11.1333| 10.508494|      9900.0|2005-03-31|            null|                   null|

## Compute now the $f_m$ factor

We can compute now the $f_m$. This factor is the first of the Fama and French factors. We compute the factor using the following formula:

$$f_m = market\_index\_returns - risk\_free\_rate\_returns $$

In [12]:
# We drop any date with null on the market return or on the risk free return.
market_factor = market_index_returns_Q_filled.select(
    "date", col("close_returns_filled").alias("market_returns")).join(
    risk_free_rate_returns_Q_filled.select("date", col("close_filled_returns_filled").alias("riskfree_returns")),
    "date", how="left"). \
    withColumn("market_factor", col("market_returns") - col("riskfree_returns")). \
    orderBy("date", ascending=[True])
market_factor.cache()

DataFrame[date: date, market_returns: double, riskfree_returns: double, market_factor: double]

time: 58.5 ms


In [13]:
import pandas as pd
market_factor_pd = market_factor.toPandas()[1:] # First is non (no returns)
market_factor_pd['date'] = pd.to_datetime(market_factor_pd['date'], format='%Y-%m-%d')
market_factor_pd

Unnamed: 0,date,market_returns,riskfree_returns,market_factor
1,2005-06-30,-0.058622,,
2,2005-09-30,0.260788,,
3,2005-12-31,0.059271,,
4,2006-03-31,0.134385,,
5,2006-06-30,-0.034807,,
6,2006-09-30,-0.004968,,
7,2006-12-31,0.059341,,
8,2007-03-31,0.059341,,
9,2007-06-30,0.187469,,
10,2007-09-30,0.111652,,


time: 28.5 s


### Visualization of the returns

We visualize the accummulated returns of the market index and the risk-free rate.

We also visualize the accummulated returns for the computed market factor.

In [14]:
import altair as alt

chart_market_factor_pd = pd.melt(market_factor_pd, 
        id_vars=['date'], 
        value_vars=['market_returns', 'riskfree_returns'], 
        var_name='rate_type', value_name='returns')

chart_market_factor_pd["cumsum_returns"] = \
    chart_market_factor_pd.groupby(['rate_type'])['returns'].cumsum()

returns_line = alt.Chart(chart_market_factor_pd). \
    mark_line(color="green").encode(
    x=alt.X('date', axis=alt.Axis(title='Quarter')),
    y=alt.Y('cumsum_returns', axis=alt.Axis(title='Returns', format='.0%'), 
            scale=alt.Scale(domain=(-0.6, 1.1))),
    color=alt.Color('rate_type', legend=alt.Legend(title="Type of Rates")))

returns_line

time: 1.25 s


We can see that the returns of the market are high at the end of the period (a final return of 70%). But most of these returns comes from the period that comes from 2005 to 2008, where Brazil was living its best economic period ever. In 2008 the world financial crisis has its impact on the Brazilian market, reducing considerably the returns obtained in the market.

In the following image we analyze the returns obtained during the period that goes from 2008 to 2015. We can see how the final cumulative sum of the returns are negative (aprox. -20%). Since 2008 Brazil has been faced multiple external and internal events that impacted negatively on its public market of stocks. Some of the events are: the slowdown of the Chinese economy and fall in commodity prices that impactes the exportation of commodities, especially iron ore, petroleum, and soy, a sector in which the Brazilian economy depends on, [mensalao scandal](https://en.wikipedia.org/wiki/Mensal%C3%A3o_scandal), [Brazilian aviation crisis](https://en.wikipedia.org/wiki/2006%E2%80%9307_Brazilian_aviation_crisis), [TAM Linhas Aereas flight crash](https://en.wikipedia.org/wiki/TAM_Linhas_A%C3%A9reas_Flight_3054), [operation car wash](https://en.wikipedia.org/wiki/Operation_Car_Wash).

In [15]:
chart_market_factor_pd = pd.melt(market_factor_pd, 
        id_vars=['date'], 
        value_vars=['market_returns', 'riskfree_returns'], 
        var_name='rate_type', value_name='returns')

chart_market_factor_pd = chart_market_factor_pd[chart_market_factor_pd["date"] >= pd.to_datetime("2008-03-31")]

chart_market_factor_pd["cumsum_returns"] = \
    chart_market_factor_pd.groupby(['rate_type'])['returns'].cumsum()

returns_line = alt.Chart(chart_market_factor_pd). \
    mark_line(color="green").encode(
    x=alt.X('date', axis=alt.Axis(title='Quarter')),
    y=alt.Y('cumsum_returns', axis=alt.Axis(title='Returns', format='.0%'), 
            scale=alt.Scale(domain=(-0.6, 1.1))),
    color=alt.Color('rate_type', legend=alt.Legend(title="Type of Rates")))

returns_line

time: 20.9 ms


In [16]:
market_factor_pd["cumsum_returns"] = \
    market_factor_pd['market_factor'].cumsum()

market_factor_line = alt.Chart(market_factor_pd).mark_line(color="orange").encode(
    x=alt.X('date', axis=alt.Axis(title='Quarter')),
    y=alt.Y('cumsum_returns', axis=alt.Axis(title='Market Factor Returns', format='.0%'), 
            scale=alt.Scale(domain=(-0.6, 1.1))))

market_factor_line

time: 15.9 ms


## Generation of theoretical portfolios (SMB and HML factors)

In order to generate the size factor and value factor of Fama and French we will need to first:

1. We need to sort our stocks into long and short portfolios. We use the company size (a.k.a. `marketcap`) criteria to sort the companies from lowest to highest. The companies at the 90th percentile will be tagged as B (BIG) and the ones at the 10th percentile will be tagged as S (SMALL)

    
2. On each of the previous two ranks of companies we need to sort its stocks into long and short portfolios but now using the `Book to Market` ratio (BE/ME ratio), from the lowest to the highest. Stocks with negative ratios must be removed from the computation. Then, we need to tag the 30th percentile as High-value (V - VALUE), the 70th percentile as Low-value (G - GROWTH), and the companies in between as Medium-value (N - NEUTRAL)

    
3. Now we create 6 portfolios using the combination of value groups, which are `SV`, `SN`, `SG`, `BV`, `BN`, `BG`. An we compute the quarterly return of each portfolio as the average return of the stocks in the portfolio.

Formula to compute the Book to Market ratio: $\frac{shareholders\_equity}{marketcap}$

- Accounts - 2.03 - Shareholders Equity

We already computed the marketcap of the companies in our Universe (`Universe Selection` notebook), we can relay on it.

In [17]:
# Accessing to the fundamental data of the companies
fundamentals = load_and_get_table_df(sql_context, "tfm_uoc", "bovespa_account")

# Compute the Market to Book
# Account 2.03 - Shareholders Equity
factors_df = fundamentals.filter(
    col("number").isin(["2.03"]))
factors_df = factors_df.withColumn(
    "factor_name", when(factors_df.number == "2.03", "shareholders_equity").otherwise("UNKNOWN"))

factors_df = factors_df\
    .select(col("ccvm").alias("asset"), 
            col("period").alias("astodate"),
            col("factor_name"), 
            col("amount").alias("amount"))

factors_df = factors_df.groupby(col("asset"), col("astodate"))\
    .pivot("factor_name").sum("amount"). \
    filter((col("astodate") >= start_analysis_date) & (col("astodate") <= end_analysis_date)). \
    orderBy("asset", "astodate", ascending=[True, True])
factors_df.cache()

DataFrame[asset: string, astodate: date, shareholders_equity: decimal(38,18)]

time: 50.6 s


### Creation of the portfolios

We need to build the following portfolios

SV = Small + Value

SN = Small + Neutral

SG = Small + Growth

BV = Big + Value

BN = Big + Neutral

BG = Big + Growth

Computing the market cap and the book to market ratios. We use this ratios to create the 6 theoretical portfolios.

In [18]:
# Accessing to the fundamental data of the companies
marketcap_df = universe.select(col("ccvm").alias("asset"), "astodate", "ticker", "marketcap")

book_to_market_df  = factors_df.join(marketcap_df, ["asset", "astodate"], how="right"). \
    withColumn(
        "book_to_market", 
        (factors_df.shareholders_equity / marketcap_df.marketcap)). \
        filter("book_to_market>0"). \
        orderBy("asset", "astodate", ascending=[True, True])

time: 182 ms


Using the ratios we proceed to the creation of the portfolios, creating the groups of Small/Big and Growth/Value.

In [19]:
max_assets = book_to_market_df.count()
max_value_assets = int(max_assets / 2)

value_ranges = int(max_value_assets / 3)

size_window = Window.orderBy("marketcap")
rank_window = Window.partitionBy("size").orderBy("book_to_market")

book_to_market_df = book_to_market_df. \
    orderBy(["marketcap", "book_to_market"], ascending=[True, True]). \
    withColumn("size_rank", rank().over(size_window)). \
    withColumn("size", when(col("size_rank") <= max_value_assets, "small").otherwise('big')). \
    withColumn("value_rank", rank().over(rank_window)). \
    withColumn("value", when(col("value_rank") <= value_ranges, "value").otherwise(
            when(col("value_rank") > (2 * value_ranges), "growth").otherwise("neutral")))

book_to_market_df.cache()

DataFrame[asset: string, astodate: date, shareholders_equity: decimal(38,18), ticker: string, marketcap: double, book_to_market: double, size_rank: int, size: string, value_rank: int, value: string]

time: 1min 10s


In [20]:
book_to_market_df.select(col("size").alias("Size Groups"), col("value").alias("Value Groups"), col("ticker")).toPandas()

Unnamed: 0,Size Groups,Value Groups,ticker
0,big,value,ODPV3
1,big,value,ABEV3
2,big,value,NATU3
3,big,value,CIEL3
4,big,value,TOTS3
5,big,neutral,BRFS3
6,big,neutral,UGPA3
7,big,neutral,BBSE3
8,big,neutral,ITSA4
9,big,neutral,VALE3


time: 1.33 s


In [21]:
SV = book_to_market_df.filter((col("size") == 'small') & (col("value") == "value")). \
    select(col("asset").alias("ccvm"), col("ticker"))
SN = book_to_market_df.filter((col("size") == 'small') & (col("value") == "neutral")). \
    select(col("asset").alias("ccvm"), col("ticker"))
SG = book_to_market_df.filter((col("size") == 'small') & (col("value") == "growth")). \
    select(col("asset").alias("ccvm"), col("ticker"))

BV = book_to_market_df.filter((col("size") == 'big') & (col("value") == "value")). \
    select(col("asset").alias("ccvm"), col("ticker"))
BN = book_to_market_df.filter((col("size") == 'big') & (col("value") == "neutral")). \
    select(col("asset").alias("ccvm"), col("ticker"))
BG = book_to_market_df.filter((col("size") == 'big') & (col("value") == "growth")). \
    select(col("asset").alias("ccvm"), col("ticker"))

time: 137 ms


In [22]:
SV.show()

+-----+------+
| ccvm|ticker|
+-----+------+
|20788| MRFG3|
|19453| ECOR3|
|22497| QUAL3|
|20028| VLID3|
|19739| RENT3|
+-----+------+

time: 56.3 ms


### Compute the return of each theoretical portfolio

A simple average on the asset returns of each portfolio

In [23]:
SV_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(SV, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("sv_r")). \
        orderBy("date", ascending=True)

SN_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(SN, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("sn_r")). \
        orderBy("date", ascending=True)

SG_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(SG, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("sg_r")). \
        orderBy("date", ascending=True)

time: 78.1 ms


In [24]:
BV_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(BV, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("bv_r")). \
        orderBy("date", ascending=True)

BN_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(BN, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("bn_r")). \
        orderBy("date", ascending=True)

BG_return = universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")). \
        join(BG, ["ccvm", "ticker"], how="right"). \
        groupBy("date"). \
        agg(avg("adjclose_returns").alias("bg_r")). \
        orderBy("date", ascending=True)

time: 73.2 ms


In [25]:
theoretical_portfolios_r = SV_return.join(SN_return, "date"). \
    join(SG_return, "date"). \
    join(BV_return, "date"). \
    join(BN_return, "date"). \
    join(BG_return, "date"). \
    join(market_factor, "date")
theoretical_portfolios_r.cache()

DataFrame[date: date, sv_r: double, sn_r: double, sg_r: double, bv_r: double, bn_r: double, bg_r: double, market_returns: double, riskfree_returns: double, market_factor: double]

time: 379 ms


In [26]:
theoretical_portfolios_r.toPandas()

Unnamed: 0,date,sv_r,sn_r,sg_r,bv_r,bn_r,bg_r,market_returns,riskfree_returns,market_factor
0,2005-06-30,,,-0.126334,-0.187325,-0.086096,-0.112968,-0.058622,,
1,2005-09-30,0.468261,,0.526293,0.152539,0.366289,0.323284,0.260788,,
2,2005-12-31,0.547562,0.053462,0.181691,0.139733,0.050434,0.000411,0.059271,,
3,2006-03-31,0.285368,0.102226,0.23113,0.16779,0.143068,0.268274,0.134385,,
4,2006-06-30,0.086956,-0.132163,-0.044824,-0.09532,-0.084866,-0.030743,-0.034807,,
5,2006-09-30,0.184526,0.060687,-0.065384,0.205096,-0.010669,-0.041945,-0.004968,,
6,2006-12-31,0.2004,0.187477,0.271827,0.117162,0.305986,0.192211,0.059341,,
7,2007-03-31,0.03478,-0.084848,0.097228,0.067567,0.053818,0.116163,0.059341,,
8,2007-06-30,0.120212,0.285103,0.249446,0.239649,0.171722,0.159136,0.187469,,
9,2007-09-30,-0.059879,-0.180006,0.366271,-0.068542,0.191256,0.129445,0.111652,,


time: 30.9 s


### Compute the SMB and HML

$$SMB = \frac{1}{3}((r_{s,v} + r_{s,n} + r_{s,g}) - (r_{b,v} + r_{b,n} + r_{b,g}))$$

$$HML = \frac{1}{2}((r_{s,v} + r_{b,v}) - (r_{s,g} + r_{b,g}))$$

In [27]:
ff_factors = theoretical_portfolios_r.withColumn(
    "SMB", 
    ((col("sv_r") + col("sn_r") + col("sg_r")) - 
     (col("bv_r") + col("bn_r") + col("bg_r")) / 3)) . \
    withColumn(
    "HML", 
    ((col("sv_r") + col("bv_r")) - 
     (col("sg_r") + col("bg_r")) / 2)). \
    select("date", "market_factor", "SMB", "HML")
ff_factors.cache()

DataFrame[date: date, market_factor: double, SMB: double, HML: double]

time: 166 ms


In [28]:
ff_factors_pdf = ff_factors.toPandas()
ff_factors_pdf = ff_factors_pdf.set_index("date")
ff_factors_pdf

Unnamed: 0_level_0,market_factor,SMB,HML
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2005-06-30,,,
2005-09-30,,,0.196011
2005-12-31,,0.719189,0.596243
2006-03-31,,0.425681,0.203456
2006-06-30,,-0.019722,0.029419
2006-09-30,,0.129001,0.443287
2006-12-31,,0.454584,0.085543
2007-03-31,,-0.032023,-0.004348
2007-06-30,,0.464592,0.15557
2007-09-30,,0.042334,-0.37628


time: 769 ms


In [29]:
import altair as alt

chart_ff_factors_pdf = pd.melt(ff_factors_pdf.reset_index(), 
        id_vars=['date'], 
        value_vars=['market_factor', 'SMB', 'HML'], 
        var_name='factor', value_name='factor_returns')

chart_ff_factors_pdf['date'] = pd.to_datetime(
    chart_ff_factors_pdf['date'], format='%Y-%m-%d')

chart_ff_factors_pdf["cumsum_factor_returns"] = \
    chart_ff_factors_pdf.groupby(['factor'])['factor_returns'].cumsum()

factors_returns_line = alt.Chart(chart_ff_factors_pdf). \
    mark_line().encode(
    x=alt.X('date', axis=alt.Axis(title='Quarter')),
    y=alt.Y('cumsum_factor_returns', axis=alt.Axis(title='Factor Returns', format='.0%')),
    color=alt.Color('factor', legend=alt.Legend(title="Factors")))

factors_returns_line

time: 25.1 ms


We only can use data from `2018-03-31` as we do not have more data points before that date for the risk-free factor.

In [30]:
ff_factors = ff_factors.na.drop()
ff_factors.cache()

DataFrame[date: date, market_factor: double, SMB: double, HML: double]

time: 20.1 ms


In [31]:
from db import sync_table

sync_table(ff_factors, "tfm_uoc_dse", "tfm_uoc_analysis", "ff_factors", ["date"])

Closing connections
time: 140 ms




In [32]:
ff_factors.write\
    .format("org.apache.spark.sql.cassandra")\
    .options(table="ff_factors", keyspace="tfm_uoc_analysis")\
    .option("confirm.truncate","true")\
    .mode("overwrite")\
    .partitionBy("astodate")\
    .save()

time: 1.19 s


## Compute the factor exposure of each asset to the risk factors

Now that we have all the Fama and French factors, we can proceed to identify the exposure that each asset of our portfolio has to each of these factors.

To do so, we need to collect the quaterly stock price between 2005 and 2015 and train a LinearRegression on the asset time series to extrar the coeficients (beta) of each factor.

In [33]:
# We assign the same FF factors to each asset in our universe
# for each of the trading dates (quarters)
ff_factors_by_asset = ff_factors.join(universe_stock_returns_Q_filled. \
        select("ccvm", "ticker", "date", 
               col("adjclose_returns_filled").alias("adjclose_returns")), "date", how="left"). \
    orderBy("date", ascending=True).na.drop()
ff_factors_by_asset.cache()

DataFrame[date: date, market_factor: double, SMB: double, HML: double, ccvm: string, ticker: string, adjclose_returns: double]

time: 72.9 ms


In [34]:
factors_pdf = ff_factors_by_asset.toPandas()

time: 1.17 s


In [35]:
factors_pdf

Unnamed: 0,date,market_factor,SMB,HML,ccvm,ticker,adjclose_returns
0,2008-03-31,-0.045675,-0.027065,-0.004486,906,BBDC3,-0.169012
1,2008-03-31,-0.045675,-0.027065,-0.004486,8656,GOAU4,0.044154
2,2008-03-31,-0.045675,-0.027065,-0.004486,8656,GOAU3,0.046366
3,2008-03-31,-0.045675,-0.027065,-0.004486,7617,ITSA4,-0.202269
4,2008-03-31,-0.045675,-0.027065,-0.004486,7617,ITSA3,0.006687
...,...,...,...,...,...,...,...
1176,2015-12-31,-0.103862,-0.249679,0.173015,14320,USIM5,-0.537313
1177,2015-12-31,-0.103862,-0.249679,0.173015,14320,USIM3,-0.513317
1178,2015-12-31,-0.103862,-0.249679,0.173015,11312,OIBR4,-0.301075
1179,2015-12-31,-0.103862,-0.249679,0.173015,11312,OIBR3,-0.220780


time: 14.1 ms


We group the data by tickers and we train a LinearRegression for each ticker. For the linear regression we use the stock returns of the stock as a dependent variable and the 3-factor returns as independent variables.

We use the coefficients of the trained model as the exposure of the stock to each of the factors. We also get the p_value for each coefficient to get the level of significance of the factor itself.

In [36]:
import statsmodels.api as sm

coefficients = []
for name, group in factors_pdf.groupby(["ticker"]):
    y = group["adjclose_returns"]
    X = group[["market_factor", "SMB", "HML"]]
    
    # We don't get the last row, as is the one we will
    # use to trade
    y = y.iloc[:-1]
    X = X.iloc[:-1,:]
    X = sm.add_constant(X)
    model = sm.OLS(y, X).fit()
    factor_exposures = [model.params[i] for i in ["market_factor", "SMB", "HML"]]
    factor_exposures_significance = [model.pvalues[i] for i in ["market_factor", "SMB", "HML"]]
    coefficients.append(pd.DataFrame([[name] + factor_exposures + factor_exposures_significance], 
                                     columns=["ticker", "market_factor", "SMB", "HML"] + 
                                     ["mf_pvalue", "SMB_pvalue", "HML_pvalue"]))

  return ptp(axis=axis, out=out, **kwargs)


time: 1.95 s


Now we have a list of factor exposures by stock, we need to merge them into one to create the matrix of factor exposures

In [37]:
from functools import reduce
df_merged = reduce(
    lambda left,right: pd.merge(left, right, how='outer'), coefficients)

df_merged

Unnamed: 0,ticker,market_factor,SMB,HML,mf_pvalue,SMB_pvalue,HML_pvalue
0,ABEV3,0.198205,0.103727,0.065893,0.435649,0.3045144,0.740251
1,BBAS3,0.653457,0.366731,-0.266449,0.017448,0.001287372,0.1996497
2,BBDC3,0.444186,0.195025,-0.160041,0.020208,0.01078285,0.2674987
3,BBDC4,0.33218,0.097985,-0.292947,0.04975,0.1381121,0.03062666
4,BBSE3,0.16666,0.811814,0.189612,0.56001,0.01697756,0.3853696
5,BEEF3,-0.226517,0.635952,0.446286,0.6452,0.002691383,0.2529846
6,BRAP3,0.522709,0.269768,-0.073226,0.024308,0.004354265,0.6737899
7,BRAP4,0.229713,0.364689,-0.378919,0.38485,0.001456508,0.07451853
8,BRFS3,0.044936,0.19044,0.016253,0.839002,0.03675281,0.9254335
9,BRML3,0.512105,0.185504,0.110014,0.143532,0.1791034,0.6835367


time: 241 ms


In [38]:
df_merged.to_csv('factor_exposures.csv')

time: 10.6 ms


In [39]:
df_merged[["ticker", "market_factor", "SMB", "HML"]]

Unnamed: 0,ticker,market_factor,SMB,HML
0,ABEV3,0.198205,0.103727,0.065893
1,BBAS3,0.653457,0.366731,-0.266449
2,BBDC3,0.444186,0.195025,-0.160041
3,BBDC4,0.33218,0.097985,-0.292947
4,BBSE3,0.16666,0.811814,0.189612
5,BEEF3,-0.226517,0.635952,0.446286
6,BRAP3,0.522709,0.269768,-0.073226
7,BRAP4,0.229713,0.364689,-0.378919
8,BRFS3,0.044936,0.19044,0.016253
9,BRML3,0.512105,0.185504,0.110014


time: 10.2 ms


In [40]:
from pyspark.sql.types import *
from db import sync_table

time: 710 µs


In [41]:
factor_exposures_df = pd.read_csv('factor_exposures.csv')
factor_exposures_df = factor_exposures_df[["ticker","market_factor", "SMB", "HML", "mf_pvalue", "SMB_pvalue", "HML_pvalue"]]

factor_exposures_schema = StructType([
    StructField("ticker", StringType(), True),
    StructField("market_factor", FloatType(), True),
    StructField("SMB", FloatType(), True),
    StructField("HML", FloatType(), True),
    StructField("mf_pvalue", FloatType(), True),
    StructField("SMB_pvalue", FloatType(), True),
    StructField("HML_pvalue", FloatType(), True)])   

factor_exposures_spark_df = sql_context.createDataFrame(
    factor_exposures_df,
    schema=factor_exposures_schema)

time: 60.7 ms


In [42]:
sync_table(factor_exposures_spark_df, "tfm_uoc_dse", "tfm_uoc_analysis", "ff_factor_exposures", ["ticker"])

Closing connections
time: 36.8 ms




In [43]:
factor_exposures_spark_df.write\
    .format("org.apache.spark.sql.cassandra")\
    .options(table="ff_factor_exposures", keyspace="tfm_uoc_analysis")\
    .option("confirm.truncate","true")\
    .mode("overwrite")\
    .partitionBy("astodate")\
    .save()

time: 1.17 s


In [44]:
sc.stop()

time: 970 ms
