In [None]:
pip install -q yfinance yahooquery setuptools pandas-datareader plotly

In [None]:
### Setting up libraries
import numpy as np
import pandas as pd
import yfinance as yf
import yahooquery as yq
import requests
import plotly.express as px
from datetime import datetime, timedelta
from tqdm.notebook import tqdm
import time
from scipy.stats import spearmanr
import statsmodels
import statsmodels.api as sm
from statsmodels.tools.tools import pinv_extended
from google.colab import  drive
import warnings


warnings.filterwarnings("ignore")

In [None]:
### Fetch every stock listed by the Nasdaq screener today
url = 'https://api.nasdaq.com/api/screener/stocks?tableonly=true&limit=25&offset=0&download=true'
headers = {
    'Accept-Language': 'en-US,en;q=0.9',
    'Accept-Encoding': 'gzip, deflate, br',
    'User-Agent': 'Java-http-client/'
}

# Fail fast on a hung connection or an HTTP error instead of blocking the
# kernel or failing later on an unexpected JSON payload.
response = requests.get(url, headers=headers, timeout=30)
response.raise_for_status()

### Data tidying - keep only the largest companies by market cap
screener = (
    pd.DataFrame(response.json()['data']['rows'])
    .assign(
        # Unparseable market caps become NaN and never pass the filter below.
        marketCap = lambda x: pd.to_numeric(x.marketCap, errors='coerce')
    )
)

# 89th percentile of market cap: only symbols strictly above it are kept.
cap_cutoff = screener['marketCap'].quantile(0.89)

# Symbols dropped from the universe.
# NOTE(review): presumably because these slash-style symbols do not resolve
# on Yahoo Finance - confirm.
excluded_symbols = {'BRK/A', 'BRK/B'}

# Filtering (rather than list.remove) does not raise when a symbol is absent.
assets = [
    symbol
    for symbol in (
        screener[screener['marketCap'] > cap_cutoff]
        .sort_values(by = ['marketCap'], ascending=False)
        ['symbol']
    )
    if symbol not in excluded_symbols
]

# Tickers of Global Indexes
factors = [
    '^VIX'    # Global Volatility Index
    ,'^IRX'   # US 3-month interest rate - BSHV39
    ,'^GSPC'  # S&P 500 - IVVB11
    ,'ACWX'   # MSCI - top world stocks by market cap, ex-USA
    ,'XEM.TO' # MSCI - emerging-market stocks by market cap - BEEM39
    ,'EMB'    # USD Emerging Markets Bond
    ,'GD=F'   # GSCI commodities - MATB11
    ,'GC=F'   # Gold

    ,'MTUM'   # Momentum factor
    ,'QUAL'   # Quality factor
    ,'SIZE'   # Size factor
    ,'VLUE'   # Value factor
    ,'USMV'   # Low-volatility factor
]

# Union of the global factors and the screened stocks (factors first)
assets = factors + assets

In [None]:
# Date range: from 365*11 days ago up to today
today = datetime.today()
start = (today - timedelta(days=365*11)).strftime('%Y-%m-%d')
end = today.strftime('%Y-%m-%d')

# Download daily prices.
# auto_adjust=False is passed explicitly: recent yfinance releases default to
# auto_adjust=True, which removes the 'Adj Close' column selected below and
# would make this cell fail with a KeyError.
raw_prices = yf.download(
    assets, start = start, end = end, auto_adjust=False, threads=True, timeout=35
)

# Keep one adjusted-close column per ticker, weekdays only, and drop tickers
# for which no price came back at all.
data = (
    raw_prices
    .loc[:,('Adj Close', slice(None))]
    .droplevel(level=0, axis=1)
    [lambda x: x.index.dayofweek < 5]
    .dropna(axis=1,how='all')
)

# Daily log returns: log(1 + simple return). Gaps are not forward-filled.
final_data = np.log1p(data.pct_change(fill_method=None))

[*********************100%***********************]  734 of 734 completed


In [None]:
# Factors (regressors), in the column order of the design matrix
factors = [
    '^VIX'    # Global Volatility Index
    ,'^IRX'   # US 3-month interest rate - BSHV39
    ,'^GSPC'  # S&P 500 - IVVB11
    ,'ACWX'   # MSCI - top world stocks by market cap, ex-USA
    ,'XEM.TO' # MSCI - emerging-market stocks by market cap - BEEM39
    ,'EMB'    # USD Emerging Markets Bond
    ,'GD=F'   # GSCI commodities - MATB11
    ,'GC=F'   # Gold

    ,'MTUM'   # Momentum factor
    ,'QUAL'   # Quality factor
    ,'SIZE'   # Size factor
    ,'VLUE'   # Value factor
    ,'USMV'   # Low-volatility factor
]

# Output column -> factor ticker whose coefficient it reports.
# Coefficients are looked up by ticker, not by position: 'EMB' is part of the
# regression but has no output column, so reading params[6..12] positionally
# shifted every label from 'cmmdt' onwards by one factor.
factor_labels = {
    'vix': '^VIX',
    'fed_3_y': '^IRX',
    's_p': '^GSPC',
    'msci_top_ex_us': 'ACWX',
    'msci_emg': 'XEM.TO',
    'cmmdt': 'GD=F',
    'gold': 'GC=F',
    'momnt': 'MTUM',
    'quali': 'QUAL',
    'size': 'SIZE',
    'vlue': 'VLUE',
    'lowvol': 'USMV',
}

# Friendly names for a few symbols; any other ticker is reported unchanged.
ticker_aliases = {
    'GSPC': 's_p',
    '^RUT': 'rsl_2000',
    'EWJ': 'top_jp',
    'GDF': 'cmmdt',
    'GCF': 'gld',
    'CL=F': 'oil',
    '000001.SS': 'sse_china',
    'IXIC': 'nsdq',
}

# Fix Assets List
assets = [item for item in final_data.columns if item not in ['EMB']]

# One record (alpha, betas and summary stats) per regressed asset
records = []

# Run a ridge-regularised linear regression per asset to get alpha and betas
for i in tqdm([x for x in assets if x not in factors]):

  # Returns of the asset being explained
  y = final_data[i].dropna()

  # Factor returns on the dates where the asset has a return and every
  # factor is available
  factor_returns = final_data.loc[final_data.index.isin(y.index), factors].dropna()

  # has_constant='add' guarantees the 'const' column exists even when a
  # factor column happens to be constant over the sample.
  X_sm = sm.add_constant(factor_returns, has_constant='add')
  y = y[y.index.isin(X_sm.index)]

  # Nothing to fit when the asset shares no date with the factors
  if y.empty:
    continue

  # Fit the model with a pure L2 (ridge) penalty
  results = sm.OLS(y, X_sm).fit_regularized(L1_wt=0, alpha=0.00025)

  # Wrap the regularised coefficients in an OLSResults object so that
  # goodness-of-fit statistics (R squared) can be read from it
  n_model = sm.OLS(y, X_sm)
  pinv_wexog,_ = pinv_extended(n_model.wexog)
  normalized_cov_params = np.dot(pinv_wexog, np.transpose(pinv_wexog))

  final = sm.regression.linear_model.OLSResults(
      n_model,
      results.params,
      normalized_cov_params
  )

  # Coefficients keyed by design-matrix column ('const' plus factor tickers)
  betas = dict(zip(X_sm.columns, results.params))

  # Build the output row with all coefficients
  record = {
      'ticker': ticker_aliases.get(i, i),
      'qtd_dias': len(y),
  }
  record.update(
      {label: betas[symbol].round(3) for label, symbol in factor_labels.items()}
  )
  record.update(
      {
       # Sum of log returns over the asset's whole history
       'return': final_data[i].sum().round(3),
       'vol': round(np.std(y), 5),
       'alpha': betas['const'].round(5),
       'r_score': final.rsquared.round(3),
      }
  )
  records.append(record)

# Build the frame once instead of concatenating inside the loop
data = pd.DataFrame(records)

  0%|          | 0/721 [00:00<?, ?it/s]

In [None]:
### Plot - does the past behavioural indicator (alpha) relate to the return?
var = 'alpha'

# Rows with both values present; 'ticker' is kept for the hover label
df_plot = data[['ticker', var, 'return']].dropna(subset=[var, 'return'])

# Rank correlation between alpha and return, shown in the chart title
corr, _ = spearmanr(df_plot[var], df_plot['return'])

fig = px.scatter(
    df_plot, x=var, y='return', hover_name='ticker',
    labels={
        var: 'Exposição ao Fator',
        'return': 'Retorno'
    },
    # The universe comes from the Nasdaq screener, so the title refers to
    # US-listed stocks rather than the Ibovespa.
    title=f'Relação Exposição ao Fator (Alpha) & Retorno das Ações dos EUA (Spearman = {corr:.2f})',
    trendline='ols',
    trendline_color_override = 'black',
    template='plotly_white'
).update_traces(
    marker_size=12,
    marker=dict(color='green'),
    opacity=0.4
).update_layout(
    font=dict(size=14), showlegend=False
)

fig.show()

In [None]:
### Factor-exposure filters
# Per-column 3rd percentile, median and 97th percentile of every numeric
# column, selected by label rather than by position.
resumo = (
    data.describe(percentiles=[.03, .97]).T
    .loc[:, ['3%', '50%', '97%']]
    .set_axis(['lower', 'mid', 'upper'], axis=1)
)
lower, mid, upper = resumo['lower'], resumo['mid'], resumo['upper']

# Columns kept only when inside their own [lower, upper] band (inclusive).
# 'quali' is bounded by its own percentiles (it was previously compared
# against the 'momnt' bounds).
two_sided = [
    'fed_3_y', 'msci_top_ex_us', 'gold', 'momnt',
    'quali', 'size', 'vlue', 'lowvol',
]

# At least the median number of observations
keep = data['qtd_dias'] >= mid.loc['qtd_dias']

# VIX exposure: inclusive lower bound, exclusive upper bound
keep &= (data['vix'] >= lower.loc['vix']) & (data['vix'] < upper.loc['vix'])

for col in two_sided:
    keep &= data[col].between(lower.loc[col], upper.loc[col])

# One-sided caps (emerging markets and volatility use 90% of the upper bound)
keep &= data['msci_emg'] <= upper.loc['msci_emg'] * 0.9
keep &= data['cmmdt'] <= upper.loc['cmmdt']
keep &= data['vol'] <= upper.loc['vol'] * 0.9

# At least the median return and a strictly positive alpha
keep &= data['return'] >= mid.loc['return']
keep &= data['alpha'] > 0

### Final filter - relative to the cross-section, ranked by alpha
final_data = (
    data[keep]
    .sort_values(by=['alpha'], ascending=False)
    .reset_index(drop=True)
)

final_data

Unnamed: 0,ticker,qtd_dias,vix,fed_3_y,s_p,msci_top_ex_us,msci_emg,cmmdt,gold,momnt,quali,size,vlue,lowvol,return,vol,alpha,r_score
0,EME,2597,-0.052,0.001,0.081,0.072,0.012,0.022,0.025,-0.014,0.045,0.084,0.112,0.144,2.664,0.01859,0.00063,0.382
1,LNG,2597,-0.050,0.005,0.057,0.077,0.016,0.040,0.203,-0.015,0.032,0.047,0.072,0.102,1.972,0.02191,0.00058,0.262
2,RSG,2597,-0.019,0.001,0.065,0.029,-0.000,0.015,-0.004,0.005,0.052,0.068,0.072,0.060,2.050,0.01159,0.00052,0.392
3,CQP,2597,-0.027,0.006,0.046,0.065,0.033,0.056,0.193,-0.001,0.027,0.036,0.073,0.077,1.605,0.02046,0.00050,0.209
4,HLT,2597,-0.050,0.002,0.080,0.077,0.047,0.037,0.011,-0.026,0.051,0.083,0.114,0.122,2.013,0.01801,0.00044,0.390
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,STLA,2597,-0.063,0.003,0.087,0.186,0.103,0.051,0.022,-0.016,0.041,0.079,0.128,0.170,1.232,0.02481,0.00010,0.364
62,HIG,2597,-0.027,0.004,0.087,0.071,0.022,0.025,0.027,-0.024,0.036,0.078,0.131,0.157,1.490,0.01750,0.00010,0.392
63,NUE,2597,-0.050,0.001,0.097,0.090,0.043,0.005,0.072,0.010,0.029,0.096,0.134,0.184,1.311,0.02162,0.00010,0.376
64,BIP,2597,-0.005,0.001,0.077,0.111,0.047,0.074,0.078,0.031,0.047,0.079,0.118,0.108,1.239,0.01705,0.00009,0.349


In [None]:
# Tickers that passed the factor filters
lista = final_data['ticker']

# One record (ticker + median historical dividend yield) per ticker
records = []

for ticker in tqdm(lista):
  ### Get the price and dividend history of the last 10 years
  ###########################################################
  yf_data = yq.Ticker(ticker)
  history = yf_data.history(period = '10y')

  # Small pause between requests
  time.sleep(0.2)

  # If no usable history came back, record a missing yield instead of
  # aborting the whole loop.
  if not isinstance(history, pd.DataFrame) or history.empty:
    records.append({'ticker': ticker, 'Div. Yield Med': np.nan})
    continue

  # Move the first index level into a column so the remaining index can be
  # parsed as dates
  asst_data = history.reset_index(0)
  asst_data['year'] = pd.to_datetime(asst_data.index.to_series(), errors='coerce', utc=True).dt.year

  if 'dividends' not in asst_data.columns:
    asst_data['dividends'] = 0

  ### Last close of each year
  year_end_close = asst_data.groupby('year')['close'].last()

  ### Dividends paid in each year (only years with at least one payment)
  paid = asst_data[asst_data.dividends != 0]
  yearly_dividends = paid.groupby('year')['dividends'].sum()

  ### Yearly yield; years without dividends are NaN and are ignored by the
  ### median
  yearly_yield = yearly_dividends / year_end_close

  hist_div = round(yearly_yield.median() * 100, 2)

  records.append({'ticker': ticker, 'Div. Yield Med': hist_div})

# Build the frame once; explicit columns keep the schema when no ticker ran
data = pd.DataFrame(records, columns=['ticker', 'Div. Yield Med'])

  0%|          | 0/66 [00:00<?, ?it/s]

In [None]:
# Attach the median dividend yield to every screened ticker, then keep only
# the rows whose yield is at least 2.
with_yield = final_data.merge(data, on='ticker', how='left')
final_us_data = with_yield[with_yield['Div. Yield Med'] >= 2]

final_us_data.head(59)

Unnamed: 0,ticker,qtd_dias,vix,fed_3_y,s_p,msci_top_ex_us,msci_emg,cmmdt,gold,momnt,quali,size,vlue,lowvol,return,vol,alpha,r_score,Div. Yield Med
3,CQP,2597,-0.027,0.006,0.046,0.065,0.033,0.056,0.193,-0.001,0.027,0.036,0.073,0.077,1.605,0.02046,0.0005,0.209,6.41
5,PKG,2597,-0.032,-0.001,0.08,0.063,0.028,0.005,0.01,-0.017,0.033,0.078,0.109,0.137,1.784,0.01726,0.00044,0.331,2.88
8,CAT,2597,-0.05,-0.001,0.081,0.088,0.047,-0.001,0.115,-0.009,0.032,0.079,0.108,0.157,1.98,0.01838,0.00041,0.437,2.17
11,WRB,2597,-0.02,0.0,0.074,0.049,0.005,0.009,0.017,-0.016,0.048,0.067,0.105,0.118,1.899,0.01469,0.0004,0.376,2.33
14,PCAR,2597,-0.043,0.001,0.078,0.066,0.031,0.008,0.027,-0.02,0.047,0.082,0.087,0.13,1.538,0.016,0.00035,0.412,3.98
15,CRH,2597,-0.054,0.004,0.084,0.15,0.061,0.064,0.025,0.015,0.05,0.08,0.12,0.122,1.795,0.01895,0.00034,0.476,2.1
18,APD,2597,-0.034,-0.0,0.079,0.079,0.042,0.023,0.025,0.018,0.056,0.079,0.091,0.095,1.498,0.01583,0.00031,0.401,2.1
19,CMI,2597,-0.035,0.001,0.078,0.08,0.044,0.017,0.051,-0.01,0.032,0.082,0.105,0.144,1.408,0.01674,0.00031,0.4,2.64
20,ITW,2597,-0.033,0.001,0.091,0.066,0.031,0.02,0.021,-0.005,0.038,0.103,0.106,0.126,1.439,0.01433,0.0003,0.544,2.2
23,IHG,2597,-0.033,0.003,0.092,0.146,0.078,0.057,0.012,-0.0,0.054,0.088,0.133,0.138,1.436,0.0187,0.00028,0.447,2.01


In [None]:
### Save the output inside Google Drive
# drive.mount('drive')

# final_us_data.to_csv('/content/drive/My Drive/data_lake/us_alpha_raking.csv', encoding='utf-8', index=False)

In [None]:
# Tickers selected by the alpha + dividend-yield screen
assets = final_us_data['ticker']

# Share of each dividend that is reinvested in the paying asset (1 = all)
REINVEST_SHARE = 1

# Load adjusted closes and dividends for every selected ticker
data = pd.DataFrame()

for ticker in assets:
  history = (
      yq.Ticker(ticker)
      .history(start = start, end = end, interval = '1d')
      .reset_index(0)
      [lambda x: pd.to_datetime(x.index).dayofweek < 5]
      [['adjclose', 'dividends']]
      .ffill()
  )

  # The first ticker defines the date index; later ones are aligned to it
  data[ticker] = history['adjclose']
  data['div_' + ticker] = history['dividends']


data.index = pd.to_datetime(data.index).tz_localize('UTC')


### Reinvestment simulation
# NOTE(review): 'adjclose' is presumably already adjusted for dividends, so
# reinvesting dividends on top of it may double count them - confirm whether
# 'close' should be used here.
amount_asset = 1000000 / len(assets)

for c in assets:

  prices = data[c].to_numpy()
  dividends = data['div_' + c].to_numpy()

  # Shares held per day, computed on plain arrays: chained assignment on the
  # DataFrame column (data[col][i] = ...) is not reliable in pandas.
  qty = np.full(len(data), np.nan)

  if len(qty) > 0:
    # Initial position: equal cash split, rounded to whole shares
    qty[0] = round(amount_asset / prices[0], 0)

  for t in range(1, len(qty)):

    if dividends[t-1] > 0:
      # Yesterday's dividend buys whole shares at today's price
      qty[t] = round(
          qty[t-1]
          + (dividends[t-1] * qty[t-1] * REINVEST_SHARE / prices[t])
          , 0
        )
    else:
      qty[t] = qty[t-1]

  data['qtd_pst_' + c] = qty

  # Daily market value of the position
  data['+' + c] = data['qtd_pst_' + c] * data[c]


# Position-value columns only; the index is already UTC-aware
div_data = data.filter(like='+')

# Sum of daily simple returns per position, best first
div_data.pct_change().sum().sort_values(ascending=False).head(59)

Unnamed: 0,0
+KDP,8.306963
+ET,3.290707
+MPLX,3.170304
+CQP,2.989768
+STLA,2.825128
+MPC,2.739524
+VLO,2.599855
+EXR,2.503763
+CAT,2.50257
+PKG,2.395116


In [None]:
# Tickers selected by the alpha + dividend-yield screen
assets = final_us_data['ticker']

# Share of each dividend that is reinvested in the paying asset
# (0 = dividends are not reinvested, i.e. the share count stays constant)
REINVEST_SHARE = 0

# Load adjusted closes and dividends for every selected ticker
data = pd.DataFrame()

for ticker in assets:
  history = (
      yq.Ticker(ticker)
      .history(start = start, end = end, interval = '1d')
      .reset_index(0)
      [lambda x: pd.to_datetime(x.index).dayofweek < 5]
      [['adjclose', 'dividends']]
      .ffill()
  )

  # The first ticker defines the date index; later ones are aligned to it
  data[ticker] = history['adjclose']
  data['div_' + ticker] = history['dividends']


data.index = pd.to_datetime(data.index).tz_localize('UTC')


### Positions without dividend reinvestment
amount_asset = 1000000 / len(assets)

for c in assets:

  prices = data[c].to_numpy()
  dividends = data['div_' + c].to_numpy()

  # Shares held per day, computed on plain arrays: chained assignment on the
  # DataFrame column (data[col][i] = ...) is not reliable in pandas.
  qty = np.full(len(data), np.nan)

  if len(qty) > 0:
    # Initial position: equal cash split, rounded to whole shares
    qty[0] = round(amount_asset / prices[0], 0)

  for t in range(1, len(qty)):

    if dividends[t-1] > 0:
      # Same update rule as the reinvestment case, with the reinvested
      # share set by REINVEST_SHARE
      qty[t] = round(
          qty[t-1]
          + (dividends[t-1] * qty[t-1] * REINVEST_SHARE / prices[t])
          , 0
        )
    else:
      qty[t] = qty[t-1]

  data['qtd_pst_' + c] = qty

  # Daily market value of the position
  data['+' + c] = data['qtd_pst_' + c] * data[c]


# Position-value columns only; the index is already UTC-aware
div_data = data.filter(like='+')

# Sum of daily simple returns per position, best first
div_data.pct_change().sum().sort_values(ascending=False).head(59)

Unnamed: 0,0
+MPC,2.348104
+CAT,2.156219
+STLA,2.116438
+VLO,2.092271
+WRB,2.07495
+EXR,2.027098
+PKG,2.006527
+CRH,2.006306
+ET,1.960428
+CQP,1.951045
