# Data Engineering

In [1]:
import pandas as pd
import numpy as np
import re

import fundamentus

from google.cloud import storage
import os

from utils import initialize_bucket
import requests
import requests_cache
import logging
import time
from datetime import date

from statsmodels.tsa.holtwinters import ExponentialSmoothing, Holt,SimpleExpSmoothing

2023-04-23 21:51:07,291 [logging.log_init] INFO: LOGLEVEL=INFO


In [2]:
pd.set_option("display.max_columns", 200)

In [3]:
credentials_path = 'datascience-capstone-project-05b1642f45c3.json'

## 1 - Data Collection

For the data gathering, two repositories on Github were combined, so that the historical data could be scraped from the web. This Raw data is beeing stored in the cloud (GCP) and are the main data used for the project.

Repositories utilized:
* https://github.com/mv/fundamentus-api
* https://github.com/Victorcorcos/bovespa-winner

In [4]:
def perc_to_float(val):
    """
    Percent to float
      - replace string in pt-br to float
      - from '45,56%' to 0.4556
    Input:
        (DataFrame, column_name)
    """

    res = val
    res = res.replace( to_replace=r'[%]', value='' , regex=True )
    res = res.replace( to_replace=r'[.]', value='' , regex=True )
    res = res.replace( to_replace=r'[,]', value='.', regex=True )
    res = res.astype(float) / 100

    return res

def _rename_cols(data):
    """
    Rename columns in DataFrame
      - use a valid Python identifier
      - so each column can be a DataFrame property
      - Example:
          df.pl > 0
    """

    df2 = pd.DataFrame()

    ## Fix: rename columns
    df2['cotacao'  ] = data['Cotação'          ]
    df2['pl'       ] = data['P/L'              ]
    df2['pvp'      ] = data['P/VP'             ]
    df2['psr'      ] = data['PSR'              ]
    df2['dy'       ] = data['Div.Yield'        ]
    df2['pa'       ] = data['P/Ativo'          ]
    df2['pcg'      ] = data['P/Cap.Giro'       ]
    df2['pebit'    ] = data['P/EBIT'           ]
    df2['pacl'     ] = data['P/Ativ Circ.Liq'  ]
    df2['evebit'   ] = data['EV/EBIT'          ]
    # df2['evebitda' ] = data['EV/EBITDA'        ]
    df2['mrgebit'  ] = data['Mrg Ebit'         ]
    df2['mrgliq'   ] = data['Mrg. Líq.'        ]
    df2['roic'     ] = data['ROIC'             ]
    df2['roe'      ] = data['ROE'              ]
    df2['liqc'     ] = data['Liq. Corr.'       ]
    df2['liq2m'    ] = data['Liq.2meses'       ]
    df2['patrliq'  ] = data['Patrim. Líq'      ]
    df2['divbpatr' ] = data['Dív.Brut/ Patrim.']
    df2['c5y'      ] = data['Cresc. Rec.5a'    ]

    return df2

def get_resultado_raw(url):
    """
    Get data from fundamentus:
      URL:
        http://fundamentus.com.br/resultado.php
    RAW:
      DataFrame preserves original HTML header names
    Output:
      DataFrame
    """

    ##
    ## Busca avançada por empresa
    ##
    # url = 'http://www.fundamentus.com.br/resultado.php'
    hdr = {'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
           'Accept': 'text/html, text/plain, text/css, text/sgml, */*;q=0.01',
           'Accept-Encoding': 'gzip, deflate',
           }

    with requests_cache.enabled():
        content = requests.get(url, headers=hdr)

        if content.from_cache:
            logging.debug('.../resultado.php: [CACHED]')
        else: # pragma: no cover
            logging.debug('.../resultado.php: sleeping...')
            time.sleep(.500) # 500 ms


    ## parse + load
    df = pd.read_html(content.text, decimal=",", thousands='.')[0]

    ## Fix: percent string
    df['Div.Yield']     = perc_to_float( df['Div.Yield']     )
    df['Mrg Ebit']      = perc_to_float( df['Mrg Ebit']      )
    df['Mrg. Líq.']     = perc_to_float( df['Mrg. Líq.']     )
    df['ROIC']          = perc_to_float( df['ROIC']          )
    df['ROE']           = perc_to_float( df['ROE']           )
    df['Cresc. Rec.5a'] = perc_to_float( df['Cresc. Rec.5a'] )

    ## index by 'Papel', instead of 'int'
    df.index = df['Papel']
    df.drop('Papel', axis='columns', inplace=True)
    df.sort_index(inplace=True)

    ## naming
    df.name = 'Fundamentus: HTML names'
    df.columns.name = 'Multiples'
    df.index.name = 'papel'

    ## return sorted by 'papel'
    return df


def get_resultado(url):
    """
    Data from fundamentus, fixing header names.
      URL:
        given from the user
      Obs:
        DataFrame uses short header names
    Output:
      DataFrame
    """

    ## get RAW data
    data1 = get_resultado_raw(url)

    ## rename!
    data2 = _rename_cols(data1)

    ## metadata
    data2.name = 'Fundamentus: short names'
    data2.columns.name = 'Multiples'
    data2.index.name = 'papel'

    ## remove duplicates
#   df = data2.drop_duplicates(subset=['cotacao','pl','pvp'], keep='last')
    df = data2.drop_duplicates(keep='first')

    return df


In [5]:


# def initialize_bucket(credentials_path, create_bucket=False):

#     os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = credentials_path

#     client = storage.Client()
#     bucket = client.bucket('storage-barsianize')
    
#     if create_bucket:
#         bucket.location = 'US-EAST1'
#         bucket.create()

#     return client, bucket
    

In [6]:
def get_windows(max, min, window=5, step=1):

    try:
        maxes = list(range(min+window,max+1, step))
        mins = list(range(min, max+1-window, step))
    except:
        print(f'error 1!!! min {maxes}, max {maxes}')
    sequences = []
    if len(maxes)==len(mins):
        for i in range(len(maxes)):
            sequence = list(range(mins[i],maxes[i]+1,step))
            sequences.append(sequence)
        return sequences
    else:
        print(f'error 2!!! min {maxes}, max {maxes}')

### Get historical data

In [7]:
urls = {
    2008: 'https://web.archive.org/web/20080613050801/http://www.fundamentus.com.br/resultado.php',
    2009: 'https://web.archive.org/web/20090123022224/http://www.fundamentus.com.br/resultado.php',
    2010: 'https://web.archive.org/web/20100115191626/http://www.fundamentus.com.br/resultado.php',
    2011: 'https://web.archive.org/web/20110113192117/http://www.fundamentus.com.br/resultado.php',
    2012: 'https://web.archive.org/web/20120106023830/http://www.fundamentus.com.br/resultado.php',
    2013: 'https://web.archive.org/web/20130105004012/http://www.fundamentus.com.br/resultado.php',
    2014: 'https://web.archive.org/web/20140108164618/http://www.fundamentus.com.br/resultado.php',
    2015: 'https://web.archive.org/web/20150119231047/http://www.fundamentus.com.br/resultado.php',
    2016: 'https://web.archive.org/web/20160106101916/http://www.fundamentus.com.br/resultado.php',
    2017: 'https://web.archive.org/web/20170505164235/http://www.fundamentus.com.br/resultado.php',
    2018: 'https://web.archive.org/web/20180105120409/http://www.fundamentus.com.br/resultado.php',
    2019: 'https://web.archive.org/web/20190102202956/http://www.fundamentus.com.br/resultado.php',
    2020: 'https://web.archive.org/web/20200122200313/http://www.fundamentus.com.br/resultado.php',
    2021: 'https://web.archive.org/web/20210227034423/http://www.fundamentus.com.br/resultado.php',
    2022: 'https://web.archive.org/web/20220314021607/http://www.fundamentus.com.br/resultado.php',
    2023: 'https://web.archive.org/web/20230324145930/http://www.fundamentus.com.br/resultado.php'
  }
  
years = list(range(2008,2024))

def get_dates(urls):
    dict_dates = {}
    for key, value in urls.items():
        str_1 = re.split('/web/',value)[1]
        str_2 = re.split('/http',str_1)[0]
        str_date = f'{str_2[:4]}-{str_2[4:6]}-{str_2[6:8]}'
        dict_dates[int(str_2[:4])] = pd.to_datetime(str_date)
    return dict_dates

dates = get_dates(urls)

df_full = pd.DataFrame()
for year in years:
    df = get_resultado(urls[year])
    df['year'] = year
    df['date'] = dates[year]
    df_full = pd.concat([df_full,df])
    
df_full = df_full.reset_index()

In [8]:
df_full.head()

Multiples,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year,date
0,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1107010000.0,0.0,0.0,2008,2008-06-13
1,ABCB4,9.0,0.0,1.1,0.0,0.0489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4160460.0,1107010000.0,0.0,0.0,2008,2008-06-13
2,ABNB3,17.35,12.39,3.27,1.845,0.0254,2.59,7.26,8.45,5.04,8.33,0.2182,0.149,0.3465,0.2639,3.28,2586090.0,273349000.0,0.0,0.1446,2008,2008-06-13
3,ABYA3,16.0,25.67,4.13,4.399,0.0079,0.57,2.1,17.45,2.51,25.25,0.252,0.1714,0.0356,0.1608,1.94,5915230.0,197161000.0,2.4,0.0,2008,2008-06-13
4,ACES3,95.27,8.79,2.28,1.693,0.0,1.438,6.4,7.2,3.77,6.54,0.2352,0.1927,0.269,0.2595,2.05,181013.0,3105800000.0,0.09,0.1334,2008,2008-06-13


### Get detailed information about each ticker

In [9]:
# papeis = df_full['papel'].unique()

# df_papeis = pd.DataFrame()
# for papel in papeis:
#     try:
#         df = fundamentus.get_papel(papel)
#         df_papeis = pd.concat([df_papeis,df])
#     except:
#         print(f'fail papel {papel}')

In [10]:
df_full[df_full['papel']=='ITUB4']

Multiples,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year,date
1885,ITUB4,37.91,19.86,3.55,0.0,0.0188,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1786,0.0,168656000.0,48861600000.0,0.0,0.1767,2010,2010-01-15
2654,ITUB4,40.25,14.55,3.22,0.0,0.0246,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.221,0.0,215639000.0,57225100000.0,0.0,0.2734,2011,2011-01-13
3440,ITUB4,34.66,13.27,2.09,0.0,0.0292,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1572,0.0,246331000.0,75916000000.0,0.0,-0.4082,2012,2012-01-06
4236,ITUB4,35.51,14.42,1.93,0.0,0.0325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1337,0.0,272830000.0,84160400000.0,0.0,-0.4905,2013,2013-01-05
5051,ITUB4,31.27,14.83,1.8,0.0,0.0316,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1214,0.0,312727000.0,87334800000.0,0.0,-0.4519,2014,2014-01-08
5870,ITUB4,33.1,12.07,1.88,0.0,0.0299,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1559,0.0,424769000.0,97269900000.0,0.0,-0.1391,2015,2015-01-19
6689,ITUB4,25.34,6.96,1.43,0.0,0.0584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.205,0.0,461788000.0,108020000000.0,0.0,0.2599,2016,2016-01-06
7524,ITUB4,38.1,12.31,2.13,0.0,0.0468,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1733,0.0,429483000.0,117557000000.0,0.0,0.1019,2017,2017-05-05
8372,ITUB4,45.1,13.7,2.35,0.0,0.0334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1717,0.0,484131000.0,125575000000.0,0.0,0.0391,2018,2018-01-05
9224,ITUB4,35.5,17.59,2.77,0.0,0.0595,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1576,0.0,707858000.0,125534000000.0,0.0,-0.1029,2019,2019-01-02


In [11]:
df_full.head()

Multiples,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year,date
0,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1107010000.0,0.0,0.0,2008,2008-06-13
1,ABCB4,9.0,0.0,1.1,0.0,0.0489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4160460.0,1107010000.0,0.0,0.0,2008,2008-06-13
2,ABNB3,17.35,12.39,3.27,1.845,0.0254,2.59,7.26,8.45,5.04,8.33,0.2182,0.149,0.3465,0.2639,3.28,2586090.0,273349000.0,0.0,0.1446,2008,2008-06-13
3,ABYA3,16.0,25.67,4.13,4.399,0.0079,0.57,2.1,17.45,2.51,25.25,0.252,0.1714,0.0356,0.1608,1.94,5915230.0,197161000.0,2.4,0.0,2008,2008-06-13
4,ACES3,95.27,8.79,2.28,1.693,0.0,1.438,6.4,7.2,3.77,6.54,0.2352,0.1927,0.269,0.2595,2.05,181013.0,3105800000.0,0.09,0.1334,2008,2008-06-13


### Storing the raw data into Google Cloud

In [12]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

# upload raw fundamentalist data to Google Cloud Storage
blob = bucket.blob('raw/df_full.csv')
blob.upload_from_string(df_full.to_csv(), 'text/csv')
# # upload raw fundamentalist data to Google Cloud Storage
# blob = bucket.blob('raw/df_papeis.csv')
# blob.upload_from_string(df_papeis.to_csv(), 'text/csv')
# upload raw fundamentalist data to Google Cloud Storage
# blob = bucket.blob('raw/categories.csv')
# blob.upload_from_string(categories.to_csv(), 'text/csv')

## 2 - Data Cleansing

### Load data

In [13]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

path = "gs://storage-barsianize/raw/df_full.csv"
df_full =  pd.read_csv(path, index_col=0)

path = "gs://storage-barsianize/raw/df_papeis.csv"
df_papeis = pd.read_csv(path, index_col=0)

Desired information about each ticker

In [14]:
df_full.head()

Unnamed: 0,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year,date
0,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1107010000.0,0.0,0.0,2008,2008-06-13
1,ABCB4,9.0,0.0,1.1,0.0,0.0489,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4160460.0,1107010000.0,0.0,0.0,2008,2008-06-13
2,ABNB3,17.35,12.39,3.27,1.845,0.0254,2.59,7.26,8.45,5.04,8.33,0.2182,0.149,0.3465,0.2639,3.28,2586090.0,273349000.0,0.0,0.1446,2008,2008-06-13
3,ABYA3,16.0,25.67,4.13,4.399,0.0079,0.57,2.1,17.45,2.51,25.25,0.252,0.1714,0.0356,0.1608,1.94,5915230.0,197161000.0,2.4,0.0,2008,2008-06-13
4,ACES3,95.27,8.79,2.28,1.693,0.0,1.438,6.4,7.2,3.77,6.54,0.2352,0.1927,0.269,0.2595,2.05,181013.0,3105800000.0,0.09,0.1334,2008,2008-06-13


In [15]:
df_papeis.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,Cotacao,Data_ult_cot,Min_52_sem,Max_52_sem,Vol_med_2m,Valor_de_mercado,Valor_da_firma,Ult_balanco_processado,Nro_Acoes,PL,PVP,PEBIT,PSR,PAtivos,PCap_Giro,PAtiv_Circ_Liq,Div_Yield,EV_EBITDA,EV_EBIT,Cres_Rec_5a,LPA,VPA,Marg_Bruta,Marg_EBIT,Marg_Liquida,EBIT_Ativo,ROIC,ROE,Liquidez_Corr,Div_Br_Patrim,Giro_Ativos,Ativo,Cart_de_Credito,Depositos,Patrim_Liq,Result_Int_Financ_12m,Rec_Servicos_12m,Lucro_Liquido_12m,Result_Int_Financ_3m,Rec_Servicos_3m,Lucro_Liquido_3m,Disponibilidades,Ativo_Circulante,Div_Bruta,Div_Liquida,Receita_Liquida_12m,EBIT_12m,Receita_Liquida_3m,EBIT_3m
ABCB3,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,0.0,1899-12-30,0.0,0.0,0,0,-,2022-09-30,226090000,0,0,-,-,-,-,-,0.0%,-,-,30.5%,0,0,-,-,0.0%,0.0%,-,15.0%,-,-,-,52140500000,0.0,0.0,5070730000,1394250000.0,362890000.0,763129000,382239000.0,118020000.0,218148000,,,,,,,,
ABCB4,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,17.91,2023-01-04,14.06,22.3,13880200,4049270000,-,2022-09-30,226090000,531,80,-,-,-,-,-,7.8%,-,-,30.5%,338,2243,-,-,0.0%,0.0%,-,15.0%,-,-,-,52140500000,0.0,0.0,5070730000,1394250000.0,362890000.0,763129000,382239000.0,118020000.0,218148000,,,,,,,,
ABYA3,ABYA3,ON NM,ABYARA ON NM,Construção Civil,Incorporações,4.91,2010-02-11,0.0,0.0,0,515088000,868691000,2010-03-31,104906000,-21480,176,1996,205,053,198,-275,0.0%,3367,3367,16.4%,-2,278,32.0%,10.3%,-1.0%,2.6%,2.8%,-0.8%,209,131,026,978287000,,,292060000,,,-2398000,,,-766000,29556000.0,498222000.0,383159000.0,353603000.0,250657000.0,25802000.0,67990000.0,18057000.0
ACES3,ACES3,ON,ARCELORMITTAL INOX BRASIL ON,Siderurgia e Metalurgia,Siderurgia,95.27,2008-04-18,0.0,0.0,0,7080470000,6437160000,2007-12-31,74320000,879,228,720,169,144,640,2077,0.0%,654,654,13.3%,1084,4179,32.8%,23.5%,19.3%,20.0%,26.9%,25.9%,205,009,085,4922360000,,,3105800000,,,805922000,,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0
ACES4,ACES4,PN,ARCELORMITTAL INOX BRASIL PN,Siderurgia e Metalurgia,Siderurgia,94.73,2008-04-24,0.0,0.0,0,7040330000,6397030000,2007-12-31,74320000,874,227,716,168,143,637,2065,0.0%,650,650,13.3%,1084,4179,32.8%,23.5%,19.3%,20.0%,26.9%,25.9%,205,009,085,4922360000,,,3105800000,,,805922000,,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0


In [16]:
df_papeis.columns

Index(['Papel', 'Tipo', 'Empresa', 'Setor', 'Subsetor', 'Cotacao',
       'Data_ult_cot', 'Min_52_sem', 'Max_52_sem', 'Vol_med_2m',
       'Valor_de_mercado', 'Valor_da_firma', 'Ult_balanco_processado',
       'Nro_Acoes', 'PL', 'PVP', 'PEBIT', 'PSR', 'PAtivos', 'PCap_Giro',
       'PAtiv_Circ_Liq', 'Div_Yield', 'EV_EBITDA', 'EV_EBIT', 'Cres_Rec_5a',
       'LPA', 'VPA', 'Marg_Bruta', 'Marg_EBIT', 'Marg_Liquida', 'EBIT_Ativo',
       'ROIC', 'ROE', 'Liquidez_Corr', 'Div_Br_Patrim', 'Giro_Ativos', 'Ativo',
       'Cart_de_Credito', 'Depositos', 'Patrim_Liq', 'Result_Int_Financ_12m',
       'Rec_Servicos_12m', 'Lucro_Liquido_12m', 'Result_Int_Financ_3m',
       'Rec_Servicos_3m', 'Lucro_Liquido_3m', 'Disponibilidades',
       'Ativo_Circulante', 'Div_Bruta', 'Div_Liquida', 'Receita_Liquida_12m',
       'EBIT_12m', 'Receita_Liquida_3m', 'EBIT_3m'],
      dtype='object')

In [17]:
# info_papeis = ['Papel','Tipo', 'Empresa', 'Setor', 'Subsetor','Data_ult_cot']
info_papeis = ['Papel','Tipo', 'Empresa', 'Setor', 'Subsetor','Data_ult_cot']
               
df_papeis_clean = df_papeis[info_papeis]

df_papeis_clean = df_papeis_clean.dropna()
df_papeis_clean = df_papeis_clean.drop_duplicates()

In [18]:
df_papeis_clean['Data_ult_cot'] = pd.to_datetime(df_papeis_clean['Data_ult_cot'])

In [19]:
df_completed = df_papeis_clean.merge(df_full, how='left', left_index=True, right_on='papel').drop_duplicates()

In [20]:
df_completed.isna().mean().sort_values(ascending=False)

Papel           0.0
Tipo            0.0
year            0.0
c5y             0.0
divbpatr        0.0
patrliq         0.0
liq2m           0.0
liqc            0.0
roe             0.0
roic            0.0
mrgliq          0.0
mrgebit         0.0
evebit          0.0
pacl            0.0
pebit           0.0
pcg             0.0
pa              0.0
dy              0.0
psr             0.0
pvp             0.0
pl              0.0
cotacao         0.0
papel           0.0
Data_ult_cot    0.0
Subsetor        0.0
Setor           0.0
Empresa         0.0
date            0.0
dtype: float64

In [21]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

# upload raw fundamentalist data to Google Cloud Storage
blob = bucket.blob('trusted/df_completed.csv')
blob.upload_from_string(df_completed.to_csv(), 'text/csv',)

## 3 - Data Preparation

In [22]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

In [23]:
path = "gs://storage-barsianize/trusted/df_completed.csv"
df_completed =  pd.read_csv(path, index_col=0)
df_completed['Data_ult_cot'] = pd.to_datetime(df_completed['Data_ult_cot'])

In [24]:
df_completed.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,Data_ult_cot,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year,date
0,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,1899-12-30,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1107010000.0,0.0,0.0,2008,2008-06-13
736,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,1899-12-30,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1553,0.0,0.0,1158760000.0,0.0,0.0,2009,2009-01-23
1478,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,1899-12-30,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1016,0.0,0.0,1182180000.0,0.0,-0.1233,2010,2010-01-15
2237,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,1899-12-30,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1521,0.0,0.0,1312080000.0,0.0,0.1043,2011,2011-01-13
3020,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,1899-12-30,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1571,0.0,0.0,1460150000.0,0.0,0.1272,2012,2012-01-06


In [25]:
df_completed.columns

Index(['Papel', 'Tipo', 'Empresa', 'Setor', 'Subsetor', 'Data_ult_cot',
       'papel', 'cotacao', 'pl', 'pvp', 'psr', 'dy', 'pa', 'pcg', 'pebit',
       'pacl', 'evebit', 'mrgebit', 'mrgliq', 'roic', 'roe', 'liqc', 'liq2m',
       'patrliq', 'divbpatr', 'c5y', 'year', 'date'],
      dtype='object')

In [26]:
df_completed['Data_ult_cot_year'] = df_completed['Data_ult_cot'].dt.year

In [27]:
df_actual = df_completed[df_completed['Data_ult_cot_year'] == pd.to_datetime(date.today()).year]

In [28]:
tickers = df_actual['Papel'].unique()

In [29]:
tickers

array(['ABCB4', 'AGRO3', 'ALPA3', 'ALPA4', 'AZEV3', 'AZEV4', 'BAHI3',
       'BALM3', 'BALM4', 'BAZA3', 'BBAS3', 'BBDC3', 'BBDC4', 'BDLL4',
       'BEEF3', 'BEES3', 'BGIP4', 'BIOM3', 'BMEB3', 'BMEB4', 'BMIN3',
       'BMIN4', 'BMKS3', 'BOBR4', 'BRAP3', 'BRAP4', 'BRGE11', 'BRGE3',
       'BRIV3', 'BRIV4', 'BRKM3', 'BRKM5', 'BRML3', 'BRSR3', 'BRSR5',
       'BRSR6', 'CALI3', 'CAMB3', 'CBEE3', 'CCRO3', 'CEBR3', 'CEBR5',
       'CEBR6', 'CEDO4', 'CEEB3', 'CGAS3', 'CGAS5', 'CGRA3', 'CGRA4',
       'CLSC3', 'CMIG3', 'CMIG4', 'COCE5', 'CPFE3', 'CPLE3', 'CPLE6',
       'CRIV3', 'CRIV4', 'CSAB3', 'CSAB4', 'CSAN3', 'CSMG3', 'CSNA3',
       'CSRN3', 'CSRN5', 'CSRN6', 'CTKA4', 'CTNM4', 'CTSA3', 'CTSA4',
       'CYRE3', 'DASA3', 'DOHL4', 'EALT3', 'EALT4', 'EKTR3', 'EKTR4',
       'ELET3', 'ELET6', 'EMAE4', 'EMBR3', 'ENBR3', 'ENGI3', 'ENGI4',
       'EQTL3', 'ESTR4', 'ETER3', 'EUCA3', 'EUCA4', 'EVEN3', 'EZTC3',
       'FESA3', 'FESA4', 'FHER3', 'GEPA3', 'GEPA4', 'GFSA3', 'GGBR3',
       'GGBR4', 'GO

In [30]:
stock_info = ['Papel','Tipo','Empresa','Setor','Subsetor']

indicators = ['cotacao', 'pl', 'pvp', 'psr', 'dy', 'pa', 'pcg', 'pebit',
              'pacl', 'evebit', 'mrgebit', 'mrgliq', 'roic', 'roe', 'liqc', 'liq2m',
              'patrliq', 'divbpatr', 'c5y','date','year']

In [31]:
df_indicators = df_actual[np.isin(df_actual['Papel'],tickers)][stock_info + indicators]

In [32]:
data = df_actual[df_actual['Papel']==tickers[2]].sort_values('year', ascending=False)

max = data['year'].max()
min = data['year'].min()
windows = get_windows(max, min, window=5, step=1)

### Get relative information for each sector

In [33]:
cols_to_rel = ['mrgliq','liq2m','patrliq','mrgebit','c5y']

def relativize(data, cols, sector_col, window):
    """
    Relativize the values in a DataFrame by dividing them by the corresponding values in a reference DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        The DataFrame to relativize.
    cols : list of str
        The column names to relativize.
    sector_col : str
        The name of the column containing the sector information.

    Returns
    -------
    pandas.DataFrame
        A copy of the input DataFrame `data` with the specified columns relativized.
    """
    data_out = data.copy()
    sectors = data_out[sector_col].unique()
    max_window = np.max(window)

    ref_data = data_out.groupby(sector_col)[cols].mean()
    
    
    data_out = data_out[data_out['year']==max_window]
    for sector in sectors:
        sector_data = data_out.loc[data_out[sector_col] == sector]
        for col in cols:
            sector_data[col] /= ref_data.at[sector, col]
        data_out.loc[data_out[sector_col] == sector] = sector_data
    # data_out = data_out[[sector_col] + cols + ['year','Papel']]

    ref_data.columns = [col+'_field' for col in ref_data.columns]
    ref_data['year'] = max_window
    
    return ref_data, data_out

In [34]:
max = 2023
min = 2008

windows = get_windows(max, min, window=5, step=1)

In [35]:
databases = []
for window in windows:
    
    ref_data, data_out = relativize(df_actual[np.isin(df_actual['year'],window)], cols_to_rel, 'Setor',window)
    database = data_out.merge(ref_data, how='left', left_on='Setor',right_index=True)
    databases.append(database)

database_full = pd.concat(databases)

database_full['year'] = database_full['year_x'].astype(int)
database_full.drop(['year_x','year_y'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sector_data[col] /= ref_data.at[sector, col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sector_data[col] /= ref_data.at[sector, col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sector_data[col] /= ref_data.at[sector, col]
A value is trying to be set on a copy of a slice from a DataFrame.
Try

In [36]:
import requests
import pandas as pd

def get_bcb_data(codes, start_date, end_date, output_format='json'):
    """
    Get macroeconomic data from the Brazilian Central Bank's API.

    Args:
        codes (list): List of series codes to retrieve. See the API documentation for available codes.
        start_date (str): Start date in format 'dd/mm/yyyy'.
        end_date (str): End date in format 'dd/mm/yyyy'.
        output_format (str, optional): Output format, either 'json' or 'csv'. Defaults to 'json'.

    Returns:
        pandas.DataFrame: DataFrame with the requested series data.

    Raises:
        ValueError: If the output format is invalid.
        requests.exceptions.RequestException: If the API request fails.

    Example:
        Get the values of the crude oil production, commodities, dollar, euro, IPCA, IGPM, and Selic series from January 1st, 2021 to December 31st, 2021 in JSON format:

        >>> codes = [13522, 13521, 4390, 189, 11, 1178]
        >>> start_date = '01/01/2021'
        >>> end_date = '31/12/2021'
        >>> output_format = 'json'
        >>> df = get_bcb_data(codes, start_date, end_date, output_format)

    """
    # Define the base URL of the API
    url_base = "https://api.bcb.gov.br/dados/serie/bcdata.sgs.{}/dados"

    # Define the output format parameter
    if output_format not in ['json', 'csv']:
        raise ValueError("Invalid output format. Must be 'json' or 'csv'.")
    formato = output_format

    # Create a dictionary to store the series DataFrames
    dataframes = {}

    # Get the data for each series and store it in a DataFrame
    for code_name, code in codes.items():
        # Build the complete URL with the defined parameters
        url = url_base.format(code) + f"?formato={formato}&dataInicial={start_date}&dataFinal={end_date}"
        # Make the API request
        response = requests.get(url)
        # Check if the request was successful
        if response.status_code == 200:
            # Convert the response to a pandas DataFrame and set the column name to the series code
            try:
                df = pd.read_json(response.text)
            except:
                try:
                    df = pd.read_xml(response.text)
                except:
                    continue
            col_name = code_name
            df = df.rename(columns={"valor": col_name})
            # Set the DataFrame index to the date
            df = df.set_index("data")
            # Store the DataFrame in the dictionary
            dataframes[col_name] = df[col_name]
        else:
            raise requests.exceptions.RequestException(f"Error getting data. HTTP status code: {response.status_code}")

    # Combine the DataFrames for each series into a single DataFrame
    final_df = pd.concat(dataframes.values(), axis=1)

    return final_df

In [37]:
# https://www3.bcb.gov.br/sgspub/localizarseries/localizarSeries.do?method=prepararTelaLocalizarSeries

codes = {
    'preco_do_petroleo': 4390,
    'preco_do_minerio_de_ferro': 25521,
    'indice_da_industria': 24369,
    'indice_do_agro': 24368,
    'dolar_comercial': 1,
    'euro': 21619,
    'ibovespa': 23686,
    'pib': 21920,
    'pib_dolarizado': 22786,
    'igpm': 189,
    'ipca': 433,
    'selic': 11
}
start_date = "01/01/2008"
end_date = "28/02/2023"
output_format='json'

final_df = get_bcb_data(codes, start_date, end_date, output_format=output_format)

In [38]:
final_df

Unnamed: 0_level_0,preco_do_petroleo,indice_da_industria,dolar_comercial,euro,ibovespa,pib_dolarizado,igpm,ipca,selic
data,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
01/01/2008,0.93,,,,267227.1,1183.3,1.09,0.54,
01/02/2008,0.80,,1.7451,2.58659,,771.3,0.53,0.49,0.042065
01/03/2008,0.84,,,,,1107.8,0.74,0.48,
01/04/2008,0.90,,1.7534,2.73574,,1005.5,0.69,0.55,0.042065
01/05/2008,0.88,,,,,838.4,1.61,0.79,
...,...,...,...,...,...,...,...,...,...
22/02/2023,,,5.1730,5.49790,,,,,0.050788
23/02/2023,,,5.1330,5.44050,,,,,0.050788
24/02/2023,,,5.1791,5.46080,,,,,0.050788
27/02/2023,,,5.1960,5.50930,,,,,0.050788


In [39]:
final_df = final_df.sort_values(by='data').reset_index()
final_df['data'] = pd.to_datetime(final_df['data'])
final_df['year'] = final_df['data'].dt.year
final_df['month'] = final_df['data'].dt.month
final_df = final_df.groupby(pd.Grouper(key='data', freq='Y',origin=0, label='left')).first()\
                    .fillna(0).reset_index()

bc_data_columns = final_df.columns
database_full['date'] = pd.to_datetime(database_full['date'])
database_full['year'] = database_full['date'].dt.year
database_full = database_full.copy()
database_full = database_full.merge(final_df, how='left', on='year')

  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_datetime(final_df['data'])
  final_df['data'] = pd.to_d

In [40]:
database_full.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,Data_ult_cot,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,evebit,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,date,Data_ult_cot_year,mrgliq_field,liq2m_field,patrliq_field,mrgebit_field,c5y_field,year,data,preco_do_petroleo,indice_da_industria,dolar_comercial,euro,ibovespa,pib_dolarizado,igpm,ipca,selic,month
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,2023-01-04,ABCB4,14.19,8.75,1.22,0.0,0.0334,0.0,0.0,0.0,0.0,0.0,-0.0,0.0,0.0,0.1389,0.0,0.198734,0.104159,0.0,0.623592,2013-01-05,2023,0.202566,21882470.0,15731850000.0,-0.245499,0.136948,2013,2012-12-31,0.6,7.3,1.9843,2.6981,202662.4,1656.7,0.34,0.86,0.02726,1
1,AGRO3,ON NM,BRASILAGRO ON NM,Agropecuária,Agricultura,2023-01-04,AGRO3,10.09,-22.64,1.06,3.842,0.0,0.808,5.25,-28.57,16.14,-30.41,0.34795,-0.609352,-0.0327,-0.0468,2.16,0.171306,0.687695,0.16,1.370918,2013-01-05,2023,0.288175,2133847.0,809221800.0,-0.38655,0.543067,2013,2012-12-31,0.6,7.3,1.9843,2.6981,202662.4,1656.7,0.34,0.86,0.02726,1
2,ALPA3,ON N1,ALPARGATAS ON N1,"Tecidos, Vestuário e Calçados",Calçados,2023-01-04,ALPA3,14.95,21.33,3.6,2.033,0.0142,2.107,5.44,18.14,7.73,17.01,1.832448,6.775139,0.1895,0.1686,2.3,1.871477,3.179211,0.19,2.004121,2013-01-05,2023,0.014096,485831.2,508487800.0,0.061175,0.082829,2013,2012-12-31,0.6,7.3,1.9843,2.6981,202662.4,1656.7,0.34,0.86,0.02726,1
3,ALPA4,PN N1,ALPARGATAS PN N1,"Tecidos, Vestuário e Calçados",Calçados,2023-01-04,ALPA4,15.15,21.62,3.64,2.06,0.0154,2.136,5.51,18.38,7.84,17.25,1.832448,6.775139,0.1895,0.1686,2.3,11.222601,3.179211,0.19,2.004121,2013-01-05,2023,0.014096,485831.2,508487800.0,0.061175,0.082829,2013,2012-12-31,0.6,7.3,1.9843,2.6981,202662.4,1656.7,0.34,0.86,0.02726,1
4,AZEV3,ON,AZEVEDO & TRAVASSOS ON,Construção e Engenharia,Construção Pesada,2023-01-04,AZEV3,10.01,39.69,8.05,1.207,0.0,1.891,6.0,21.78,-12.97,22.87,0.43872,0.549261,0.0961,0.2028,2.04,0.0,0.821983,0.52,-0.501056,2013-01-05,2023,0.055347,423365.4,40845120.0,0.126276,0.140703,2013,2012-12-31,0.6,7.3,1.9843,2.6981,202662.4,1656.7,0.34,0.86,0.02726,1


In [41]:
tickers = database_full['Papel'].unique()

In [42]:
def forecast_model(vals):

    try:
        forecast_model = SimpleExpSmoothing(vals).fit(
        smoothing_level=0.75, optimized=False
        )
        return forecast_model.forecast(1)
       
    except:
        print('forecast fail')
        return 0 

In [43]:
def get_variations(data, windows, stock_info_cols):
    """
    Compute various statistics for a set of rolling windows of data.

    Parameters
    ----------
    data : pandas.DataFrame
        The input DataFrame containing the data to compute statistics for.
    windows : list of list of int
        A list of rolling windows to compute statistics for. Each window should be a list of years.
    stock_info_cols : list of str
        The names of the columns containing stock information.

    Returns
    -------
    pandas.DataFrame
        A DataFrame containing the computed statistics for each rolling window.
    """

    data_line_full = pd.DataFrame()
    cols = ['cotacao', 'pl', 'pvp', 'psr', 'dy', 'pa', 'pcg',
            'pebit', 'pacl', 'evebit', 'mrgebit', 'mrgliq', 'roic', 'roe', 'liqc',
            'liq2m', 'patrliq', 'divbpatr', 'c5y',
            'preco_do_petroleo','dolar_comercial','euro','ibovespa','pib_dolarizado','igpm','ipca','selic']
    extra_cols = list(data.columns[~np.isin(data.columns, stock_info_cols + cols)])
    
    for window in windows:
        window.sort(reverse=True)

        data_window = data[np.isin(data['year'],window)]
        data_window = data_window.sort_values(by='year', ascending=False)
        # data_full_window = data_full[np.isin(data_full['year'],window)]
        # data_full_window = data_full_window.sort_values(by='year', ascending=False)

        # rel_base = relativize(data_full_window, cols_to_rel, 'Setor')

        if np.sum(data['year']>np.max(window)):

            data_line = {}

            #get label
            data_line['dy_label'] = data[data['year'] == (np.max(window)+1)]['dy'].values[0]

            for col in cols:
                data_line[col] = data_window[col].iloc[0]
                try:
                    data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
                except:
                    data_line[f'{col}_rel'] = 0

                data_line[f'{col}_median'] = data_window[col].median()

                data_line[f'{col}_mean'] = data_window[col].mean()

                try:
                    data_line[f'{col}_mean_median_ratio'] = data_line[f'{col}_mean']/data_line[f'{col}_median']
                except:
                    data_line[f'{col}_rel'] = 0

                data_line[f'{col}_std'] = data_window[col].std()

                try:
                    data_line[f'{col}_spread'] = (data_window[col].max() - data_window[col].min()) / data_window[col].mean()
                except:
                    data_line[f'{col}_spread'] = 0

                try:
                    data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
                except:
                    data_line[f'{col}_spread_rel_now'] = 0

                if col == 'dy':
                    vals = data_window[col].fillna(0).values

                    data_line[f'{col}_sequence'] = np.sum(vals > 0)

                    data_line[f'{col}_sequence_bool'] = int(np.sum(vals > 0) == len(vals))

                    data_line[f'{col}_forecast'] = forecast_model(vals)

            for extra_col in extra_cols:
                data_line[extra_col] = data_window[extra_col].iloc[0]
           
            df_data_line = pd.DataFrame(data_line, index=[0])
            data_line_full = pd.concat([data_line_full, df_data_line])

        
        else:
            continue
    
    statistic_cols = list(data_line_full.columns)
    # extra_cols = list(data.columns[~np.isin(data.columns, stock_info_cols + cols)])

    data_line_full[stock_info_cols] = data[stock_info].iloc[0]
    data_line_full = data_line_full[stock_info_cols + statistic_cols]

    return data_line_full

In [44]:
def get_full_variations(data, tickers, window, stock_info_cols):

    df_variations = pd.DataFrame()

    for ticker in tickers:

        data_ticker = data[data['Papel']==ticker].sort_values('year', ascending=False).copy()

        max = data_ticker['year'].max()
        min = data_ticker['year'].min()

        if (max - min) < window:
            continue
        else:
            windows = get_windows(max, min, window=window, step=1)

            ticker_variation = get_variations(data_ticker, windows, stock_info_cols)
        
            df_variations = pd.concat([df_variations, ticker_variation])

    return df_variations


In [45]:
base_dataset = get_full_variations(database_full, tickers, window=5, stock_info_cols=stock_info)

  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = data_window[col].iloc[0] / data_window[col].iloc[-1]
  data_line[f'{col}_spread_rel_now'] = data_line[f'{col}_spread']/data_line[col]
  data_line[f'{col}_rel'] = 

In [46]:
base_dataset.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,dy_label,cotacao,cotacao_rel,cotacao_median,cotacao_mean,cotacao_mean_median_ratio,cotacao_std,cotacao_spread,cotacao_spread_rel_now,pl,pl_rel,pl_median,pl_mean,pl_mean_median_ratio,pl_std,pl_spread,pl_spread_rel_now,pvp,pvp_rel,pvp_median,pvp_mean,pvp_mean_median_ratio,pvp_std,pvp_spread,pvp_spread_rel_now,psr,psr_rel,psr_median,psr_mean,psr_std,psr_spread,psr_spread_rel_now,dy,dy_rel,dy_median,dy_mean,dy_mean_median_ratio,dy_std,dy_spread,dy_spread_rel_now,dy_sequence,dy_sequence_bool,dy_forecast,pa,pa_rel,pa_median,pa_mean,pa_std,pa_spread,pa_spread_rel_now,pcg,pcg_rel,pcg_median,pcg_mean,pcg_std,pcg_spread,pcg_spread_rel_now,pebit,pebit_rel,pebit_median,pebit_mean,pebit_std,pebit_spread,pebit_spread_rel_now,pacl,pacl_rel,pacl_median,pacl_mean,pacl_std,pacl_spread,pacl_spread_rel_now,evebit,evebit_rel,evebit_median,evebit_mean,evebit_std,evebit_spread,evebit_spread_rel_now,mrgebit,mrgebit_rel,mrgebit_median,mrgebit_mean,mrgebit_std,mrgebit_spread,mrgebit_spread_rel_now,mrgliq,mrgliq_rel,mrgliq_median,mrgliq_mean,mrgliq_std,mrgliq_spread,mrgliq_spread_rel_now,roic,roic_rel,roic_median,...,divbpatr_mean,divbpatr_std,divbpatr_spread,divbpatr_spread_rel_now,c5y,c5y_rel,c5y_median,c5y_mean,c5y_mean_median_ratio,c5y_std,c5y_spread,c5y_spread_rel_now,preco_do_petroleo,preco_do_petroleo_rel,preco_do_petroleo_median,preco_do_petroleo_mean,preco_do_petroleo_mean_median_ratio,preco_do_petroleo_std,preco_do_petroleo_spread,preco_do_petroleo_spread_rel_now,dolar_comercial,dolar_comercial_rel,dolar_comercial_median,dolar_comercial_mean,dolar_comercial_mean_median_ratio,dolar_comercial_std,dolar_comercial_spread,dolar_comercial_spread_rel_now,euro,euro_rel,euro_median,euro_mean,euro_mean_median_ratio,euro_std,euro_spread,euro_spread_rel_now,ibovespa,ibovespa_rel,ibovespa_median,ibovespa_mean,ibovespa_mean_median_ratio,ibovespa_std,ibovespa_spread,ibovespa_spread_rel_now,pib_dolarizado,pib_dolarizado_rel,pib_dolarizado_median,pib_dolarizado_mean,pib_dolarizado_mean_median_ratio,pib_dolarizado_std,pib_dolarizado_spread,pib_dolarizado_spread_rel_now,igpm,igpm_rel,igpm_median,igpm_mean,igpm_mean_median_ratio,igpm_std,igpm_spread,igpm_spread_rel_now,ipca,ipca_rel,ipca_median,ipca_mean,ipca_mean_median_ratio,ipca_std,ipca_spread,ipca_spread_rel_now,selic,selic_rel,selic_median,selic_mean,selic_mean_median_ratio,selic_std,selic_spread,selic_spread_rel_now,Data_ult_cot,papel,date,Data_ult_cot_year,mrgliq_field,liq2m_field,patrliq_field,mrgebit_field,c5y_field,year,data,indice_da_industria,month,psr_mean_median_ratio,pa_mean_median_ratio,pcg_mean_median_ratio,pebit_mean_median_ratio,pacl_mean_median_ratio,evebit_mean_median_ratio,mrgebit_mean_median_ratio,mrgliq_mean_median_ratio,roic_mean_median_ratio,liqc_mean_median_ratio,divbpatr_mean_median_ratio
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,0.0308,16.6,1.169838,13.22,13.505,1.021558,3.690056,0.76194,0.0459,7.83,0.894857,7.5,7.005,0.934,1.872173,0.705211,0.090065,1.01,0.827869,0.99,0.965,0.974747,0.251456,0.704663,0.697686,0.0,0.0,0.0,0.0,0.0,0.0,,0.0643,1.92515,0.05205,0.050983,0.979507,0.010434,0.60608,9.425823,6.0,1.0,0.037529,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,-0.0,0.0,-0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,,7.652507,12.271654,0.649625,1.746972,2.689199,2.89939,4.224468,0.552037,0.58,0.966667,0.895,0.853333,0.953445,0.22142,0.597656,1.030442,3.173,1.599053,3.15175,2.953567,0.93712,0.726164,0.681989,0.214935,3.9482,1.463326,3.3913,3.485117,1.027664,0.591138,0.476655,0.120727,246444.2,1.216033,207967.5,216234.55,1.039752,18490.953894,0.207416,8.416365e-07,1587.2,0.958049,1607.35,1605.233333,0.998683,120.637053,0.215296,0.000136,0.76,2.235294,0.7,0.686667,0.980952,0.276164,1.165049,1.532959,0.29,0.337209,0.705,0.765,1.085106,0.426556,1.281046,4.417399,0.026481,0.971423,0.043723,0.040313,0.921998,0.011145,0.646194,24.402158,2023-01-04,ABCB4,2018-01-05,2023.0,0.189333,37755370.0,25030700000.0,-0.237636,0.023835,2018.0,2017-12-31,12.3,1.0,,,,,,,,,,,
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,0.0466,16.88,1.377959,14.425,13.953333,0.967302,3.944564,0.737458,0.043688,7.97,1.111576,7.5,6.875,0.916667,1.749854,0.689455,0.086506,0.98,1.010309,0.975,0.925,0.948718,0.219886,0.713514,0.728075,0.0,0.0,0.0,0.0,0.0,0.0,,0.0308,0.636364,0.05205,0.05055,0.971182,0.011326,0.66271,21.516565,6.0,1.0,0.049884,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,-0.0,0.0,-0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,,-0.595803,-0.88181,0.557011,1.54374,2.771471,3.033517,5.343069,-8.967848,0.54,0.635294,0.895,0.843333,0.942272,0.236023,0.652174,1.207729,3.6694,1.622193,3.1643,3.234417,1.022159,0.589279,0.536913,0.146322,4.2121,1.348951,3.6731,3.73745,1.017519,0.504836,0.330921,0.078564,283635.2,1.229071,221878.9,229730.016667,1.035385,31545.064657,0.357122,1.259088e-06,1430.8,0.796127,1549.2,1567.583333,1.011866,135.675737,0.233736,0.000163,0.01,0.020833,0.7,0.631667,0.902381,0.374402,1.788918,178.891821,0.32,0.581818,0.465,0.675,1.451613,0.45829,1.451852,4.537037,0.02462,0.612926,0.043723,0.039873,0.911935,0.011797,0.699997,28.432067,2023-01-04,ABCB4,2019-01-02,2023.0,0.17118,44592060.0,26339170000.0,-0.16669,0.037764,2019.0,2018-12-31,12.2,1.0,,,,,,,,,,,
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,0.0322,22.51,1.923932,16.74,15.663333,0.935683,5.110157,0.926367,0.041154,10.58,1.787162,7.9,7.443333,0.942194,2.324303,0.909539,0.085968,1.24,1.458824,0.995,0.97,0.974874,0.255656,0.721649,0.581975,0.0,0.0,0.0,0.0,0.0,0.0,,0.0466,0.836625,0.05205,0.05025,0.965418,0.011418,0.666667,14.306152,6.0,1.0,0.05435,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,-0.0,0.0,-0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,,-2.51288,-5.732415,0.35542,1.012317,2.848225,3.464657,10.041705,-3.996094,0.38,0.404255,0.76,0.765,1.006579,0.302109,0.928105,2.44238,5.2404,1.660667,3.4212,3.730817,1.090499,0.816841,0.560869,0.107028,5.7304,1.686404,4.08015,4.1721,1.022536,0.864229,0.562259,0.098119,233720.0,1.159362,223352.85,230221.333333,1.030752,31587.469661,0.35636,1.524729e-06,1624.7,0.99828,1549.2,1538.833333,0.993308,86.739441,0.127824,7.9e-05,0.48,0.631579,0.7,0.631667,0.902381,0.374402,1.788918,3.726913,0.21,0.169355,0.35,0.618333,1.766667,0.496283,1.714286,8.163265,0.014227,0.300916,0.03688,0.03555,0.963924,0.015756,1.077484,75.735117,2023-01-04,ABCB4,2020-01-22,2023.0,0.789757,53400050.0,27844000000.0,-0.054448,0.056827,2020.0,2019-12-31,11.4,1.0,,,,,,,,,,,
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,0.0581,15.36,1.92,16.74,16.273333,0.972123,4.748051,0.891643,0.05805,10.53,2.76378,8.26,8.211667,0.994149,2.476969,0.824437,0.078294,0.79,1.462963,0.995,0.96,0.964824,0.262374,0.729167,0.922996,0.0,0.0,0.0,0.0,0.0,0.0,,0.0322,0.665289,0.0475,0.046333,0.975439,0.013084,0.723022,22.454086,6.0,1.0,0.050287,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,-0.0,0.0,-0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,,-5.10366,-6.229778,-0.161663,0.088646,-0.548341,4.288963,143.899683,-28.195388,0.15,0.141509,0.56,0.633333,1.130952,0.374148,1.484211,9.894737,5.4608,1.365678,3.834,4.115017,1.073296,1.011183,0.562063,0.102927,6.5988,1.513729,4.2857,4.705567,1.097969,1.209676,0.683063,0.103513,237834.4,1.171891,235777.2,236261.466667,1.002054,28313.790287,0.341511,1.435919e-06,1394.9,0.923041,1481.4,1500.066667,1.012601,91.056634,0.153193,0.00011,2.58,2.263158,0.7,0.935,1.335714,0.88638,2.748663,1.065373,0.25,0.19685,0.305,0.453333,1.486339,0.40431,2.338235,9.352941,0.007469,0.142183,0.025551,0.028914,1.131661,0.018045,1.558457,208.656692,2023-01-04,ABCB4,2021-02-27,2023.0,2.566022,68117290.0,28303300000.0,-0.315341,0.067207,2021.0,2020-12-31,14.5,1.0,,,,,,,,,,,
0,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,0.0849,15.76,0.861673,16.74,17.566667,1.049383,2.626752,0.407021,0.025826,6.23,0.728655,8.26,8.615,1.042978,1.688381,0.504933,0.081049,0.76,0.633333,0.995,0.996667,1.001675,0.199867,0.481605,0.633691,0.0,0.0,0.0,0.0,0.0,0.0,,0.0581,1.043088,0.05115,0.04795,0.937439,0.01396,0.698644,12.024861,6.0,1.0,0.055972,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,-0.0,0.0,-0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,...,0.0,0.0,0.0,,-0.781665,-2.868732,-0.688734,-0.178171,0.258693,4.284216,-71.595258,91.59333,0.73,0.669725,0.56,0.578333,1.032738,0.319087,1.62536,2.226521,5.281,1.677626,4.4549,4.32875,0.971683,1.112145,0.534311,0.101176,5.9406,1.755185,4.97125,4.969117,0.999571,1.288816,0.646835,0.108884,240413.5,1.128778,239123.95,242505.5,1.014141,23160.158986,0.291332,1.211794e-06,1512.4,1.041885,1482.0,1500.266667,1.012326,91.087291,0.153173,0.000101,1.82,2.84375,0.7,1.048333,1.497619,0.958382,2.45151,1.346984,0.54,1.421053,0.305,0.331667,1.087432,0.117544,0.994975,1.842546,0.034749,0.721547,0.025551,0.025951,1.015668,0.014503,1.567965,45.122595,2023-01-04,ABCB4,2022-03-14,2023.0,2.501972,78879810.0,28832590000.0,-0.303976,0.077271,2022.0,2021-12-31,11.2,1.0,,,,,,,,,,,


In [47]:
base_dataset[[col for col in base_dataset.columns if 'dy' in col]]

Unnamed: 0,dy_label,dy,dy_rel,dy_median,dy_mean,dy_mean_median_ratio,dy_std,dy_spread,dy_spread_rel_now,dy_sequence,dy_sequence_bool,dy_forecast
0,0.0308,0.0643,1.925150,0.05205,0.050983,0.979507,0.010434,0.606080,9.425823,6.0,1.0,0.037529
0,0.0466,0.0308,0.636364,0.05205,0.050550,0.971182,0.011326,0.662710,21.516565,6.0,1.0,0.049884
0,0.0322,0.0466,0.836625,0.05205,0.050250,0.965418,0.011418,0.666667,14.306152,6.0,1.0,0.054350
0,0.0581,0.0322,0.665289,0.04750,0.046333,0.975439,0.013084,0.723022,22.454086,6.0,1.0,0.050287
0,0.0849,0.0581,1.043088,0.05115,0.047950,0.937439,0.013960,0.698644,12.024861,6.0,1.0,0.055972
...,...,...,...,...,...,...,...,...,...,...,...,...
0,0.0328,0.0169,16.900000,0.01605,0.018117,1.128764,0.013853,2.345906,138.811016,6.0,1.0,0.009960
0,0.0011,0.0000,0.000000,0.00000,0.000000,,0.000000,0.000000,,0.0,0.0,0.000000
0,0.0000,0.0000,0.000000,0.00000,0.000000,,0.000000,0.000000,,0.0,0.0,0.000000
0,0.0000,0.0000,0.000000,0.00000,0.000000,,0.000000,0.000000,,0.0,0.0,0.000000


In [48]:
columns_to_drop = ['cotacao','cotacao_median', 'cotacao_mean']
base_dataset.drop(columns_to_drop,axis=1, inplace=True)

In [49]:
def inpute_median_by_category(data,group_by_col):

    cols_with_nans = data.columns[data.isna().any()].tolist()

    for col in cols_with_nans:
       data[col] = data.groupby(group_by_col)[col].transform(lambda x: x.fillna(x.median()))

    return data


In [50]:
inpute_median_by_category(base_dataset,'Setor').isna().sum().sort_values(ascending=False)

divbpatr_mean_median_ratio    30
divbpatr_spread_rel_now       30
dy_mean_median_ratio          11
dy_spread_rel_now             10
mrgebit_rel                    1
                              ..
mrgebit_std                    0
mrgebit_spread                 0
mrgebit_spread_rel_now         0
mrgliq                         0
liq2m_rel                      0
Length: 235, dtype: int64

In [51]:
base_dataset = inpute_median_by_category(base_dataset,'Setor')
# base_dataset.fillna(np.median, inplace=True)
base_dataset.isna().sum().sort_values(ascending=False)[:20]



divbpatr_mean_median_ratio           30
divbpatr_spread_rel_now              30
dy_mean_median_ratio                 11
dy_spread_rel_now                    10
mrgebit_rel                           1
c5y_rel                               1
mrgliq_rel                            1
dolar_comercial_std                   0
preco_do_petroleo_std                 0
preco_do_petroleo_spread              0
preco_do_petroleo_spread_rel_now      0
dolar_comercial                       0
dolar_comercial_rel                   0
dolar_comercial_median                0
dolar_comercial_mean                  0
dolar_comercial_mean_median_ratio     0
dolar_comercial_spread                0
dolar_comercial_spread_rel_now        0
euro                                  0
euro_rel                              0
dtype: int64

In [52]:
# base_dataset = base_dataset.drop(columns_to_drop, axis=1)
base_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
base_dataset = base_dataset.dropna()
base_dataset.isna().sum().sort_values(ascending=False)[:20]

Papel                                  0
dolar_comercial_spread                 0
preco_do_petroleo_median               0
preco_do_petroleo_mean                 0
preco_do_petroleo_mean_median_ratio    0
preco_do_petroleo_std                  0
preco_do_petroleo_spread               0
preco_do_petroleo_spread_rel_now       0
dolar_comercial                        0
dolar_comercial_rel                    0
dolar_comercial_median                 0
dolar_comercial_mean                   0
dolar_comercial_mean_median_ratio      0
dolar_comercial_std                    0
dolar_comercial_spread_rel_now         0
ibovespa_std                           0
euro                                   0
euro_rel                               0
euro_median                            0
euro_mean                              0
dtype: int64

### Storing refined data into Google Cloud

In [53]:
client, bucket = initialize_bucket(credentials_path,'storage-barsianize')

# upload base dataset to Google Cloud Storage
blob = bucket.blob('refined/base_dataset.csv')
blob.upload_from_string(base_dataset.to_csv(), 'text/csv',)

> To Dos
* pegar primeiras 23 features e pensar em feature engineering
    * relativizar os valores absolutos (mrgliq, Lucro, patrliq, divbrut, ativo_circulante, receita_liquida, etc)
    * testar outras formas de scaling
    * spread relativo sempre ao now

