# Data Engineering

In [177]:
import io
import logging
import os
import time

import fundamentus
import numpy as np
import pandas as pd
import requests
import requests_cache
from google.cloud import storage

## 1 - Data Collection

For data gathering, two GitHub repositories were combined so that the historical data could be scraped from the web. This raw data is stored in the cloud (GCP) and is the main data source used for the project.

Repositories utilized:
* https://github.com/mv/fundamentus-api
* https://github.com/Victorcorcos/bovespa-winner

In [178]:
def perc_to_float(val):
    """
    Convert pt-BR percent strings to float fractions
      - '45,56%'    becomes 0.4556
      - '1.234,50%' becomes 12.345
    Input:
        val: pandas object (e.g. a DataFrame column) holding percent strings
    Output:
        object of the same shape with float values (percentage / 100)
    """

    # Drop the percent sign and the '.' thousands separators first, then turn
    # the decimal comma into a decimal point so the text parses as a float.
    cleaned = (
        val
        .replace(to_replace=r'[%]', value='', regex=True)
        .replace(to_replace=r'[.]', value='', regex=True)
        .replace(to_replace=r'[,]', value='.', regex=True)
    )

    return cleaned.astype(float) / 100

def _rename_cols(data):
    """
    Rename columns in DataFrame
      - use a valid Python identifier
      - so each column can be a DataFrame property
      - Example:
          df.pl > 0
    """

    df2 = pd.DataFrame()

    ## Fix: rename columns
    df2['cotacao'  ] = data['Cotação'          ]
    df2['pl'       ] = data['P/L'              ]
    df2['pvp'      ] = data['P/VP'             ]
    df2['psr'      ] = data['PSR'              ]
    df2['dy'       ] = data['Div.Yield'        ]
    df2['pa'       ] = data['P/Ativo'          ]
    df2['pcg'      ] = data['P/Cap.Giro'       ]
    df2['pebit'    ] = data['P/EBIT'           ]
    df2['pacl'     ] = data['P/Ativ Circ.Liq'  ]
    df2['evebit'   ] = data['EV/EBIT'          ]
    # df2['evebitda' ] = data['EV/EBITDA'        ]
    df2['mrgebit'  ] = data['Mrg Ebit'         ]
    df2['mrgliq'   ] = data['Mrg. Líq.'        ]
    df2['roic'     ] = data['ROIC'             ]
    df2['roe'      ] = data['ROE'              ]
    df2['liqc'     ] = data['Liq. Corr.'       ]
    df2['liq2m'    ] = data['Liq.2meses'       ]
    df2['patrliq'  ] = data['Patrim. Líq'      ]
    df2['divbpatr' ] = data['Dív.Brut/ Patrim.']
    df2['c5y'      ] = data['Cresc. Rec.5a'    ]

    return df2

def get_resultado_raw(url):
    """
    Get data from a fundamentus 'resultado' page:
      URL:
        given by the caller, e.g. http://fundamentus.com.br/resultado.php
        or an archived copy of that page
    RAW:
      DataFrame preserves original HTML header names
    Output:
      DataFrame built from the first HTML table of the page, indexed by
      'papel' (sorted), with the percent columns converted to fractions
    Raises:
      requests.HTTPError when the server answers with an error status
    """

    ##
    ## Advanced search by company
    ##
    # url = 'http://www.fundamentus.com.br/resultado.php'
    hdr = {'User-agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
           'Accept': 'text/html, text/plain, text/css, text/sgml, */*;q=0.01',
           'Accept-Encoding': 'gzip, deflate',
           }

    with requests_cache.enabled():
        content = requests.get(url, headers=hdr)

        if content.from_cache:
            logging.debug('.../resultado.php: [CACHED]')
        else: # pragma: no cover
            # a real request was made: pause before the caller issues the next one
            logging.debug('.../resultado.php: sleeping...')
            time.sleep(.500) # 500 ms

    ## fail fast on HTTP errors instead of trying to parse an error page
    content.raise_for_status()

    ## parse + load
    ## (read_html expects a file-like object; passing the markup as a plain
    ##  string is deprecated, hence the StringIO wrapper)
    df = pd.read_html(io.StringIO(content.text), decimal=",", thousands='.')[0]

    ## Fix: percent string
    percent_cols = ('Div.Yield', 'Mrg Ebit', 'Mrg. Líq.', 'ROIC', 'ROE', 'Cresc. Rec.5a')
    for col in percent_cols:
        df[col] = perc_to_float(df[col])

    ## index by 'Papel', instead of 'int', sorted by ticker
    df = df.set_index('Papel').sort_index()

    ## naming
    df.name = 'Fundamentus: HTML names'
    df.columns.name = 'Multiples'
    df.index.name = 'papel'

    ## return sorted by 'papel'
    return df


def get_resultado(url):
    """
    Data from fundamentus, fixing header names.
      URL:
        given from the user
      Obs:
        DataFrame uses short header names
    Output:
      DataFrame indexed by 'papel', without fully duplicated rows
    """

    ## get RAW data and rename the columns to their short names
    renamed = _rename_cols(get_resultado_raw(url))

    ## remove duplicates: rows whose values match in every column
    ## (the index is not part of the comparison); the first one is kept.
    ## An alternative left disabled here was to deduplicate on
    ## ['cotacao','pl','pvp'] keeping the last row.
    df = renamed.drop_duplicates(keep='first')

    ## metadata
    ## Set on the frame that is actually returned: drop_duplicates builds a
    ## new DataFrame, so a `name` attribute put on the intermediate frame
    ## would not reach the caller.
    df.name = 'Fundamentus: short names'
    df.columns.name = 'Multiples'
    df.index.name = 'papel'

    return df


In [179]:
# Years to collect, 2008 through 2023 inclusive (the upper bound of range is exclusive).
years = [*range(2008, 2024)]

### Get historical data

In [180]:
# One results page per year: web.archive.org snapshots for 2008-2022 and the
# live fundamentus page for 2023.
urls = {
    2008: 'https://web.archive.org/web/20080613050801/http://www.fundamentus.com.br/resultado.php',
    2009: 'https://web.archive.org/web/20090123022224/http://www.fundamentus.com.br/resultado.php',
    2010: 'https://web.archive.org/web/20100115191626/http://www.fundamentus.com.br/resultado.php',
    2011: 'https://web.archive.org/web/20110113192117/http://www.fundamentus.com.br/resultado.php',
    2012: 'https://web.archive.org/web/20120106023830/http://www.fundamentus.com.br/resultado.php',
    2013: 'https://web.archive.org/web/20130105004012/http://www.fundamentus.com.br/resultado.php',
    2014: 'https://web.archive.org/web/20140108164618/http://www.fundamentus.com.br/resultado.php',
    2015: 'https://web.archive.org/web/20150119231047/http://www.fundamentus.com.br/resultado.php',
    2016: 'https://web.archive.org/web/20160106101916/http://www.fundamentus.com.br/resultado.php',
    2017: 'https://web.archive.org/web/20170505164235/http://www.fundamentus.com.br/resultado.php',
    2018: 'https://web.archive.org/web/20180105120409/http://www.fundamentus.com.br/resultado.php',
    2019: 'https://web.archive.org/web/20190102202956/http://www.fundamentus.com.br/resultado.php',
    2020: 'https://web.archive.org/web/20200122200313/http://www.fundamentus.com.br/resultado.php',
    2021: 'https://web.archive.org/web/20210227034423/http://www.fundamentus.com.br/resultado.php',
    2022: 'https://web.archive.org/web/20220314021607/http://www.fundamentus.com.br/resultado.php',
    2023: 'http://fundamentus.com.br/resultado.php'
}

# Collect the yearly frames in a list and concatenate once at the end:
# calling pd.concat inside the loop re-copies every accumulated row on each
# iteration and starts from an empty placeholder frame.
yearly_frames = []
for year in years:
    # assign() returns a new frame, so the frame handed back by
    # get_resultado is tagged with its year without being mutated.
    df = get_resultado(urls[year]).assign(year=year)
    yearly_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# `years` is empty.
df_full = pd.concat(yearly_frames) if yearly_frames else pd.DataFrame()

# Turn the 'papel' index into a regular column.
df_full = df_full.reset_index()

### Get detailed information about each ticker

In [181]:
# Distinct tickers that appear in any of the yearly frames.
papeis = df_full['papel'].unique()

# Gather the per-ticker frames in a list and concatenate once at the end
# instead of re-copying the accumulated frame on every iteration.
papel_frames = []
for papel in papeis:
    try:
        df = fundamentus.get_papel(papel)
    except Exception:
        # Best effort: a ticker whose details cannot be fetched is reported
        # and skipped. Catching Exception rather than using a bare `except`
        # keeps KeyboardInterrupt / SystemExit able to stop this long loop.
        print(f'fail papel {papel}')
    else:
        papel_frames.append(df)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# ticker could be fetched.
df_papeis = pd.concat(papel_frames) if papel_frames else pd.DataFrame()

2023-01-04 22:02:00,062 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,126 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,190 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,210 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,275 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,344 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel ABNB3


2023-01-04 22:02:00,410 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,477 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,545 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,613 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,680 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,760 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,828 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,847 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,912 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:00,981 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel AGEN11


2023-01-04 22:02:01,047 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,109 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,173 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,237 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,301 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,364 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,440 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,508 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,569 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,634 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,696 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:01,760 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel BRTO3
fail papel BRTO4


2023-01-04 22:02:07,609 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,668 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,739 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,805 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,815 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,879 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,888 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:07,953 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:08,016 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:08,079 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:08,142 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:08,207 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel CMGR3
fail papel CMGR4


2023-01-04 22:02:11,187 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,252 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,316 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,379 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,444 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,509 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,580 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,648 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,714 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,779 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,845 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:11,910 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel CTAX3
fail papel CTAX4


2023-01-04 22:02:13,827 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:13,890 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:13,954 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,031 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,104 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,170 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,238 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,307 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,374 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,441 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,507 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:14,576 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel DROG3


2023-01-04 22:02:15,756 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:15,818 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:15,881 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:15,945 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,010 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,074 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,142 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,208 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,276 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,343 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,407 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,471 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel ECOD3


2023-01-04 22:02:16,812 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,876 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:16,946 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,010 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,076 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,145 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,215 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,280 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,343 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,406 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,479 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:17,549 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel GLOB3


2023-01-04 22:02:21,630 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,639 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,659 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,726 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,791 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,858 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel GPIV11


2023-01-04 22:02:21,922 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:21,989 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,055 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,119 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,180 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,244 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,307 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,369 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,435 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,499 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,565 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:22,629 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel INPR3
fail papel ITAU3
fail papel ITAU4


2023-01-04 22:02:24,198 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,266 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,334 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,344 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,409 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,472 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,536 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,603 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,666 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,731 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,796 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:24,858 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel MILK11


2023-01-04 22:02:27,899 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:27,964 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,032 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,094 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,157 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,226 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,292 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,359 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,424 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,488 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,511 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,581 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel MPXE3


2023-01-04 22:02:28,709 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,773 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,844 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,907 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:28,975 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,037 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,103 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,166 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,239 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,304 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,370 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:29,433 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel NUTR3M
fail papel OHLB3


2023-01-04 22:02:30,121 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,184 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,248 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,311 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,376 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,442 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,506 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,571 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,636 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,702 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,764 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:30,827 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel PRGA3


2023-01-04 22:02:32,863 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:32,927 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:32,994 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,058 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,121 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,184 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,247 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,311 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,375 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,442 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,509 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:33,572 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel SATI3


2023-01-04 22:02:36,462 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,525 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,588 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,653 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,715 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,778 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,847 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,910 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,976 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,986 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:36,996 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:37,061 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel TCSL3


2023-01-04 22:02:39,927 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:39,994 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,060 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,125 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,197 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,260 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,324 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,387 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,455 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,519 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,530 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:40,540 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel TLPP3
fail papel TLPP4


2023-01-04 22:02:40,992 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,064 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,138 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,205 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,275 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,341 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,406 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,474 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,542 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,616 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,683 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:41,747 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel TRNA11


2023-01-04 22:02:42,738 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:42,804 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:42,868 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:42,930 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:42,995 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,058 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,121 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,183 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,255 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,323 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,389 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:43,451 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02

fail papel BPAT11


2023-01-04 22:02:47,671 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:48,706 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:49,547 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:50,489 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel LLXL3


2023-01-04 22:02:51,544 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:52,563 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:53,432 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:54,297 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:55,153 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:56,067 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:56,922 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:57,934 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:58,981 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:02:59,809 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:00,643 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:01,518 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03

fail papel CZLT11


2023-01-04 22:03:04,034 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:04,837 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:05,668 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:06,543 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:07,556 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:08,425 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:09,273 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:10,283 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:11,102 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:12,171 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:13,007 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:13,879 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03

fail papel DAGB11


2023-01-04 22:03:23,588 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:24,648 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:25,492 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:26,277 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:27,130 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:27,961 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:28,959 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:29,797 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:30,665 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:31,502 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:32,548 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:33,404 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03

fail papel WSON11


2023-01-04 22:03:38,530 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel ABRE11


2023-01-04 22:03:39,531 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:40,378 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:41,211 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:42,059 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:42,919 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:43,961 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:44,969 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:45,773 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:46,820 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:47,676 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:48,657 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03:49,481 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:03

fail papel SNSL3M


2023-01-04 22:04:26,695 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:27,544 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:28,347 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:29,163 [detalhes.get_papel] INFO: detalhes: call: get..._papel()


fail papel CTAX11


2023-01-04 22:04:29,975 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:30,824 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:31,685 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:32,696 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:33,573 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:34,434 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:35,214 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:36,051 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:36,900 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:37,951 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:38,774 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:39,620 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04

fail papel OGSA3


2023-01-04 22:04:41,390 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:42,269 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:43,131 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:43,981 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:44,812 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:45,679 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:46,562 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:47,416 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:48,277 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:49,294 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:50,307 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04:51,169 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:04

fail papel LIQO3


2023-01-04 22:05:26,708 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:27,855 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:28,696 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:29,536 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:30,371 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:31,239 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:32,259 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:33,115 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:33,895 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:34,954 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:35,773 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05:36,619 [detalhes.get_papel] INFO: detalhes: call: get..._papel()
2023-01-04 22:05

In [182]:
# Show every row of df_full whose ticker column ('papel') equals ITUB4.
df_full.loc[df_full['papel'].eq('ITUB4')]

Multiples,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,...,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year
1885,ITUB4,37.91,19.86,3.55,0.0,0.0188,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1786,0.0,168656000.0,48861600000.0,0.0,0.1767,2010
2654,ITUB4,40.25,14.55,3.22,0.0,0.0246,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.221,0.0,215639000.0,57225100000.0,0.0,0.2734,2011
3440,ITUB4,34.66,13.27,2.09,0.0,0.0292,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1572,0.0,246331000.0,75916000000.0,0.0,-0.4082,2012
4236,ITUB4,35.51,14.42,1.93,0.0,0.0325,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1337,0.0,272830000.0,84160400000.0,0.0,-0.4905,2013
5051,ITUB4,31.27,14.83,1.8,0.0,0.0316,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1214,0.0,312727000.0,87334800000.0,0.0,-0.4519,2014
5870,ITUB4,33.1,12.07,1.88,0.0,0.0299,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1559,0.0,424769000.0,97269900000.0,0.0,-0.1391,2015
6689,ITUB4,25.34,6.96,1.43,0.0,0.0584,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.205,0.0,461788000.0,108020000000.0,0.0,0.2599,2016
7524,ITUB4,38.1,12.31,2.13,0.0,0.0468,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1733,0.0,429483000.0,117557000000.0,0.0,0.1019,2017
8372,ITUB4,45.1,13.7,2.35,0.0,0.0334,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1717,0.0,484131000.0,125575000000.0,0.0,0.0391,2018
9224,ITUB4,35.5,17.59,2.77,0.0,0.0595,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1576,0.0,707858000.0,125534000000.0,0.0,-0.1029,2019


In [183]:
# Preview the first rows of df_papeis (head() defaults to five rows).
df_papeis.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,Cotacao,Data_ult_cot,Min_52_sem,Max_52_sem,Vol_med_2m,...,Rec_Servicos_3m,Lucro_Liquido_3m,Disponibilidades,Ativo_Circulante,Div_Bruta,Div_Liquida,Receita_Liquida_12m,EBIT_12m,Receita_Liquida_3m,EBIT_3m
ABCB3,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,0.0,1899-12-30,0.0,0.0,0,...,118020000.0,218148000,,,,,,,,
ABCB4,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,17.91,2023-01-04,14.06,22.3,13880200,...,118020000.0,218148000,,,,,,,,
ABYA3,ABYA3,ON NM,ABYARA ON NM,Construção Civil,Incorporações,4.91,2010-02-11,0.0,0.0,0,...,,-766000,29556000.0,498222000.0,383159000.0,353603000.0,250657000.0,25802000.0,67990000.0,18057000.0
ACES3,ACES3,ON,ARCELORMITTAL INOX BRASIL ON,Siderurgia e Metalurgia,Siderurgia,95.27,2008-04-18,0.0,0.0,0,...,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0
ACES4,ACES4,PN,ARCELORMITTAL INOX BRASIL PN,Siderurgia e Metalurgia,Siderurgia,94.73,2008-04-24,0.0,0.0,0,...,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0


In [184]:
# Preview the first rows of df_full (head() defaults to five rows).
df_full.head()

Multiples,papel,cotacao,pl,pvp,psr,dy,pa,pcg,pebit,pacl,...,mrgebit,mrgliq,roic,roe,liqc,liq2m,patrliq,divbpatr,c5y,year
0,ABCB3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1107010000.0,0.0,0.0,2008
1,ABCB4,9.0,0.0,1.1,0.0,0.0489,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4160460.0,1107010000.0,0.0,0.0,2008
2,ABNB3,17.35,12.39,3.27,1.845,0.0254,2.59,7.26,8.45,5.04,...,0.2182,0.149,0.3465,0.2639,3.28,2586090.0,273349000.0,0.0,0.1446,2008
3,ABYA3,16.0,25.67,4.13,4.399,0.0079,0.57,2.1,17.45,2.51,...,0.252,0.1714,0.0356,0.1608,1.94,5915230.0,197161000.0,2.4,0.0,2008
4,ACES3,95.27,8.79,2.28,1.693,0.0,1.438,6.4,7.2,3.77,...,0.2352,0.1927,0.269,0.2595,2.05,181013.0,3105800000.0,0.09,0.1334,2008


### Storing the raw data into Google Cloud

In [185]:
# Tell the Google Cloud client libraries which service-account key file to use.
# setdefault() keeps a credentials path that is already exported in the
# environment (e.g. on another machine or in CI) instead of overwriting it;
# the local key file is only used as a fallback when the variable is unset.
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    'datascience-capstone-project-05b1642f45c3.json',
)

In [186]:
# Open a handle to the project's bucket in Google Cloud Storage.
client = storage.Client()
bucket = client.bucket('storage-barsianize')
# Bucket-creation calls, left disabled:
# bucket.location = 'US-EAST1'
# bucket.create()

# Upload each raw table as a CSV object under the bucket's raw/ prefix:
# df_full (multiples per ticker and year) and df_papeis (per-ticker details).
for blob_name, frame in (
    ('raw/df_full.csv', df_full),
    ('raw/df_papeis.csv', df_papeis),
):
    blob = bucket.blob(blob_name)
    blob.upload_from_string(frame.to_csv(), 'text/csv')

# Disabled upload of the categories table:
# blob = bucket.blob('raw/categories.csv')
# blob.upload_from_string(categories.to_csv(), 'text/csv')

## 2 - Data Cleansing

### Load data

In [187]:
# Read both raw CSV files back from the bucket; the first CSV column is used
# as the index of each frame.
raw_frames = []
for path in (
    "gs://storage-barsianize/raw/df_full.csv",
    "gs://storage-barsianize/raw/df_papeis.csv",
):
    raw_frames.append(pd.read_csv(path, index_col=0))
df_full, df_papeis = raw_frames

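Desired information about each ticker: from the `df_papeis` table previewed below, only a few descriptive columns are kept in the next step.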

In [188]:
# Preview the first rows of the reloaded df_papeis (head() defaults to five rows).
df_papeis.head()

Unnamed: 0,Papel,Tipo,Empresa,Setor,Subsetor,Cotacao,Data_ult_cot,Min_52_sem,Max_52_sem,Vol_med_2m,...,Rec_Servicos_3m,Lucro_Liquido_3m,Disponibilidades,Ativo_Circulante,Div_Bruta,Div_Liquida,Receita_Liquida_12m,EBIT_12m,Receita_Liquida_3m,EBIT_3m
ABCB3,ABCB3,ON N2,ABC Brasil ON N2,Intermediários Financeiros,Bancos,0.0,1899-12-30,0.0,0.0,0,...,118020000.0,218148000,,,,,,,,
ABCB4,ABCB4,PN N2,ABC Brasil PN N2,Intermediários Financeiros,Bancos,17.91,2023-01-04,14.06,22.3,13880200,...,118020000.0,218148000,,,,,,,,
ABYA3,ABYA3,ON NM,ABYARA ON NM,Construção Civil,Incorporações,4.91,2010-02-11,0.0,0.0,0,...,,-766000,29556000.0,498222000.0,383159000.0,353603000.0,250657000.0,25802000.0,67990000.0,18057000.0
ACES3,ACES3,ON,ARCELORMITTAL INOX BRASIL ON,Siderurgia e Metalurgia,Siderurgia,95.27,2008-04-18,0.0,0.0,0,...,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0
ACES4,ACES4,PN,ARCELORMITTAL INOX BRASIL PN,Siderurgia e Metalurgia,Siderurgia,94.73,2008-04-24,0.0,0.0,0,...,,163789000,922471000.0,2157470000.0,279166000.0,-643305000.0,4182420000.0,983634000.0,949114000.0,174503000.0


In [189]:
# Descriptive columns to keep for each ticker.
info_papeis = ['Papel', 'Tipo', 'Empresa', 'Setor', 'Subsetor', 'Data_ult_cot']

# Keep only those columns, then drop rows with any missing value and
# rows that are exact duplicates of an earlier row.
df_papeis_clean = (
    df_papeis[info_papeis]
    .dropna()
    .drop_duplicates()
)

In [218]:
# Left join: every row of df_papeis_clean is kept, and its index is matched
# against the 'papel' column of df_full. Exact duplicate rows of the merged
# result are dropped afterwards.
df_completed = pd.merge(
    df_papeis_clean,
    df_full,
    how='left',
    left_index=True,
    right_on='papel',
).drop_duplicates()

In [219]:
# Dimensions of df_full as a (rows, columns) tuple.
df_full.shape

(13339, 21)

In [220]:
# Fraction of missing values in each column of df_completed, largest first.
(
    df_completed
    .isna()
    .mean()
    .sort_values(ascending=False)
)

Papel           0.0
pebit           0.0
c5y             0.0
divbpatr        0.0
patrliq         0.0
liq2m           0.0
liqc            0.0
roe             0.0
roic            0.0
mrgliq          0.0
mrgebit         0.0
evebit          0.0
pacl            0.0
pcg             0.0
Tipo            0.0
pa              0.0
dy              0.0
psr             0.0
pvp             0.0
pl              0.0
cotacao         0.0
papel           0.0
Data_ult_cot    0.0
Subsetor        0.0
Setor           0.0
Empresa         0.0
year            0.0
dtype: float64

In [221]:
# Open a handle to the project's bucket in Google Cloud Storage.
client = storage.Client()
bucket = client.bucket('storage-barsianize')
# Bucket-creation calls, left disabled:
# bucket.location = 'US-EAST1'
# bucket.create()

# Upload the merged table (df_completed) as a CSV object under the
# bucket's trusted/ prefix.
blob = bucket.blob('trusted/df_completed.csv')
blob.upload_from_string(df_completed.to_csv(), content_type='text/csv')