In [1]:
import re
import nltk
import requests
import warnings
import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime

### Aux Functions 

In [24]:
def fix_wrong_range_var(df1, range_var, range_value, text='texto'):
    for _, row in df1.iterrows():
        if len(row[range_var]) > range_value:
            row[text] = row[range_var][range_value:].strip()
            row[range_var] = row[range_var][:range_value].strip()

    return df1

def check_len_row(row, aux_list, append_var):
    if len(row) > 2:
        aux_list.append(
            {append_var: row[0], 'texto': ' '.join(row[1:])}
        )

    else:
        aux_list.append(
            {append_var: row[0], 'texto': row[1]}
        )

    return None

def get_titles_from_camara():
    hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
    url = 'https://www2.camara.leg.br/legin/fed/consti/1988/constituicao-1988-5-outubro-1988-322142-publicacaooriginal-1-pl.html'

    pg = requests.get(url=url, headers=hdr)
    soup = BeautifulSoup(pg.text, 'html.parser')

    txt = soup.find('div', class_='textoNorma').get_text().replace('rt.', 'rt').replace('º', '.').replace('arts.', 'arts ')

    pre_txt = [k.strip() for k in nltk.tokenize.sent_tokenize(
        txt, language='portuguese'
    )]

    pre_txt = [list(filter(None, r.split('\xa0'))) for r in pre_txt]

    titles_dict = [{art[1].replace('Art', 'Art.'): ' '.join(art[0].split('\n')[:2]).strip()} for art in pre_txt if art[0].startswith('TÍTULO')]
    titles_dict = {k:v for i in titles_dict for k, v in i.items()}
    
    return titles_dict

# nltk.download('punkt')
# nltk.download('maxent_ne_chunker')
# warnings.filterwarnings('ignore')

# 1.0. CF88 Data Collect

## Data Collect

In [23]:
soup = BeautifulSoup(open('page/Constituição.html').read(), 'html.parser')

In [24]:
source = ' '.join([k.get_text().strip() for k in soup.find_all('big')][1:])

In [27]:
r_list = ['I','II','III','IV','V','VI','VII','VIII','IX','X','XI','XII','XIII','XIV','XV','XVI','XVII','XVIII','XIX','XX','XXI','XXII','XXIII','XXIV','XXV','XXVI','XXVII','XXVIII','XXIX','XXX','XXXI','XXXII','XXXIII','XXXIV','XXXV','XXXVI','XXXVII','XXXVIII','XXXIX','XL','XLI','XLII','XLIII','XLIV','XLV','XLVI','XLVII','XLVIII','XLIX','L','LI',
          'LII','LIII','LIV','LV','LVI','LVII','LVIII','LIX','LX','LXI','LXII','LXIII','LXIV','LXV','LXVI','LXVII','LXVIII','LXIX','LXX','LXXI','LXXII','LXXIII','LXXIV','LXXV','LXXVI','LXXVII','LXXVIII','LXXIX','LXXX','LXXXI','LXXXII','LXXXIII','LXXXIV','LXXXV','LXXXVI','LXXXVII','LXXXVIII','LXXXIX','XC','XCI','XCII','XCIII','XCIV','XCV','XCVI','XCVII','XCVIII','XCIX','C']

alineas_list = [
    'a)', 'b)', 'c)', 'd)', 'e)', 'f)', 'g)', 'h)', 'i)', 'j)', 'k)', 'l)', 'm)',
    'n)', 'o)', 'p)', 'q)', 'r)', 's)', 't)', 'u)', 'v)', 'w)', 'x)', 'y)', 'z)'
]

df_ref = pd.DataFrame(columns=['artigo', 'texto', 'paragrafo_unico', 'inciso', 'alinea', 'paragrafo'])

In [26]:
full_html_page_list = soup.find_all('div', attrs={'id': 'art'})[:-1]

full_discarted_list = []
for page in full_html_page_list:
    aux_list = []
    discarted_list = []
    
    #txt = soup.find_all('div', attrs={'id': 'art'})[-2].get_text().replace('rt.', 'rt').replace('arts.', 'arts ').replace('\xa0', '\n').replace('\n\t', '')
    txt = page.get_text().replace('rt.', 'rt').replace('arts.', 'arts ').replace('\xa0', '\n').replace('\n\t', '')
    
    for j in range(1, 120):
        txt = txt.replace(f'{j}º', f'{j}.')

    pre_txt = [k.strip() for k in nltk.tokenize.line_tokenize(txt)]
    pre_list = [k.replace(' - ', '. ').replace(')', ').').split('.') for k in pre_txt]
    pre_list = [list(filter(None, row)) for row in pre_list]

    for i in range(0, len(pre_list)):
        row = pre_list[i]

        if row:  
            if row[0].startswith('Art') and len(row) > 1:
                check_len_row(row, aux_list, append_var='artigo')

            elif row[0].startswith('§') and len(row) > 1:
                check_len_row(row, aux_list, append_var='paragrafo')

            elif row[0].startswith('Parágrafo único') and len(row) > 1:
                check_len_row(row, aux_list, append_var='paragrafo_unico')

            elif row[0] in alineas_list and len(row) > 1:
                check_len_row(row, aux_list, append_var='alinea')

            elif row[0] in r_list and len(row) > 1:
                check_len_row(row, aux_list, append_var='inciso')

            else: discarted_list.append(row)
    
    full_discarted_list.append(discarted_list)
    df_arts = pd.DataFrame(aux_list).reset_index(drop=True)
    
    df_ref = pd.concat([df_ref, df_arts], axis=0)

## Data Cleaning

In [27]:
df_ref

Unnamed: 0,artigo,texto,paragrafo_unico,inciso,alinea,paragrafo
0,Art 1,"A República Federativa do Brasil, formada pel...",,,,
1,,a soberania;,,I,,
2,,a cidadania;,,II,,
3,,a dignidade da pessoa humana;,,III,,
4,,os valores sociais do trabalho e da livre ini...,,IV,,
...,...,...,...,...,...,...
605,,as contribuições parceladas devidas ao \tRegi...,,II,,
606,,as contribuições parceladas devidas ao \tresp...,,III,,
607,Art 118,"Os \tlimites, as condições, as normas de aces...",,,,
608,Art 119,Em decorrência do estado de calamidade públic...,,,,


In [28]:
df1 = df_ref.reset_index(drop=True)

### Arts Cleaning

In [29]:
df1.artigo = df1.artigo.fillna(method='ffill')
df1.artigo = df1.artigo.apply(lambda x: x.replace('\t', '').replace(' ', '. ')+'.' )

### Unique Parag, Inciso and Alinea Cleaning

In [30]:
df1.inciso = df1.inciso.fillna('na')
df1.alinea = df1.alinea.fillna('na')
df1.paragrafo_unico = df1.paragrafo_unico.apply(lambda x: 'pu' if not pd.isna(x) else 'na')

df1.head(2)

Unnamed: 0,artigo,texto,paragrafo_unico,inciso,alinea,paragrafo
0,Art. 1.,"A República Federativa do Brasil, formada pel...",na,na,na,
1,Art. 1.,a soberania;,na,I,na,


### Parag Cleaning

In [31]:
df1.paragrafo = df1.paragrafo.fillna('na')                        # Special Case
df1.paragrafo = df1.paragrafo.apply(lambda x: x.replace('\t', '').replace('° do art 22 da Lei n° 8', ''))

### Arts Cleaning

In [32]:
df1.texto = [i.replace(':', '')+'.' if i.endswith(':') else i for i in [r.replace('\t', '').strip().capitalize() for r in df1.texto.tolist()]]

In [33]:
df1 = df1.drop_duplicates()

## Data Verify

In [101]:
df2 = df1.copy()

In [102]:
# Fix Paragrafos without " dot " before parag number 
df2 = fix_wrong_range_var(df2, 'paragrafo', 5)

### Add Titles

In [103]:
df2['titulo'] = np.nan

In [None]:
titles_dict = get_titles_from_camara()

In [107]:
for i, row in df2.iterrows():
    if row['artigo'] in list(titles_dict.keys()):
        df2.loc[df2.index == i, 'titulo'] = titles_dict[row['artigo']]

### Ordering Columns

In [112]:
df2['sigla'] = 'CF88'
df2['link']  = 'https://www.planalto.gov.br/ccivil_03/constituicao/constituicao.htm'
df2['scrapy_datetime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df2['process_datetime'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

In [121]:
df2 = df2[['sigla', 'titulo', 'texto', 'artigo', 'inciso', 'alinea', 'paragrafo', 'paragrafo_unico', 
           'link', 'scrapy_datetime', 'process_datetime']]

### Save to CSV

In [126]:
df2.to_csv(f"cf88_scrapy_{datetime.now().strftime('%Y-%m-%d')}.csv", index=False)

## Streamlit App Data Transform

In [1]:
import re
import pandas as pd 

df = pd.read_csv('cf88_scrapy_2022-08-27.csv')
df = df.drop_duplicates().reset_index(drop=True)

Pattern: CF88 - Art. 1. § 9 a) III PU

In [2]:
df.texto  = df.texto.apply(lambda x: x+'.' if not x.endswith(';') else x)
df.texto  = df.texto.apply(lambda x: x[:-1] if x[-2] == '.' else x)

In [3]:
# Manual Title Implace
df.titulo = df.titulo.apply(lambda x: ' '.join(x.split()) if not pd.isna(x) else x)
df.loc[2810:, 'titulo'] = 'ATO DAS DISPOSIÇÕES CONSTITUCIONAIS TRANSITÓRIAS'
df.titulo = df.titulo.fillna(method='ffill')

In [4]:
df['padrao'] = df.apply(lambda x: (x['sigla'] + ' - ' + x['artigo'] + ' ' + 
                                   x['paragrafo'].replace('na', '') + ' ' + 
                                   x['alinea'].replace('na', '') + ' ' + 
                                   x['inciso'].replace('na', '') + ' ' + 
                                   x['paragrafo_unico'].replace('na', '').upper()).strip(), axis=1)

df['padrao'] = df['padrao'].apply(lambda x: ' '.join(x.split()))

In [14]:
df[['titulo', 'padrao', 'texto', 'link']].to_csv('cf88_elastic.csv', index=False)