In [4]:
import warnings
from datetime import datetime

import nltk
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Aux Functions 

In [129]:
# Download the NLTK resources requested by this notebook, then silence every
# Python warning for the rest of the session.
for nltk_resource in ('punkt', 'maxent_ne_chunker'):
    nltk.download(nltk_resource)
warnings.filterwarnings(action='ignore')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T-Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\T-Gamer\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


# 1.0. CF88 Data Collect

## Data Collect

In [2]:
# Request headers: a desktop-browser User-Agent string is sent with the request.
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
url = 'https://www2.camara.leg.br/legin/fed/consti/1988/constituicao-1988-5-outubro-1988-322142-publicacaooriginal-1-pl.html'

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops here on an HTTP error instead of parsing an error page.
pg = requests.get(url=url, headers=hdr, timeout=30)
pg.raise_for_status()
soup = BeautifulSoup(pg.text, 'html.parser')

In [5]:
# Text of the page <title> and of the first <p class="preambulo"> element.
# Both lookups raise AttributeError if the element is missing from the page.
source = soup.title.get_text()
preambulo = soup.find('p', class_='preambulo').get_text()

In [6]:
# Pre-Processing
# Take the text of <div class="textoNorma"> and apply three literal
# substitutions, in this order, before sentence tokenization.
txt = soup.find('div', class_='textoNorma').get_text()
for old_piece, new_piece in (('rt.', 'rt'), ('º', '.'), ('arts.', 'arts ')):
    txt = txt.replace(old_piece, new_piece)

# One list per sentence: the stripped sentence is cut at non-breaking spaces
# and empty pieces are discarded.
pre_txt = []
for sentence in nltk.tokenize.sent_tokenize(txt, language='portuguese'):
    pre_txt.append([piece for piece in sentence.strip().split('\xa0') if piece])

In [44]:
# Fix Titles
# Normalise heading entries of pre_txt:
#   * 'TÍTULO…'  -> the first two lines of the heading, followed by the last piece
#   * 'CAP…'     -> only the last piece
#   * otherwise  -> the entry unchanged
aux = []
for r in pre_txt:
    if not r:
        # A sentence that produced no pieces has no first element to inspect.
        continue

    if r[0].startswith('TÍTULO'):
        aux.append(r[0].split('\n')[:2])
        # With a single piece there is no separate trailing piece to keep
        # (the previous r[:-1][0] raised IndexError in that case).
        if len(r) > 1:
            aux.append(r[-1:])
    elif r[0].startswith('CAP'):
        aux.append([r[-1]])
    else:
        aux.append(r)

In [47]:
# Create Pandas DataFrame.
# Each recognised entry becomes one record with a single key. Article and
# paragraph markers are joined with the entry that follows them; titles are
# joined on their own. The first element of `aux` is skipped.
entries = aux[1:]  # sliced once instead of on every iteration

a = []
for i, r in enumerate(entries):
    if not r:
        continue

    # The last entry has no successor; the old entries[i + 1] lookup raised IndexError there.
    following = entries[i + 1] if i + 1 < len(entries) else []

    if r[0].startswith('Art'):
        a.append({'artigo': ' '.join(r + following)})

    elif r[0].startswith('Parágrafo único'):
        a.append({'paragrafo_unico': ' '.join(r + following)})

    elif r[0].startswith('§'):
        a.append({'paragrafo': ' '.join(r + following)})

    elif r[0].startswith('TÍTULO'):
        a.append({'titulo': ' '.join(r)})

# Build the frame once; this also defines `df` when no record was produced.
df = pd.DataFrame(a)

## Data Cleaning

In [48]:
# Alínea markers 'a)' through 'z)'.
alineas_list = [chr(code) + ')' for code in range(ord('a'), ord('z') + 1)]

# Record template: every field starts as an empty string.
base_dict = dict.fromkeys(
    (
        'sigla', 'livro', 'titulo',
        'capitulo', 'sessao', 'subsessao',
        'artigo', 'paragrafo', 'incisos',
        'alineas', 'lei', 'scrapy_datetime', 'link',
    ),
    '',
)

In [49]:
# Replace every ':' with ';' in the article and paragraph texts; missing values
# are passed through untouched.
for text_column in ('artigo', 'paragrafo'):
    df[text_column] = df[text_column].apply(
        lambda text: text if pd.isna(text) else text.replace(':', ';')
    )

In [50]:
# Preview the first two rows of the raw frame.
df.head(2)

Unnamed: 0,titulo,artigo,paragrafo_unico,paragrafo
0,TÍTULO I DOS PRINCÍPIOS FUNDAMENTAIS,,,
1,,"Art 1. A República Federativa do Brasil, forma...",,


### Arts Cleaning 

In [52]:
# Split every article text on ';' into one column per piece (expand=True);
# rows with fewer pieces are padded with missing values.
df1 = df.artigo.str.split(';', expand=True)

# Empty frame that fixes the column order for the article rows collected next.
df_arts = pd.DataFrame(columns=['body', 'artigo', 'incisos'])

In [53]:
# Turn every split article (one row of df1) into rows of df_arts:
#   * one row for the caput: 'artigo' is the text before the first '.',
#     'body' is the text between the first and the second '.'
#   * one row per remaining piece, split on ' - ' into 'incisos' and 'body'
# The per-article frames are collected and concatenated once, instead of
# growing df_arts inside the loop and transposing df1 on every iteration.
article_frames = []
for row_label, split_row in df1.iterrows():
    pieces = split_row.dropna().tolist()
    if not pieces:
        continue

    caput, inciso_pieces = pieces[0], pieces[1:]

    # partition() tolerates a caput without any '.', where split('.')[1] raised IndexError.
    art, first_dot, after_number = caput.partition('.')
    df_art = pd.DataFrame({"body": after_number.split('.')[0], "artigo": art}, index=[0])
    article_frames.append(df_art)

    if inciso_pieces:
        df_i = pd.DataFrame(
            [piece.strip().split(' - ') for piece in inciso_pieces]
        ).rename(columns={0: "incisos", 1: "body"})
        df_i['artigo'] = art
        article_frames.append(df_i)

# Start from a fresh empty template so that re-running this cell does not
# append the articles a second time.
df_arts = pd.concat(
    [pd.DataFrame(columns=['body', 'artigo', 'incisos']), *article_frames],
    axis=0,
)

# Caput rows have no inciso; they are marked with 0.
df_arts['incisos'] = df_arts['incisos'].fillna(0)

df_arts = df_arts.reset_index(drop=True)

In [54]:
# Rows whose 'incisos' text starts with a lowercase ASCII letter (the closing
# parenthesis is optional in this pattern) are treated as alíneas.
alineas_index = df_arts['incisos'].str.extract(r"(^[a-z]\)?)").dropna()[0].index
is_alinea = df_arts.index.isin(alineas_index)

alineas = df_arts.loc[is_alinea, "incisos"]

# Body of an alínea: its text without the first five characters, capitalised.
# NOTE(review): the offset 5 is a fixed assumption about the marker's width — confirm against the data.
alineas_body = pd.Series({k: v[5:].capitalize() for k, v in zip(alineas_index, alineas.to_list())})
alineas_body.name = "body"

# The marker itself ('a)', 'b)', ...); here the parenthesis is required.
alineas_simbol = alineas.str.extract(r"([a-z]\))")[0]
alineas_simbol.name = "alineas"

df_arts = pd.concat([df_arts, alineas_simbol], axis=1)

# Fill the missing bodies of the alínea rows.
df_arts['body'] = df_arts['body'].combine_first(alineas_body)

# An alínea row takes the inciso of the closest row above it.
df_arts.loc[is_alinea, "incisos"] = np.nan
df_arts['incisos'] = df_arts['incisos'].ffill()

In [55]:
# Preview the first three article rows.
df_arts.head(3)

Unnamed: 0,body,artigo,incisos,alineas
0,"A República Federativa do Brasil, formada pel...",Art 1,0,
1,a soberania,Art 1,I,
2,a cidadania,Art 1,II,


In [56]:
# Use the string "na" as the placeholder for rows without an inciso (marked 0
# so far) and for rows without an alínea (missing so far).
df_arts['incisos'] = df_arts['incisos'].replace(0, "na")
df_arts['alineas'] = df_arts['alineas'].fillna("na")

In [57]:
# One-row frame built from the empty record template.
# NOTE(review): df_ref does not appear to be used by any cell shown in this notebook.
df_ref = pd.DataFrame(base_dict, index=[0])

### Title Cleaning

In [58]:
# Work on a copy of the article rows and add an empty 'titulo' column as the last column.
df2 = df_arts.copy()
df2['titulo'] = np.nan

# Title and article columns of the raw frame, with missing values replaced by
# the string "NaN" so that the .str methods used next can run on every row.
aux = df[['titulo', 'artigo']].fillna("NaN")

In [59]:
# Labels of the rows that hold a title (anything that is not the "NaN" filler).
title_index = aux[~aux['titulo'].str.contains('NaN')]['titulo'].index

# Keep each title row and the row right after it, then restore real missing values.
title_arts = aux.iloc[(aux.titulo.index.isin(title_index)|(aux.artigo.index.isin(title_index+1))), :].replace("NaN", np.nan)

# Put the titles and the articles side by side, paired by position.
title_arts = pd.concat([title_arts['titulo'].dropna().reset_index(drop=True), title_arts['artigo'].dropna().reset_index(drop=True)], axis=1)

# Reduce the article text to its key ("Art 1"). The character class is spelled
# out because [A-z] also matches the punctuation between 'Z' and 'a'.
title_arts['artigo'] = title_arts['artigo'].str.extract(r'(^[A-Za-z]+\s\d+)')[0].tolist()

In [60]:
# Search title location index on DF2
# First row of every article in df2, restricted to the articles that open a title.
aux = df2.artigo.drop_duplicates()
title_index = aux[aux.isin(title_arts['artigo'])]

# Write the titles, in order, into the last column ('titulo') of those rows.
# The labels are used as positions, which relies on df2 having a default 0..n-1 index.
df2.iloc[title_index.index, -1] = title_arts['titulo'].to_list()

# Every other row takes the title of the closest row above it.
df2['titulo'] = df2['titulo'].ffill()

In [63]:
# Dataset Preparation
df2 = df2.rename(columns={"body":"lei"})

# Trim and capitalise the legal text; the .str methods leave missing values as
# they are, where the previous lambda raised on them.
df2['lei'] = df2['lei'].str.strip().str.capitalize()

df2['sigla'] = 'CF88'
df2['link']  = 'https://www.planalto.gov.br/ccivil_03/constituicao/constituicaocompilado.htm'

# Take the timestamp once so both columns always carry the same value.
run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
df2['scrapy_datetime'] = run_timestamp
df2['process_datetime'] = run_timestamp

# .copy() makes df2 an independent frame, so later column assignments do not
# write into a slice of the previous frame.
df2 = df2[['sigla', 'titulo', 'artigo', 'incisos', 'alineas', 'lei', 'link', 'scrapy_datetime', 'process_datetime']].copy()

In [64]:
# Tokenise every title and insert '-' after its first two tokens,
# e.g. "TÍTULO I DOS ..." becomes "TÍTULO I - DOS ...".
token_titles = [nltk.tokenize.word_tokenize(k) for k in df2['titulo'].to_list()]
df2['titulo'] = [' '.join(k[:2] + ['-'] + k[2:]) for k in token_titles]

In [67]:
# Write the prepared table, including its index, to a CSV file; the path is
# relative to the directory the notebook runs in.
df2.to_csv("../data/arts_update2.csv", index=True)

### Paragraph Cleaning

In [456]:
# Article and "parágrafo único" columns of the raw frame, with missing values
# replaced by the string "NaN".
aux = df[['artigo', 'paragrafo_unico']].fillna("NaN")

In [465]:
# Labels of the rows that hold a "parágrafo único" (anything that is not the "NaN" filler).
para_index = aux[~aux['paragrafo_unico'].str.contains('NaN')]['paragrafo_unico'].index

# Keep each paragraph row and the row right after it, then restore real missing values.
# NOTE(review): para_index + 1 selects the row that FOLLOWS each paragraph. If a
# paragraph belongs to the article that precedes it, this pairs it with the wrong
# article — confirm the intended neighbour before relying on para_arts.
para_arts = aux.iloc[(aux.paragrafo_unico.index.isin(para_index)|(aux.artigo.index.isin(para_index+1))), :].replace("NaN", np.nan)

# Put the paragraphs and the articles side by side, paired by position.
para_arts = pd.concat([para_arts['paragrafo_unico'].dropna().reset_index(drop=True), para_arts['artigo'].dropna().reset_index(drop=True)], axis=1)

# Reduce the article text to its key ("Art 1"). The character class is spelled
# out because [A-z] also matches the punctuation between 'Z' and 'a'.
para_arts['artigo'] = para_arts['artigo'].str.extract(r'(^[A-Za-z]+\s\d+)')[0].tolist()

# 2.0. Solution Class

## 2.1. First Class 

In [1]:
import nltk
import requests
import warnings
import pandas as pd

from bs4 import BeautifulSoup
from datetime import datetime

# Alínea markers 'a)' through 'z)'.
ALINEAS_LIST = [chr(code) + ')' for code in range(ord('a'), ord('z') + 1)]

# Record template: every field starts as an empty string.
BASE_DICT = dict.fromkeys(
    (
        'sigla', 'livro', 'titulo',
        'capitulo', 'sessao', 'subsessao',
        'artigo', 'paragrafo', 'incisos',
        'alineas', 'lei', 'scrapy_datetime', 'link',
    ),
    '',
)


class DataCollectCF():
    '''
        Collect the text of the 1988 Federal Constitution (CF88) from the
        configured URL and reshape it into one row per article, inciso or
        alínea.

        Intended call order (each step consumes the previous step's output):
            page_collect -> pre_processing -> fix_titles -> create_dataframe ->
            pre_cleaning -> arts_cleaning -> alineas_cleaning ->
            title_cleaning -> data_preparation
    '''

    def __init__(self):
        # Headers and address used by page_collect().
        self.hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        self.url = 'https://www2.camara.leg.br/legin/fed/consti/1988/constituicao-1988-5-outubro-1988-322142-publicacaooriginal-1-pl.html'

    def page_collect(self):
        '''
            Download and parse the page at self.url.

            Returns a dict with the parsed document ('soup'), the text of the
            page title ('source') and the text of the <p class="preambulo">
            element ('preambulo').

            Raises requests.HTTPError on an HTTP error status and a requests
            timeout error when the server does not answer within 30 seconds.
        '''
        pg = requests.get(url=self.url, headers=self.hdr, timeout=30)
        # Stop on an error response instead of parsing an error page.
        pg.raise_for_status()
        soup = BeautifulSoup(pg.text, 'html.parser')

        source = soup.title.get_text()
        preambulo = soup.find('p', class_='preambulo').get_text()

        return {'soup': soup, 'source': source, 'preambulo': preambulo}

    def pre_processing(self, soup):
        '''
            Extract the text of <div class="textoNorma"> and split it into
            sentences.

            Returns a list with one entry per sentence; each entry is the list
            of non-empty pieces obtained by cutting the stripped sentence at
            non-breaking spaces.
        '''
        txt = soup.find('div', class_='textoNorma').get_text()

        # Literal substitutions applied, in this order, before tokenization.
        for old_piece, new_piece in (('rt.', 'rt'), ('º', '.'), ('arts.', 'arts ')):
            txt = txt.replace(old_piece, new_piece)

        sentences = nltk.tokenize.sent_tokenize(txt, language='portuguese')

        return [
            [piece for piece in sentence.strip().split('\xa0') if piece]
            for sentence in sentences
        ]

    def fix_titles(self, pre_txt: list):
        '''
            Normalise the heading entries of pre_txt.

            'TÍTULO…' entries become two entries: the first two lines of the
            heading, then the last piece (only when there is more than one
            piece). 'CAP…' entries are reduced to their last piece. Every
            other entry is kept unchanged. Empty entries are skipped.
        '''
        fixed = []
        for entry in pre_txt:
            if not entry:
                # No first element to inspect.
                continue

            head = entry[0]
            if head.startswith('TÍTULO'):
                fixed.append(head.split('\n')[:2])
                # A single-piece heading has no separate trailing piece.
                if len(entry) > 1:
                    fixed.append(entry[-1:])
            elif head.startswith('CAP'):
                fixed.append([entry[-1]])
            else:
                fixed.append(entry)

        return fixed

    def create_dataframe(self, aux: list):
        '''
            Build the raw frame from the output of fix_titles().

            The first element of aux is skipped. Entries starting with 'Art',
            'Parágrafo único' or '§' are joined with the entry that follows
            them and stored in 'artigo', 'paragrafo_unico' or 'paragrafo';
            'TÍTULO' entries are joined on their own and stored in 'titulo'.
            Other entries produce no row.

            The returned frame always has the four columns; columns that never
            occurred are appended, filled with missing values.
        '''
        entries = aux[1:]

        records = []
        for pos, entry in enumerate(entries):
            if not entry:
                continue

            # The last entry has no successor to be joined with.
            following = entries[pos + 1] if pos + 1 < len(entries) else []
            head = entry[0]

            if head.startswith('Art'):
                records.append({'artigo': ' '.join(entry + following)})
            elif head.startswith('Parágrafo único'):
                records.append({'paragrafo_unico': ' '.join(entry + following)})
            elif head.startswith('§'):
                records.append({'paragrafo': ' '.join(entry + following)})
            elif head.startswith('TÍTULO'):
                records.append({'titulo': ' '.join(entry)})

        df = pd.DataFrame(records)

        # Keep the order of first appearance and add whatever is missing, so
        # the later steps can rely on every column being present.
        expected = ('titulo', 'artigo', 'paragrafo_unico', 'paragrafo')
        missing = [column for column in expected if column not in df.columns]

        return df.reindex(columns=list(df.columns) + missing)

    def pre_cleaning(self, df):
        '''
            Replace ':' with ';' in the 'artigo' and 'paragrafo' columns.

            The frame is modified in place and also returned. Values that are
            not strings (missing values) are left as they are.
        '''
        def swap_colon(value):
            return value.replace(':', ';') if isinstance(value, str) else value

        for column in ('artigo', 'paragrafo'):
            df[column] = df[column].map(swap_colon)

        return df

    def arts_cleaning(self, df):
        '''
            Explode the 'artigo' column into one row per caput and per piece.

            Each article text is split on ';'. The first piece gives the caput
            row: 'artigo' is the text before the first '.', 'body' the text
            between the first and the second '.'. Every other piece is split
            on ' - ' into 'incisos' and 'body'. Caput rows get 0 in 'incisos'.
        '''
        # The empty template fixes the column order of the result.
        frames = [pd.DataFrame(columns=['body', 'artigo', 'incisos'])]

        for text in df['artigo']:
            if not isinstance(text, str):
                continue

            pieces = text.split(';')
            caput, inciso_pieces = pieces[0], pieces[1:]

            # partition() tolerates a caput without any '.'.
            art, first_dot, after_number = caput.partition('.')
            frames.append(
                pd.DataFrame({"body": after_number.split('.')[0], "artigo": art}, index=[0])
            )

            if inciso_pieces:
                df_i = pd.DataFrame(
                    [piece.strip().split(' - ') for piece in inciso_pieces]
                ).rename(columns={0: "incisos", 1: "body"})
                df_i['artigo'] = art
                frames.append(df_i)

        # Concatenate once instead of growing the frame inside the loop.
        df_arts = pd.concat(frames, axis=0)

        df_arts['incisos'] = df_arts['incisos'].fillna(0)

        return df_arts.reset_index(drop=True)

    def alineas_cleaning(self, df_arts):
        '''
            Move alíneas out of the 'incisos' column.

            Rows whose 'incisos' text starts with a lowercase ASCII letter are
            treated as alíneas: their marker ('a)', 'b)', ...) goes to a new
            'alineas' column, their text (minus the first five characters,
            capitalised) fills a missing 'body', and their 'incisos' value is
            taken from the closest row above. Rows without an inciso or
            without an alínea get the string "na".
        '''
        nan = float("nan")

        # The closing parenthesis is optional in this pattern.
        alineas_index = df_arts['incisos'].str.extract(r"(^[a-z]\)?)").dropna()[0].index
        is_alinea = df_arts.index.isin(alineas_index)

        alineas = df_arts.loc[is_alinea, "incisos"]

        # NOTE(review): the offset 5 is a fixed assumption about the marker's width — confirm against the data.
        alineas_body = pd.Series(
            {k: v[5:].capitalize() for k, v in zip(alineas_index, alineas.to_list())},
            name="body",
            dtype=object,
        )

        # Here the parenthesis is required.
        alineas_simbol = alineas.str.extract(r"([a-z]\))")[0].rename("alineas")

        df_arts = pd.concat([df_arts, alineas_simbol], axis=1)

        df_arts['body'] = df_arts['body'].combine_first(alineas_body)

        df_arts.loc[is_alinea, "incisos"] = nan
        df_arts['incisos'] = df_arts['incisos'].ffill().replace(0, "na")
        df_arts['alineas'] = df_arts['alineas'].fillna("na")

        return df_arts

    def title_cleaning(self, df_raw, df_arts):
        '''
            Add a 'titulo' column to a copy of df_arts.

            In df_raw, an article row that directly follows a title row opens
            that title. The first row of such an article in df_arts receives
            the title, and every following row inherits the title of the
            closest row above it. Titles are matched by article key
            ("Art 1"), not by position.
        '''
        raw = df_raw[['titulo', 'artigo']]

        # Title found on the row immediately above, if any.
        preceding_title = raw['titulo'].shift(1)
        opens_title = preceding_title.notna() & raw['artigo'].notna()

        title_by_article = {}
        if opens_title.any():
            keys = raw.loc[opens_title, 'artigo'].str.extract(r'(^[A-Za-z]+\s\d+)')[0]
            title_by_article = {
                key: title
                for key, title in zip(keys, preceding_title[opens_title])
                if isinstance(key, str)
            }

        df2 = df_arts.copy()

        # Only the first row of each article is looked up; the rest is forward-filled.
        first_rows = ~df2['artigo'].duplicated()
        df2['titulo'] = df2['artigo'].where(first_rows).map(title_by_article)
        df2['titulo'] = df2['titulo'].ffill()

        return df2

    def data_preparation(self, df2):
        '''
            Produce the final table and write it to "../data/arts_update2.csv".

            Renames 'body' to 'lei', trims and capitalises it, adds the
            constant columns and the timestamp columns, keeps the final column
            set and inserts '-' after the first two tokens of every title.
            Returns the prepared frame.
        '''
        prepared = df2.rename(columns={"body": "lei"})

        # The .str methods leave missing values untouched.
        prepared['lei'] = prepared['lei'].str.strip().str.capitalize()

        prepared['sigla'] = 'CF88'
        # NOTE(review): this link differs from self.url, the page that is actually scraped — confirm which one should be stored.
        prepared['link'] = 'https://www.planalto.gov.br/ccivil_03/constituicao/constituicaocompilado.htm'

        # Taken once so both columns always carry the same value.
        run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        prepared['scrapy_datetime'] = run_timestamp
        prepared['process_datetime'] = run_timestamp

        # .copy() gives an independent frame, so the assignment below does not
        # write into a slice of the previous one.
        prepared = prepared[['sigla', 'titulo', 'artigo', 'incisos', 'alineas', 'lei', 'link', 'scrapy_datetime', 'process_datetime']].copy()

        formatted_titles = []
        for title in prepared['titulo'].to_list():
            if isinstance(title, str):
                tokens = nltk.tokenize.word_tokenize(title)
                title = ' '.join(tokens[:2] + ['-'] + tokens[2:])
            formatted_titles.append(title)
        prepared['titulo'] = formatted_titles

        prepared.to_csv("../data/arts_update2.csv", index=True)

        return prepared

In [2]:
# Run the collection pipeline end to end; every step consumes the output of the previous one.
pipeline = DataCollectCF()

# Fetch the page, split it into sentences and normalise the headings.
info_dict = pipeline.page_collect()
pre_txt_list = pipeline.pre_processing(info_dict['soup'])
fix_title_list = pipeline.fix_titles(pre_txt_list)
# Raw frame, then the article / alínea rows derived from it.
df_raw = pipeline.create_dataframe(fix_title_list)
df_raw = pipeline.pre_cleaning(df_raw)
df0 = pipeline.arts_cleaning(df_raw)
df0 = pipeline.alineas_cleaning(df0)

# Attach the titles and produce the final table.
df1 = pipeline.title_cleaning(df_raw, df0)
df1 = pipeline.data_preparation(df1)

In [6]:
# Preview the first two rows of the final table.
df1.head(2)

Unnamed: 0,sigla,titulo,artigo,incisos,alineas,lei,link,scrapy_datetime,process_datetime
0,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,na,na,"A república federativa do brasil, formada pela...",https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-12 09:27:43,2022-08-12 09:27:43
1,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,I,na,A soberania,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-12 09:27:43,2022-08-12 09:27:43


## 2.2. Second Class (Structure)

In [1]:
import os
import nltk
import requests
import warnings
import pandas as pd

from bs4 import BeautifulSoup
from time import sleep
from datetime import datetime
from sqlalchemy import create_engine
from pandas.core.frame import DataFrame
from dotenv import find_dotenv, load_dotenv

# Locate a .env file and load its entries into the process environment; the
# database settings are read from there with os.getenv() further down.
load_dotenv(find_dotenv())

# Alínea markers 'a)' through 'z)'.
ALINEAS_LIST = [chr(code) + ')' for code in range(ord('a'), ord('z') + 1)]

# Record template: every field starts as an empty string.
BASE_DICT = dict.fromkeys(
    (
        'sigla', 'livro', 'titulo',
        'capitulo', 'sessao', 'subsessao',
        'artigo', 'paragrafo', 'incisos',
        'alineas', 'lei', 'scrapy_datetime', 'link',
    ),
    '',
)


class DataCollectCF():
    '''
        Collect the text of the 1988 Federal Constitution (CF88) from the
        configured URL, reshape it into one row per article, inciso or
        alínea, and store the result.

        page_collect:     Get full txt of page.
        pre_processing:   Apply nltk for cleaning paragraph.
        fix_titles:       Transform titles / chapters on lists.
        create_dataframe: Create dataframe from preprocessed data and titles.
        pre_cleaning:     Clean the dataframe removing special chars.
        arts_cleaning:    Clean articles column of dataframe.
        alineas_cleaning: Clean alineas column of dataframe.
        title_cleaning:   Clean title column of dataframe.
        data_preparation: Prepare dataset to SQL and Elastic.
        data_store_postgre: Store dataset on SQL Table.
        data_store_elastic: Store dataset on NoSQL Table.
    '''

    def __init__(self):
        # Headers and address used by page_collect().
        self.hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5),AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'}
        self.url = 'https://www2.camara.leg.br/legin/fed/consti/1988/constituicao-1988-5-outubro-1988-322142-publicacaooriginal-1-pl.html'

        # Database settings, read from the environment.
        # NOTE(review): USER is usually already set by the operating system to
        # the login name, and load_dotenv() does not override existing
        # variables by default — confirm that the intended database user is
        # what ends up here.
        self.user = os.getenv('USER')
        self.pswd = os.getenv('PASS')
        self.host = os.getenv('HOST')
        self.port = os.getenv('PORT')
        self.db   = os.getenv('DB')

    def page_collect(self) -> dict:
        '''
            Download and parse the page at self.url.

            Returns a dict with the parsed document ('soup'), the text of the
            page title ('source') and the text of the <p class="preambulo">
            element ('preambulo').

            Raises requests.HTTPError on an HTTP error status and a requests
            timeout error when the server does not answer within 30 seconds.
        '''
        pg = requests.get(url=self.url, headers=self.hdr, timeout=30)
        # Stop on an error response instead of parsing an error page.
        pg.raise_for_status()
        soup = BeautifulSoup(pg.text, 'html.parser')

        source = soup.title.get_text()
        preambulo = soup.find('p', class_='preambulo').get_text()

        return {'soup': soup, 'source': source, 'preambulo': preambulo}

    def pre_processing(self, **kwargs) -> list:
        '''
            Extract the text of <div class="textoNorma"> from kwargs['soup']
            and split it into sentences.

            Returns a list with one entry per sentence; each entry is the list
            of non-empty pieces obtained by cutting the stripped sentence at
            non-breaking spaces.
        '''
        txt = kwargs['soup'].find('div', class_='textoNorma').get_text()

        # Literal substitutions applied, in this order, before tokenization.
        for old_piece, new_piece in (('rt.', 'rt'), ('º', '.'), ('arts.', 'arts ')):
            txt = txt.replace(old_piece, new_piece)

        sentences = nltk.tokenize.sent_tokenize(txt, language='portuguese')

        return [
            [piece for piece in sentence.strip().split('\xa0') if piece]
            for sentence in sentences
        ]

    def fix_titles(self, pre_txt: list) -> list:
        '''
            Normalise the heading entries of pre_txt.

            'TÍTULO…' entries become two entries: the first two lines of the
            heading, then the last piece (only when there is more than one
            piece). 'CAP…' entries are reduced to their last piece. Every
            other entry is kept unchanged. Empty entries are skipped.
        '''
        fixed = []
        for entry in pre_txt:
            if not entry:
                # No first element to inspect.
                continue

            head = entry[0]
            if head.startswith('TÍTULO'):
                fixed.append(head.split('\n')[:2])
                # A single-piece heading has no separate trailing piece.
                if len(entry) > 1:
                    fixed.append(entry[-1:])
            elif head.startswith('CAP'):
                fixed.append([entry[-1]])
            else:
                fixed.append(entry)

        return fixed

    def create_dataframe(self, aux: list) -> DataFrame:
        '''
            Build the raw frame from the output of fix_titles().

            The first element of aux is skipped. Entries starting with 'Art',
            'Parágrafo único' or '§' are joined with the entry that follows
            them and stored in 'artigo', 'paragrafo_unico' or 'paragrafo';
            'TÍTULO' entries are joined on their own and stored in 'titulo'.
            Other entries produce no row.

            The returned frame always has the four columns; columns that never
            occurred are appended, filled with missing values.
        '''
        entries = aux[1:]

        records = []
        for pos, entry in enumerate(entries):
            if not entry:
                continue

            # The last entry has no successor to be joined with.
            following = entries[pos + 1] if pos + 1 < len(entries) else []
            head = entry[0]

            if head.startswith('Art'):
                records.append({'artigo': ' '.join(entry + following)})
            elif head.startswith('Parágrafo único'):
                records.append({'paragrafo_unico': ' '.join(entry + following)})
            elif head.startswith('§'):
                records.append({'paragrafo': ' '.join(entry + following)})
            elif head.startswith('TÍTULO'):
                records.append({'titulo': ' '.join(entry)})

        df = pd.DataFrame(records)

        # Keep the order of first appearance and add whatever is missing, so
        # the later steps can rely on every column being present.
        expected = ('titulo', 'artigo', 'paragrafo_unico', 'paragrafo')
        missing = [column for column in expected if column not in df.columns]

        return df.reindex(columns=list(df.columns) + missing)

    def pre_cleaning(self, df) -> DataFrame:
        '''
            Replace ':' with ';' in the 'artigo' and 'paragrafo' columns.

            The frame is modified in place and also returned. Values that are
            not strings (missing values) are left as they are.
        '''
        def swap_colon(value):
            return value.replace(':', ';') if isinstance(value, str) else value

        for column in ('artigo', 'paragrafo'):
            df[column] = df[column].map(swap_colon)

        return df

    def arts_cleaning(self, df) -> DataFrame:
        '''
            Explode the 'artigo' column into one row per caput and per piece.

            Each article text is split on ';'. The first piece gives the caput
            row: 'artigo' is the text before the first '.', 'body' the text
            between the first and the second '.'. Every other piece is split
            on ' - ' into 'incisos' and 'body'. Caput rows get 0 in 'incisos'.
        '''
        # The empty template fixes the column order of the result.
        frames = [pd.DataFrame(columns=['body', 'artigo', 'incisos'])]

        for text in df['artigo']:
            if not isinstance(text, str):
                continue

            pieces = text.split(';')
            caput, inciso_pieces = pieces[0], pieces[1:]

            # partition() tolerates a caput without any '.'.
            art, first_dot, after_number = caput.partition('.')
            frames.append(
                pd.DataFrame({"body": after_number.split('.')[0], "artigo": art}, index=[0])
            )

            if inciso_pieces:
                df_i = pd.DataFrame(
                    [piece.strip().split(' - ') for piece in inciso_pieces]
                ).rename(columns={0: "incisos", 1: "body"})
                df_i['artigo'] = art
                frames.append(df_i)

        # Concatenate once instead of growing the frame inside the loop.
        df_arts = pd.concat(frames, axis=0)

        df_arts['incisos'] = df_arts['incisos'].fillna(0)

        return df_arts.reset_index(drop=True)

    def alineas_cleaning(self, df_arts) -> DataFrame:
        '''
            Move alíneas out of the 'incisos' column.

            Rows whose 'incisos' text starts with a lowercase ASCII letter are
            treated as alíneas: their marker ('a)', 'b)', ...) goes to a new
            'alineas' column, their text (minus the first five characters,
            capitalised) fills a missing 'body', and their 'incisos' value is
            taken from the closest row above. Rows without an inciso or
            without an alínea get the string "na".
        '''
        nan = float("nan")

        # The closing parenthesis is optional in this pattern.
        alineas_index = df_arts['incisos'].str.extract(r"(^[a-z]\)?)").dropna()[0].index
        is_alinea = df_arts.index.isin(alineas_index)

        alineas = df_arts.loc[is_alinea, "incisos"]

        # NOTE(review): the offset 5 is a fixed assumption about the marker's width — confirm against the data.
        alineas_body = pd.Series(
            {k: v[5:].capitalize() for k, v in zip(alineas_index, alineas.to_list())},
            name="body",
            dtype=object,
        )

        # Here the parenthesis is required.
        alineas_simbol = alineas.str.extract(r"([a-z]\))")[0].rename("alineas")

        df_arts = pd.concat([df_arts, alineas_simbol], axis=1)

        df_arts['body'] = df_arts['body'].combine_first(alineas_body)

        df_arts.loc[is_alinea, "incisos"] = nan
        df_arts['incisos'] = df_arts['incisos'].ffill().replace(0, "na")
        df_arts['alineas'] = df_arts['alineas'].fillna("na")

        return df_arts

    def title_cleaning(self, **kwargs) -> DataFrame:
        '''
            Add a 'titulo' column to a copy of kwargs['df_clean'].

            In kwargs['df_raw'], an article row that directly follows a title
            row opens that title. The first row of such an article in
            df_clean receives the title, and every following row inherits the
            title of the closest row above it. Titles are matched by article
            key ("Art 1"), not by position.
        '''
        raw = kwargs['df_raw'][['titulo', 'artigo']]

        # Title found on the row immediately above, if any.
        preceding_title = raw['titulo'].shift(1)
        opens_title = preceding_title.notna() & raw['artigo'].notna()

        title_by_article = {}
        if opens_title.any():
            keys = raw.loc[opens_title, 'artigo'].str.extract(r'(^[A-Za-z]+\s\d+)')[0]
            title_by_article = {
                key: title
                for key, title in zip(keys, preceding_title[opens_title])
                if isinstance(key, str)
            }

        df2 = kwargs['df_clean'].copy()

        # Only the first row of each article is looked up; the rest is forward-filled.
        first_rows = ~df2['artigo'].duplicated()
        df2['titulo'] = df2['artigo'].where(first_rows).map(title_by_article)
        df2['titulo'] = df2['titulo'].ffill()

        return df2

    def data_preparation(self, df2) -> DataFrame:
        '''
            Produce the final table.

            Renames 'body' to 'lei', trims and capitalises it, adds the
            constant columns and the timestamp columns, keeps the final column
            set and inserts '-' after the first two tokens of every title.
            No file is written here.
        '''
        prepared = df2.rename(columns={"body": "lei"})

        # The .str methods leave missing values untouched.
        prepared['lei'] = prepared['lei'].str.strip().str.capitalize()

        prepared['sigla'] = 'CF88'
        # NOTE(review): this link differs from self.url, the page that is actually scraped — confirm which one should be stored.
        prepared['link'] = 'https://www.planalto.gov.br/ccivil_03/constituicao/constituicaocompilado.htm'

        # Taken once so both columns always carry the same value.
        run_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        prepared['scrapy_datetime'] = run_timestamp
        prepared['process_datetime'] = run_timestamp

        # .copy() gives an independent frame, so the assignment below does not
        # write into a slice of the previous one.
        prepared = prepared[['sigla', 'titulo', 'artigo', 'incisos', 'alineas', 'lei', 'link', 'scrapy_datetime', 'process_datetime']].copy()

        formatted_titles = []
        for title in prepared['titulo'].to_list():
            if isinstance(title, str):
                tokens = nltk.tokenize.word_tokenize(title)
                title = ' '.join(tokens[:2] + ['-'] + tokens[2:])
            formatted_titles.append(title)
        prepared['titulo'] = formatted_titles

        return prepared

    def data_store_postgre(self, df) -> None:
        '''
            Write df to the PostgreSQL table "cf88", replacing the table if it
            already exists. Connection settings come from the attributes set
            in __init__.
        '''
        engine = create_engine(
            f"postgresql://{self.user}:{self.pswd}@{self.host}:{self.port}/{self.db}"
        )

        try:
            # begin() commits when the block succeeds and rolls back when it
            # raises; the connection is released in both cases.
            with engine.begin() as con:
                df.to_sql("cf88", con=con, if_exists="replace", index=False)
        finally:
            engine.dispose()

        return None

    def data_store_elastic(self, df) -> None:
        '''
            Placeholder: nothing is stored yet; the method only waits five
            seconds and returns.
        '''
        sleep(5)

        return None

In [2]:
# Run the collection pipeline end to end; every step consumes the output of the previous one.
pipeline = DataCollectCF()

# Fetch and parse the page.
info_dict = pipeline.page_collect()

# Split the text into sentences and normalise the headings.
pre_txt_list = pipeline.pre_processing(soup=info_dict['soup'])

fix_title_list = pipeline.fix_titles(pre_txt_list)

# Raw frame, then the article / alínea rows derived from it.
df_raw = pipeline.create_dataframe(fix_title_list)

df_raw = pipeline.pre_cleaning(df_raw)

df0 = pipeline.arts_cleaning(df_raw)

df0 = pipeline.alineas_cleaning(df0)

# Attach the titles and produce the final table.
df1 = pipeline.title_cleaning(df_raw=df_raw, df_clean=df0)

df1 = pipeline.data_preparation(df1)

# Persist the final table.
pipeline.data_store_postgre(df1)

pipeline.data_store_elastic(df1)

In [3]:
# Preview the first fifty rows of the final table.
df1.head(50)

Unnamed: 0,sigla,titulo,artigo,incisos,alineas,lei,link,scrapy_datetime,process_datetime
0,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,na,na,"A república federativa do brasil, formada pela...",https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
1,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,I,na,A soberania,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
2,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,II,na,A cidadania,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
3,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,III,na,A dignidade da pessoa humana,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
4,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,IV,na,Os valores sociais do trabalho e da livre inic...,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
5,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 1,V,na,O pluralismo político.,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
6,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 2,na,na,"São poderes da união, independentes e harmônic...",https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
7,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 3,na,na,Constituem objetivos fundamentais da república...,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
8,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 3,I,na,"Construir uma sociedade livre, justa e solidária",https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
9,CF88,TÍTULO I - DOS PRINCÍPIOS FUNDAMENTAIS,Art 3,II,na,Garantir o desenvolvimento nacional,https://www.planalto.gov.br/ccivil_03/constitu...,2022-08-14 16:33:01,2022-08-14 16:33:01
