In [2]:
import re
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

from os      import listdir
from os.path import isfile, join

In [3]:
def print_dataframe(df):
    """Return ``df.style`` configured so cells and header cells are left-aligned.

    The ``text-align: left`` property is applied to the data cells and a
    table style with the ``th`` selector applies the same alignment to the
    headers. The styled object is returned, not displayed.
    """
    header_rule = {'selector': 'th', 'props': [('text-align', 'left')]}
    styler = df.style.set_properties(**{'text-align': 'left'})
    styler = styler.set_table_styles([header_rule])
    return styler

def get_txts(path):
    """Collect the paths of all files stored two directory levels below ``path``.

    The expected layout is ``path/<year>/<month>/<file>`` (the level names
    follow the original variable names; any two-level directory tree works).
    Entries that are regular files at the first two levels are ignored, and
    only regular files are collected at the third level.

    Directory listings are sorted at every level because ``os.listdir``
    returns entries in arbitrary, filesystem-dependent order; sorting makes
    the result reproducible between runs and machines.

    Args:
        path: Root directory to scan.

    Returns:
        A list of file paths (joined onto ``path``), ordered by first-level
        directory, then second-level directory, then file name.
    """
    txts = []
    for year_name in sorted(listdir(path)):
        year_dir = join(path, year_name)
        if isfile(year_dir):
            continue
        for month_name in sorted(listdir(year_dir)):
            month_dir = join(year_dir, month_name)
            if isfile(month_dir):
                continue
            txts.extend(
                join(month_dir, file_name)
                for file_name in sorted(listdir(month_dir))
                if isfile(join(month_dir, file_name))
            )
    return txts

In [4]:
# Root folders of the two text dumps scanned by get_txts.
# NOTE(review): the folder names suggest one dump separates text with spaces
# ("espaco") and the other with newlines ("barra_n") -- confirm.
dodfs_space_dir = "/exp/knedle/Regex/data/dodfs_txt_espaco"
dodfs_n_dir = "/exp/knedle/Regex/data/dodfs_txt_barra_n"

# File paths found under each root.
dodfs_space_files = get_txts(dodfs_space_dir)
dodfs_n_files = get_txts(dodfs_n_dir)

# Destination folder name for results.
output = "./results"

In [5]:
class Regex:
    """Base helper that applies regular expressions to one text.

    Subclasses are expected to fill ``_raw_acts``, ``_acts`` and ``_columns``;
    this class only provides the search helpers and the DataFrame builder.
    """

    def __init__(self, text):
        """Store ``text`` and initialise the (empty) extraction containers."""
        self._text = text
        self._columns = []
        self._acts = []
        self._raw_acts = {}
        self.data_frame = pd.DataFrame()

    def find_all(self, rule, flag=0):
        """Return ``re.findall(rule, <stored text>)`` using the given flags."""
        return re.findall(rule, self._text, flags=flag)

    def find_in_act(self, rule, act):
        """Return the groups of the first case-insensitive match of ``rule`` in ``act``.

        Returns:
            The tuple from ``match.groups()`` for the first match, or the
            string ``"nan"`` when ``rule`` does not match anywhere in ``act``.
        """
        first_hit = re.search(rule, act, re.IGNORECASE)
        if first_hit is None:
            return "nan"
        return first_hit.groups()

    def _build_dataframe(self):
        """Build a DataFrame from ``_acts`` with ``_columns`` as its header.

        An empty DataFrame is returned when no acts were collected.
        """
        if not self._acts:
            return pd.DataFrame()
        frame = pd.DataFrame(self._acts)
        frame.columns = self._columns
        return frame

class Retirements(Regex):
    """Extract retirement acts ("Aposentadoria") from a text using regexes.

    On construction the text is scanned for acts, the fields of each act are
    extracted with the patterns in ``self.rules``, and the result is exposed
    as ``self.data_frame`` (one row per act, headers from ``self._columns``).
    """

    def __init__(self, text):
        """Run the whole extraction pipeline over ``text``."""
        super().__init__(text)
        # Headers are positional: they line up with the key order produced by
        # _act_props ("tipo_ato", "sei", then the keys of self.rules).
        self._columns = ["Tipo do Ato", "SEI", "Nome", "Matrícula", "Tipo de Aposentadoria", "Cargo", "Classe",
               "Padrao", "Quadro", "Fundamento Legal", "Orgao", "Vigencia", "Matricula SIAPE"]

        # One pattern per field. Raw strings are used so that "\s" and "\,"
        # reach the regex engine without relying on Python's deprecated
        # handling of invalid string escapes; the pattern text is unchanged.
        # An empty pattern has no capture group and therefore always yields
        # the "nan" placeholder.
        # NOTE(review): inside a character class "|" is a literal pipe, not
        # alternation (e.g. "[C|c]" also matches "|"); the patterns are kept
        # verbatim so existing matches do not change.
        self.rules = {"nome": r"\s([^,]*?),\smatricula",
                      "matricula": r"matricula\s?n?o?\s([\s\S]*?)[,|\s]",
                      "tipo_ret": r"",
                      "cargo": r"Cargo de([\s\S]*?)\,",
                      "classe": r"[C|c]lasse ([\s\S]*?)\,",
                      "padrao": r"[p|P]adr[a|ã]o\s([\s\S]*?),",
                      "quadro": r"d?[e|a|o]?(Quadro[\s\S]*?)[,|;|.]",
                      "fundamento": r"nos\stermos\sdo\s[a|A]rtigo([\s\S]*?),\sa?\s",
                      "orgao": r"Lotacao:|Quadro de Pessoal d[a|e|o]([\s\S]*?)[.|,]",
                      "vigencia": r"",
                      "siape": r"[S|s][I|i][A|a][P|p][E|e]\s[N|n]?[o|O]?\s([\s\S]*?)[,| | .]"}

        self._raw_acts = self._extract_instances()
        self._acts = self._acts_props()
        self.data_frame = self._build_dataframe()

    def _act_props(self, sei, act_raw):
        """Build the field dictionary for a single act.

        Args:
            sei: Process identifier captured for the act.
            act_raw: Text of the act body in which the fields are searched.

        Returns:
            A dict with ``"tipo_ato"``, ``"sei"`` and one entry per key of
            ``self.rules``. A field is ``"nan"`` when its rule does not match
            or does not yield exactly one capture group.
        """
        act = {"tipo_ato": "Aposentadoria", "sei": sei}
        for key, rule in self.rules.items():
            groups = self.find_in_act(rule, act_raw)
            # find_in_act returns the string "nan" on a miss and a tuple of
            # groups on a hit; only a single captured group is a usable
            # value. Checking the length explicitly replaces the former bare
            # ``except`` around tuple unpacking, which also hid real errors.
            act[key] = groups[0] if len(groups) == 1 else "nan"
        return act

    def _acts_props(self):
        """Return the field dictionaries for every act in ``self._raw_acts``."""
        return [self._act_props(sei, raw) for sei, raw in self._raw_acts.items()]

    def _extract_instances(self):
        """Locate the acts in the text.

        An act starts at "APOSENTAR" or "CONCEDER APOSENTADORIA" and runs
        (non-greedily) up to the "Processo ..." reference that follows it.

        Returns:
            A dict mapping the captured process reference to the act body.
            When the same reference occurs more than once, the last body wins.
        """
        start = r"(APOSENTAR|CONCEDER\sAPOSENTADORIA),?\s?"
        body = r"([\s\S]*?)"
        end = r"[P|p]rocesso:?\s[s|S]?[e|E]?[i|I]?\s?[n|N]?[o|O]?\s?([\s\S]*?)[.]\s"
        found = self.find_all(start + body + end)
        # Each hit is (opening verb, act body, process reference).
        return {sei: act_body for _verb, act_body, sei in found}

In [6]:
# Run the retirement extraction over every file and pool the results.
res_dfs = []    # non-empty per-file DataFrames
raw_acts = {}   # process reference -> raw act text (later files overwrite duplicates)
acts = []       # field dictionaries of every extracted act
for txt in dodfs_n_files:
    # The context manager closes each handle as soon as the text is read,
    # instead of leaving one open file per input until garbage collection.
    with open(txt, "r") as txt_file:
        txt_str = txt_file.read()
    ret = Retirements(txt_str)
    raw_acts.update(ret._raw_acts)
    acts += ret._acts
    if not ret.data_frame.empty:
        res_dfs.append(ret.data_frame)


In [24]:
# Build NER training examples: each example is the raw act text plus the
# character span of the regex-extracted "fundamento" inside it.
train_data = []
LABEL = 'FUNDAMENTO'

# Index the acts by process reference once, so each raw act is paired with
# its acts without rescanning the whole list for every key.
acts_by_sei = {}
for d in acts:
    acts_by_sei.setdefault(d['sei'], []).append(d)

for key, raw_text in raw_acts.items():
    raw = raw_text.strip()
    for d in acts_by_sei.get(key, []):
        fund = (d['fundamento'] or '').strip()
        # "nan" is the placeholder stored when the regex found nothing. It
        # must not be searched for: the letters "nan" can occur inside
        # ordinary words and would be labelled as an entity. An empty string
        # would likewise produce a zero-length span at position 0.
        if not fund or fund == 'nan':
            continue
        init = raw.find(fund)
        if init != -1:
            train_data.append(
                (raw, {"entities": [(init, init + len(fund), LABEL)]})
            )

NameError: name 'nan' is not defined

In [12]:
# Quick check of the entity label name held in LABEL.
print(LABEL)

FUNDAMENTO


In [13]:
from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

# Train a blank Portuguese pipeline containing only an NER component on
# train_data.
# NOTE(review): create_pipe / add_pipe(component) / update(texts, annotations)
# look like the spaCy v2 API -- confirm the installed version before upgrading.
random.seed(0)  # fixed seed so the shuffling below is repeatable
n_iter=30
nlp = spacy.blank("pt")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label(LABEL)
optimizer = nlp.begin_training()
move_names = list(ner.move_names)
# get names of other pipes to disable them during training
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# only train NER
# Both context managers must be entered, hence the comma: ``A and B`` would
# evaluate to a single object, so only one of them would be entered.
with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
    # show warnings for misaligned entity spans once
    warnings.filterwarnings("once", category=UserWarning, module='spacy')

    # The batch-size generator is created once, so it keeps compounding
    # across iterations instead of restarting at 1.0 every epoch.
    sizes = compounding(1.0, 4.0, 1.001)
    # batch up the examples using spaCy's minibatch
    for itn in range(n_iter):
        random.shuffle(train_data)
        batches = minibatch(train_data, size=sizes)
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
        print("Losses", losses)


Losses {'ner': 17630.887260628446}
Losses {'ner': 5234.509716062698}
Losses {'ner': 3780.409081236402}
Losses {'ner': 2670.5895904724093}
Losses {'ner': 2951.3014396307967}
Losses {'ner': 2336.076720316085}
Losses {'ner': 2675.4149617356734}
Losses {'ner': 2737.265935601434}
Losses {'ner': 2560.16684286489}
Losses {'ner': 2417.8846285219092}
Losses {'ner': 2032.9189047379477}
Losses {'ner': 1923.073726423117}
Losses {'ner': 2539.712054512324}
Losses {'ner': 1920.806004478319}
Losses {'ner': 1874.8772986847134}
Losses {'ner': 1866.7543950279187}
Losses {'ner': 1929.770917953964}
Losses {'ner': 2077.999766481389}
Losses {'ner': 1878.7888288056886}
Losses {'ner': 2192.172336645453}
Losses {'ner': 2122.971829073325}
Losses {'ner': 2136.7629659119293}
Losses {'ner': 2005.7487168013415}
Losses {'ner': 2083.755299382471}
Losses {'ner': 2110.6965388253107}
Losses {'ner': 1985.2573005220386}
Losses {'ner': 2011.397399628239}
Losses {'ner': 1865.8530672846769}
Losses {'ner': 2220.436418302404}
L

In [27]:
# For acts where the regex produced no "fundamento" (the 'nan' placeholder),
# print the raw act followed by the entities found by the NER pipeline.
# Only the first 201 keys of raw_acts (indices 0..200) are inspected.
for i, key in enumerate(raw_acts.keys()):
    if i > 200:
        break
    act_text = raw_acts[key]
    # Lazily select the acts of this key whose regex extraction missed.
    regex_misses = (cand for cand in acts
                    if cand['sei'] == key and cand['fundamento'] == 'nan')
    for d in regex_misses:
        doc = nlp(act_text)
        print("Entities in '%s'" % act_text)
        print('----- NER-----')
        for ent in doc.ents:
            print(ent.label_, ent.text)
        print('----- REGEX ------')
        print(d['fundamento'])
        print('------------------')
        print('')

Entities in 'ELISANGELA JUSTINIANO GONCALVES, matricula 204.579-6, no Cargo de Professor
de Educacao Basica, Padrao 20, Etapa IV, do Quadro de Pessoal do Distrito Federal, nos termos do artigo
40, 1o, inciso I, da Constituicao da Republica Federativa do Brasil, na redacao dada pela Emenda
Constitucional no 41, de 31 de dezembro de 2003, combinado com o artigo 6o-A da Emenda Constitucional
no 41, de 31 de dezembro de 2003, incluido pela Emenda Constitucional no 70, de 29 de marco de 2012.
'
----- NER-----
FUNDAMENTO 40, 1o, inciso I, da Constituicao da Republica Federativa do Brasil, na redacao dada pela Emenda
Constitucional no 41, de 31 de dezembro de 2003, combinado com o artigo 6o-A da Emenda Constitucional
no 41, de 31 de dezembro de 2003, incluido pela Emenda Constitucional no 70, de 29 de marco de 2012.

----- REGEX ------
nan
------------------

Entities in 'a ANDREA MASSI CARNEIRO, matricula 66.185-6, no Cargo de
Professor de Educacao Basico, Padrao 25, Etapa IV, do Quadro de P

Entities in 'a ROSANE SELVINA UMBELINO AMARAL, matricula 27.916-1,
no Cargo de Professor de Educacao Basica, Padrao 25, Etapa IV, do Quadro de Pessoal do Distrito
Federal, nos termos do artigo 3o, incisos I, II e III, e paragrafo unico da Emenda Constitucional n
47 de 06 de julho de 2005. '
----- NER-----
FUNDAMENTO 3o, incisos I, II e III, e paragrafo unico da Emenda Constitucional n
----- REGEX ------
nan
------------------

Entities in 'a ROSANGELA MARIA PEREIRA DIAS CUNHA, matricula
34.456-7, no Cargo de Professor de Educacao Basica, Padrao 25, Etapa IV, do Quadro de Pessoal do
Distrito Federal, nos termos do artigo 3o, incisos I, II e III, e paragrafo unico da Emenda
Constitucional n 47 de 06 de julho de 2005. '
----- NER-----
FUNDAMENTO 3o, incisos I, II e III, e paragrafo unico da Emenda
Constitucional n
----- REGEX ------
nan
------------------

Entities in 'a VERA LUCIA SANTIAGO, matricula 69.785-0, no Cargo de
Agente de Gestao Educacional/Conservacao e Limpeza, Nivel 10, Padr

Entities in 'a RUBENS RICARDO AMADOR, matricula 38.683-9, no Cargo de
Professor de Educacao Basica, Padrao 22, Etapa V, do Quadro de Pessoal do Distrito Federal, nos
termos do artigo 3o, incisos I, II e III, e paragrafo unico da Emenda Constitucional n 47 de 06 de
julho de 2005. '
----- NER-----
FUNDAMENTO 3o, incisos I, II e III, e paragrafo unico da Emenda Constitucional n
----- REGEX ------
nan
------------------

Entities in 'a SANDRA DE OLIVEIRA ASENJO, matricula 38.134-9, no Cargo
de Professor de Educacao Basica, Padrao 22, Etapa IV, do Quadro de Pessoal do Distrito Federal, nos
termos do Artigo 6o, incisos I, II, III e IV, da Emenda Constitucional no 41, de 31 de dezembro de
2003 e artigo 2o da Emenda Constitucional no 47, de 06 de julho de 2005. '
----- NER-----
FUNDAMENTO 6o, incisos I, II, III e IV, da Emenda Constitucional no 41, de 31 de dezembro
----- REGEX ------
nan
------------------

Entities in 'ANA CLAUDIA DA SILVA BARROS, matricula 48.800-3, no Cargo de Professor de