### Classes

In [1]:
class Acordao:
    """Simple record pairing an `ementa` text with its `assunto` label."""

    def __init__(self, ementa, assunto):
        # Both values are stored untouched; no validation or copying happens here.
        self.ementa, self.assunto = ementa, assunto
        
class AcordaoContainer:
    """Holds a sequence of items exposing `ementa` and `assunto` attributes.

    Provides column-style accessors over the items plus a convenience method
    that runs the texts through an already-fitted vectorizer.
    """

    def __init__(self, acordaos):
        self.acordaos = acordaos

    def _campo(self, nome):
        """Collect attribute `nome` from every stored item, preserving order."""
        return [getattr(item, nome) for item in self.acordaos]

    def get_x(self, vectorizer):
        """Return `vectorizer.transform(...)` applied to the list of ementas."""
        textos = self.get_ementa()
        return vectorizer.transform(textos)

    def get_ementa(self):
        """Return a new list with the `ementa` of each item."""
        return self._campo("ementa")

    def get_assunto(self):
        """Return a new list with the `assunto` of each item."""
        return self._campo("assunto")

### Funções para tratamento

In [3]:
# spaCy pipelines already loaded in this kernel, keyed by model name.
# Loading a model is expensive, so it is done at most once per model.
_NLP_CACHE = {}


def _carregar_nlp(modelo='pt_core_news_lg'):
    """Return the spaCy pipeline for `modelo`, loading it only on first use."""
    if modelo not in _NLP_CACHE:
        _NLP_CACHE[modelo] = spacy.load(modelo)
    return _NLP_CACHE[modelo]


def removerStopWords(ementas):
    """Normalize a collection of texts for vectorization.

    For each text:
      1. every character that is not an ASCII letter/digit, one of the listed
         accented letters, a colon or a space is replaced by a space;
      2. the text is tokenized with the 'pt_core_news_lg' spaCy pipeline;
      3. tokens flagged as stop word, digit, whitespace or number-like are
         dropped;
      4. the remaining tokens are lower-cased and joined with single spaces.

    Parameters
    ----------
    ementas : iterable of str
        Raw texts to clean.

    Returns
    -------
    list of str
        One cleaned string per input text, in the same order.
    """
    nlp = _carregar_nlp()

    # Lazily strip unwanted characters so nlp.pipe can consume the texts in batches.
    limpos = (
        re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', ' ', texto)
        for texto in ementas
    )

    textos = []
    for doc in nlp.pipe(limpos):
        palavras = [
            token.text.lower()
            for token in doc
            if not (token.is_stop or token.is_digit or token.is_space or token.like_num)
        ]
        textos.append(" ".join(palavras))
    return textos

### Imports

In [4]:
import pickle
import re

import numpy as np
import pandas as pd
import spacy
from joblib import dump, load
from sklearn import svm
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

### Carregando dados

In [5]:
# Locations of the pre-split train/test CSV files.
# NOTE(review): these are absolute paths on one specific machine; point them at
# your local copy of the data before running the notebook elsewhere.
TRAIN_CSV = r'C:\Users\willy.silva\Desktop\tcc_algoritmo_classificador_acordaos\Dados\decisoes_com_dados_adicionais_train.csv'
TEST_CSV = r'C:\Users\willy.silva\Desktop\tcc_algoritmo_classificador_acordaos\Dados\decisoes_com_dados_adicionais_test.csv'


def extrair_ementas_e_assuntos(dados):
    """Pick the text and label columns out of a raw frame.

    Parameters
    ----------
    dados : pandas.DataFrame
        Frame as returned by `pd.read_csv`.

    Returns
    -------
    tuple (df, ementas, assuntos)
        `df` holds only the 'ementa' and 'assunto_agrupado' columns; `ementas`
        and `assuntos` are those columns with every value converted via
        `np.str_`.
    """
    df = pd.DataFrame(dados, columns=['ementa', 'assunto_agrupado'])
    # NOTE(review): np.str_ turns a missing value (NaN) into the literal string
    # 'nan', so empty cells become a "nan" text/label rather than being dropped.
    ementas = df['ementa'].apply(lambda x: np.str_(x))
    assuntos = df['assunto_agrupado'].apply(lambda x: np.str_(x))
    return df, ementas, assuntos


# Training data
acordaosTrain = []
dataTrain = pd.read_csv(TRAIN_CSV, sep=";")
dfTrain, ementasTrain, assuntosTrain = extrair_ementas_e_assuntos(dataTrain)
print(dfTrain['assunto_agrupado'].value_counts())

# Test data
acordaosTest = []
dataTest = pd.read_csv(TEST_CSV, sep=";")
dfTest, ementasTest, assuntosTest = extrair_ementas_e_assuntos(dataTest)
print(dfTest['assunto_agrupado'].value_counts())

DESPESA                104
PROCESSUAL              99
LICITACAO               97
PESSOAL                 94
OUTROS                  84
PRESTACAO DE CONTAS     72
CONTRATO                68
PREVIDENCIA             56
AGENTE POLITICO         49
RESPONSABILIDADE        38
Name: assunto_agrupado, dtype: int64
DESPESA                26
LICITACAO              25
PROCESSUAL             25
PESSOAL                24
OUTROS                 22
PRESTACAO DE CONTAS    19
CONTRATO               18
PREVIDENCIA            14
AGENTE POLITICO        13
RESPONSABILIDADE       10
Name: assunto_agrupado, dtype: int64


In [6]:
# Clean the texts and pair each cleaned text with its label.
# The lists are rebuilt from scratch (not appended to), so re-running this cell
# cannot duplicate samples. zip pairs values by position, so the result does
# not depend on the index labels of the label Series.
ementasLimpasTrain = removerStopWords(ementasTrain)
acordaosTrain = [
    Acordao(ementa, assunto)
    for ementa, assunto in zip(ementasLimpasTrain, assuntosTrain)
]

ementasLimpasTest = removerStopWords(ementasTest)
acordaosTest = [
    Acordao(ementa, assunto)
    for ementa, assunto in zip(ementasLimpasTest, assuntosTest)
]

### Vetorizando os dados de treino e teste (TF-IDF)

In [7]:
# Wrap both sample lists so texts and labels can be pulled out column-wise.
train_container, test_container = (
    AcordaoContainer(acordaosTrain),
    AcordaoContainer(acordaosTest),
)

# The TF-IDF vocabulary is learned from the training texts only.
corpus = train_container.get_ementa()
vectorizer = TfidfVectorizer(max_df=0.7, min_df=5, max_features=1500).fit(corpus)

# Labels for training and testing
train_y = train_container.get_assunto()
test_y = test_container.get_assunto()

# Feature matrices for training and testing, both produced by the same fitted vectorizer
train_x = train_container.get_x(vectorizer)
test_x = test_container.get_x(vectorizer)

### Criando modelos

In [8]:
# SVM (SVC)
# probability=True makes SVC fit an internal cross-validated calibration whose
# data shuffling is random; random_state pins it so predict_proba is
# reproducible across runs. gamma is not used by the linear kernel.
svm_model = svm.SVC(C=16, kernel='linear', gamma='auto', probability=True, random_state=0)
svm_model.fit(train_x, train_y)

# Naive Bayes
nv_model = MultinomialNB()
nv_model.fit(train_x, train_y)

# Random Forest
rf_model = RandomForestClassifier(n_estimators=1000, random_state=0)
rf_model.fit(train_x, train_y)

# Logistic Regression
lr_model = LogisticRegression()
lr_model.fit(train_x, train_y)

LogisticRegression()

In [14]:
# Spot-check one test sample (position 1) against every model.
amostra = test_x[1]
linha_real = "valor real " + test_y[1]

# For the SVM, show the per-class probabilities together with the class order.
print(linha_real)
print("predição", svm_model.predict_proba(amostra))
print(svm_model.classes_)

# For the remaining models, show the predicted label.
for modelo in (nv_model, rf_model, lr_model):
    print(linha_real)
    print("predição", modelo.predict(amostra))

valor real LICITACAO
predição [[0.00979941 0.01924559 0.20377699 0.44290259 0.06620563 0.20435873
  0.01059414 0.02217774 0.00894164 0.01199754]]
['AGENTE POLITICO' 'CONTRATO' 'DESPESA' 'LICITACAO' 'OUTROS' 'PESSOAL'
 'PRESTACAO DE CONTAS' 'PREVIDENCIA' 'PROCESSUAL' 'RESPONSABILIDADE']
valor real LICITACAO
predição ['LICITACAO']
valor real LICITACAO
predição ['PESSOAL']
valor real LICITACAO
predição ['LICITACAO']


### Testando os modelos

In [15]:
def avaliar_modelo(nome, modelo, x, y_real):
    """Print a metric report for one fitted model and return its predictions.

    Parameters
    ----------
    nome : str
        Title printed before the metrics.
    modelo : fitted estimator exposing `predict`
    x : feature matrix passed to `modelo.predict`
    y_real : true labels matching the rows of `x`

    Returns
    -------
    The array returned by `modelo.predict(x)`.

    Prints the support-weighted F1, recall and precision, followed by the
    accuracy (fraction of correct predictions).
    """
    print(nome)
    predicoes = modelo.predict(x)
    print("F1 score:",f1_score(y_real, predicoes, average='weighted'))
    print("Recall score:",recall_score(y_real, predicoes, average='weighted'))
    print("Precision score:",precision_score(y_real, predicoes, average='weighted'))
    print("Accuracy score:",accuracy_score(y_real, predicoes, normalize = True),'\n')
    return predicoes


# SVM
svm_predict = avaliar_modelo("SVM", svm_model, test_x, test_y)

# Naive Bayes
nv_predict = avaliar_modelo("Naive Bayes", nv_model, test_x, test_y)

# Random Forest
rf_predict = avaliar_modelo("Random Forest", rf_model, test_x, test_y)

# Logistic Regression
lr_predict = avaliar_modelo("Logistic Regression", lr_model, test_x, test_y)

SVM
F1 score: 0.8764236913838859
Recall score: 0.8775510204081632
Precision score: 0.8777501048197588
Accuracy score: 0.8775510204081632 

Naive Bayes
F1 score: 0.650792416843852
Recall score: 0.673469387755102
Precision score: 0.6839185895401485
Accuracy score: 0.673469387755102 

Random Forest


  _warn_prf(average, modifier, msg_start, len(result))


F1 score: 0.828983768589426
Recall score: 0.8367346938775511
Precision score: 0.8364835811858878
Accuracy score: 0.8367346938775511 

Logistic Regression
F1 score: 0.7290680430254465
Recall score: 0.7397959183673469
Precision score: 0.7625489590849929
Accuracy score: 0.7397959183673469 



In [18]:
# Bundle vectorization + SVM into one object that accepts cleaned text directly.
# The steps are unfitted clones with the same hyper-parameters, so fitting the
# pipeline does not refit (and thereby alter) the `vectorizer` and `svm_model`
# objects that were evaluated above.
pipeline = make_pipeline(clone(vectorizer), clone(svm_model))

pipeline.fit(train_container.get_ementa(), train_container.get_assunto())

Pipeline(steps=[('tfidfvectorizer',
                 TfidfVectorizer(max_df=0.7, max_features=1500, min_df=5)),
                ('svc',
                 SVC(C=16, gamma='auto', kernel='linear', probability=True))])

In [19]:
# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('pipeline.pickle', 'wb') as arquivo:
    pickle.dump(pipeline, arquivo)