### Classes

In [23]:
class Acordao:
    """One record pairing an ``ementa`` text with its ``assunto`` label."""

    def __init__(self, ementa, assunto):
        # Both values are stored as given; no validation or copying is done.
        self.ementa, self.assunto = ementa, assunto
        
class AcordaoContainer:
    """Wraps a sequence of objects exposing ``ementa`` and ``assunto`` attributes.

    The accessors return parallel lists (same order as ``acordaos``), so the
    i-th text from ``get_ementa`` matches the i-th label from ``get_assunto``.
    """

    def __init__(self, acordaos):
        # The sequence is kept by reference, not copied.
        self.acordaos = acordaos

    def get_ementa(self):
        """Return a new list with the ``ementa`` of every stored item."""
        ementas = []
        for item in self.acordaos:
            ementas.append(item.ementa)
        return ementas

    def get_assunto(self):
        """Return a new list with the ``assunto`` of every stored item."""
        assuntos = []
        for item in self.acordaos:
            assuntos.append(item.assunto)
        return assuntos

    def get_x(self, vectorizer):
        """Return ``vectorizer.transform(...)`` applied to the ementa list.

        ``vectorizer`` only needs a ``transform`` method accepting a list of
        texts; whatever that method returns is passed through unchanged.
        """
        ementas = self.get_ementa()
        return vectorizer.transform(ementas)

### Imports

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, recall_score, recall_score, precision_score, accuracy_score
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

### Carregando dados

In [25]:
# Locations of the pre-split, ';'-separated CSV exports. They are machine-specific,
# so they are kept as named constants to be edited in a single place.
TRAIN_CSV = r'C:\Users\willy\Desktop\TCC_classificador\Dados\decisoes_com_dados_adicionais_train.csv'
TEST_CSV = r'C:\Users\willy\Desktop\TCC_classificador\Dados\decisoes_com_dados_adicionais_test.csv'
TEXT_COLUMN = 'ementa'
LABEL_COLUMN = 'assunto_agrupado'


def _load_split(csv_path):
    """Read one data split from ``csv_path``.

    Returns a 4-tuple ``(raw, frame, texts, labels)``:
      * ``raw``    - the full DataFrame as read from the CSV;
      * ``frame``  - only the text and label columns of ``raw``;
      * ``texts``  - the ``ementa`` column of ``frame``;
      * ``labels`` - the ``assunto_agrupado`` column with every value converted
                     to ``str``, so train and test labels share one type.

    Raises ``KeyError`` if either expected column is missing, instead of
    silently producing an all-NaN column.
    """
    raw = pd.read_csv(csv_path, sep=";")
    frame = raw[[TEXT_COLUMN, LABEL_COLUMN]]
    texts = frame[TEXT_COLUMN]
    labels = frame[LABEL_COLUMN].map(str)
    return raw, frame, texts, labels


# Training data
acordaosTrain = []  # populated with Acordao objects in a later cell
dataTrain, dfTrain, ementasTrain, assuntosTrain = _load_split(TRAIN_CSV)
# Class distribution, counted on the labels as read (before the str conversion).
print(dfTrain[LABEL_COLUMN].value_counts())

# Test data
acordaosTest = []  # populated with Acordao objects in a later cell
dataTest, dfTest, ementasTest, assuntosTest = _load_split(TEST_CSV)
print(dfTest[LABEL_COLUMN].value_counts())

DESPESA                104
PROCESSUAL              99
LICITACAO               97
PESSOAL                 94
OUTROS                  84
PRESTACAO DE CONTAS     72
CONTRATO                68
PREVIDENCIA             56
AGENTE POLITICO         49
RESPONSABILIDADE        38
Name: assunto_agrupado, dtype: int64
DESPESA                26
LICITACAO              25
PROCESSUAL             25
PESSOAL                24
OUTROS                 22
PRESTACAO DE CONTAS    19
CONTRATO               18
PREVIDENCIA            14
AGENTE POLITICO        13
RESPONSABILIDADE       10
Name: assunto_agrupado, dtype: int64


In [26]:
# Pair each training ementa with its label by position. The list is rebuilt rather
# than appended to, so re-running this cell does not duplicate the entries, and
# zip() does not depend on the Series having a 0..n-1 integer index.
acordaosTrain = [
    Acordao(ementa, assunto)
    for ementa, assunto in zip(ementasTrain, assuntosTrain)
]

In [27]:
# Pair each test ementa with its label by position. The list is rebuilt rather
# than appended to, so re-running this cell does not duplicate the entries, and
# zip() does not depend on the Series having a 0..n-1 integer index.
acordaosTest = [
    Acordao(ementa, assunto)
    for ementa, assunto in zip(ementasTest, assuntosTest)
]

### Separando os dados para treino e teste

In [28]:
# Wrap each split so texts and labels can be pulled out as parallel lists.
train_container, test_container = (
    AcordaoContainer(acordaosTrain),
    AcordaoContainer(acordaosTest),
)

# The TF-IDF vocabulary is learned from the training texts only; the test split is
# merely transformed with the already-fitted vectorizer.
corpus = train_container.get_ementa()
vectorizer = TfidfVectorizer(max_df=0.7, min_df=5, max_features=1500)
vectorizer.fit(corpus)

# Features and labels used to train the models
train_x, train_y = train_container.get_x(vectorizer), train_container.get_assunto()
# Features and labels used to evaluate the models
test_x, test_y = test_container.get_x(vectorizer), test_container.get_assunto()

### Criando modelos

In [29]:
# Every estimator is trained on the same TF-IDF features and labels.

# Support-vector classifier with a linear kernel.
# NOTE(review): per the scikit-learn docs, gamma applies to the rbf/poly/sigmoid
# kernels, so it presumably has no effect here - confirm before removing.
svm_model = svm.SVC(kernel='linear', C=16, gamma='auto').fit(train_x, train_y)

# Multinomial Naive Bayes with default settings
nv_model = MultinomialNB().fit(train_x, train_y)

# Random forest; random_state is fixed so repeated runs build the same trees
rf_model = RandomForestClassifier(random_state=0, n_estimators=1000).fit(train_x, train_y)

# Logistic regression with default settings. The fit call is left as the final
# expression of the cell so the notebook still echoes the fitted estimator.
lr_model = LogisticRegression()
lr_model.fit(train_x, train_y)

LogisticRegression()

In [30]:
# Spot check: compare the true label of one test sample with the prediction of each
# fitted model, in the order SVM, Naive Bayes, Random Forest, Logistic Regression.
sample_idx = 1
for fitted_model in (svm_model, nv_model, rf_model, lr_model):
    print("valor real "+test_y[sample_idx])
    print("predição", fitted_model.predict(test_x[sample_idx]))

valor real LICITACAO
predição ['DESPESA']
valor real LICITACAO
predição ['LICITACAO']
valor real LICITACAO
predição ['PESSOAL']
valor real LICITACAO
predição ['PESSOAL']


### Testando os modelos

In [32]:
def _print_weighted_scores(y_true, y_pred):
    """Print weighted F1, recall and precision, then accuracy, for one prediction set."""
    print("F1 score:",f1_score(y_true, y_pred, average='weighted'))
    print("Recall score:",recall_score(y_true, y_pred, average='weighted'))
    print("Precision score:",precision_score(y_true, y_pred, average='weighted'))
    # The extra '\n' argument leaves a blank line after each model's report.
    print("Accuracy score:",accuracy_score(y_true, y_pred, normalize = True),'\n')


# SVM
print("SVM")
svm_predict = svm_model.predict(test_x)
_print_weighted_scores(test_y, svm_predict)

# Naive Bayes
print("Naive Bayes")
nv_predict = nv_model.predict(test_x)
_print_weighted_scores(test_y, nv_predict)

# Random Forest
print("Random Forest")
rf_predict = rf_model.predict(test_x)
_print_weighted_scores(test_y, rf_predict)

# Logistic Regression
print("Logistic Regression")
lr_predict = lr_model.predict(test_x)
_print_weighted_scores(test_y, lr_predict)

SVM
F1 score: 0.8609413563517967
Recall score: 0.8622448979591837
Precision score: 0.8621620726212563
Accuracy score: 0.8622448979591837 

Naive Bayes
F1 score: 0.6408830437946192
Recall score: 0.6632653061224489
Precision score: 0.6911992770621491
Accuracy score: 0.6632653061224489 

Random Forest


  _warn_prf(average, modifier, msg_start, len(result))


F1 score: 0.7970694228825992
Recall score: 0.8061224489795918
Precision score: 0.8050954409663883
Accuracy score: 0.8061224489795918 

Logistic Regression
F1 score: 0.7429410240467642
Recall score: 0.7551020408163265
Precision score: 0.7818392031721113
Accuracy score: 0.7551020408163265 

