In [1]:
import pandas as pd
from libs import preprocessa, modelos, vo_X, metricas
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
# Fixed seed; passed to the SMOTE resampler further down in this notebook.
random_state=26

In [2]:
# Data directory, relative to the notebook (valid when running locally).
caminhoDados = "dados/"

# dfAlvoAnota.json is generated by the TrataSegmentosAnotam.ipynb notebook.
df = pd.read_json(f"{caminhoDados}dfAlvoAnota.json")

# Taken before the class filter applied in the next cell, so it spans every
# row of the loaded file. It is handed as the first argument to the
# preprocessa helpers used below.
vocabulario = df["textoTotal"]


In [3]:
# Keep only the most relevant classes (those with at least 0.5% presence).
classesInteresse = [1,3,9,22,24,13,10,4,25]

# .copy() makes the filtered frame an independent object, so the in-place
# relabel below is unambiguous and does not raise SettingWithCopyWarning.
df = df[df['idTipoAnotacao'].isin(classesInteresse)].copy()

# Merge class 3 into class 1 (original note: "União Alt e inc").
df.loc[df['idTipoAnotacao'] == 3, 'idTipoAnotacao'] = 1

# Optional sanity check of the resulting class sizes:
# df.groupby(['idTipoAnotacao','nomeTipoAnotacao']).idSegmento.count()


In [4]:
## Initial feature sets
y = df['idTipoAnotacao']

# (source column, strategy label) pairs, in the order they are appended to
# dadosOriginais. Each column is cast to str and vectorised with
# preprocessa.vetorizaTFIDF against the shared vocabulary.
colunasEstrategias = [
    ("textoIntegraLimpo", "Segmento"),
    ("verbosTextoIntegra", "VerbosSegmento"),
    ("textoTotal", "Segmento+Ementa"),
    ("verbos", "TodosVerbos"),
]

dadosOriginais = []
for coluna, estrategia in colunasEstrategias:
    x = preprocessa.vetorizaTFIDF(vocabulario, df[coluna].astype(str))
    dadosOriginais.append(vo_X.vo_X(x, y, estrategia))


In [5]:
## Feature selection
# Every original feature set is passed through preprocessa.seleciona at three
# settings (0.1, 0.25, 0.5); the label suffix records which setting was used.
# NOTE(review): the "Chi" labels suggest chi-squared selection keeping that
# fraction of features — confirm against libs/preprocessa.
niveisSelecao = ((0.1, " Chi 0.1"), (0.25, " Chi 0.25"), (0.5, " Chi 0.5"))

dadosChi = []
for dados in dadosOriginais:
    for nivel, sufixo in niveisSelecao:
        xSelecionado = preprocessa.seleciona(vocabulario, dados.x, y, nivel)
        dadosChi.append(vo_X.vo_X(xSelecionado, y, dados.estrategia + sufixo))
    
   

In [6]:
# SMOTE oversampling of every feature set: the originals first, then the
# feature-selected ones.
# fit_resample is the supported sampler entry point (fit_sample was deprecated
# in imbalanced-learn 0.4 and removed in 0.8) and is what the NearMiss cell
# already calls.
# NOTE(review): resampling happens before classificadores.classificar sees the
# data. If that method cross-validates, synthetic samples can land in the
# evaluation folds and inflate the reported scores — confirm, and consider
# resampling inside each training fold instead.
smote = SMOTE(random_state=random_state)
dadosSmote = []
for dados in [*dadosOriginais, *dadosChi]:
    xs, ys = smote.fit_resample(dados.x, y)
    dadosSmote.append(vo_X.vo_X(xs, ys, dados.estrategia + " Smote"))

In [7]:
# NearMiss resampling of every feature set: the originals first, then the
# feature-selected ones. Each resampled set gets a " NearMiss" label suffix.
sampler = NearMiss()
dadosNearMiss = []

for dados in dadosOriginais + dadosChi:
    xn, yn = sampler.fit_resample(dados.x, y)
    rotulo = dados.estrategia + " NearMiss"
    dadosNearMiss.append(vo_X.vo_X(xn, yn, rotulo))


In [8]:
# All experiment configurations, in evaluation order:
# originals, feature-selected, SMOTE-resampled, NearMiss-resampled.
listaDados = dadosOriginais + dadosChi + dadosSmote + dadosNearMiss

In [9]:
# Run the classifiers on every configuration. A fresh classificadores object
# is created per configuration, with its LSVC and MNB flags switched on.
resultadoGeral = []
for dados in listaDados:
    classificadores = modelos.classificadores()
    classificadores.LSVC = True
    classificadores.MNB = True
    resultadosClassificacao = classificadores.classificar(dados.x, dados.y)
    # Tag each result row with its strategy label at position 1.
    for res in resultadosClassificacao:
        res.insert(1, dados.estrategia)
    resultadoGeral += resultadosClassificacao

In [10]:
# Column labels for the result rows. Each metric is followed by a 'DP' column,
# so the 'DP' label is repeated four times.
# NOTE(review): 'DP' presumably stands for standard deviation — confirm.
nomesColunas = [
    'Modelo', 'Estratégia',
    'Acurácia %', 'DP',
    'Precisão', 'DP',
    'Revocação', 'DP',
    'F1', 'DP',
]

df_resultado = pd.DataFrame(resultadoGeral)
df_resultado.columns = nomesColunas

# Replace the estimator class names with shorter display names.
df_resultado['Modelo'] = df_resultado['Modelo'].replace(
    {'LinearSVC': 'SVM', 'MultinomialNB': 'Naive Bayes'}
)

# Display the table ranked by F1, best first.
df_resultado.sort_values(by='F1', ascending=False)

Unnamed: 0,Modelo,Estratégia,Acurácia %,DP,Precisão,DP.1,Revocação,DP.2,F1,DP.3
36,SVM,Segmento+Ementa Smote,98.90,0.0084,98.97,0.0069,98.90,0.0084,98.89,0.0086
32,SVM,Segmento Smote,97.60,0.0044,97.84,0.0035,97.60,0.0044,97.63,0.0043
56,SVM,Segmento+Ementa Chi 0.5 Smote,96.96,0.0090,96.98,0.0086,96.96,0.0090,96.93,0.0094
62,SVM,TodosVerbos Chi 0.5 Smote,96.39,0.0070,96.58,0.0058,96.39,0.0070,96.38,0.0074
38,SVM,TodosVerbos Smote,96.25,0.0076,96.45,0.0063,96.25,0.0076,96.24,0.0080
60,SVM,TodosVerbos Chi 0.25 Smote,95.76,0.0073,95.96,0.0060,95.76,0.0073,95.74,0.0078
54,SVM,Segmento+Ementa Chi 0.25 Smote,95.47,0.0095,95.57,0.0097,95.47,0.0095,95.44,0.0099
33,Naive Bayes,Segmento Smote,94.68,0.0056,94.75,0.0056,94.68,0.0056,94.58,0.0058
44,SVM,Segmento Chi 0.5 Smote,94.28,0.0033,95.00,0.0026,94.28,0.0033,94.36,0.0031
37,Naive Bayes,Segmento+Ementa Smote,93.98,0.0137,94.13,0.0118,93.98,0.0137,93.90,0.0145


In [11]:
# Persist the (unsorted) results table, index included, next to the notebook.
df_resultado.to_csv(path_or_buf="resultado.csv")

In [12]:
# Emit the F1-ranked table as LaTeX source (without the index column).
# print() is deliberate here: the raw LaTeX text is the desired output.
tabelaLatex = df_resultado.sort_values(by='F1', ascending=False).to_latex(index=False)
print(tabelaLatex)

\begin{tabular}{llrrrrrrrr}
\toprule
      Modelo &                         Estratégia &  Acurácia \% &      DP &  Precisão &      DP &  Revocação &      DP &     F1 &      DP \\
\midrule
         SVM &              Segmento+Ementa Smote &       98.90 &  0.0084 &     98.97 &  0.0069 &      98.90 &  0.0084 &  98.89 &  0.0086 \\
         SVM &                     Segmento Smote &       97.60 &  0.0044 &     97.84 &  0.0035 &      97.60 &  0.0044 &  97.63 &  0.0043 \\
         SVM &      Segmento+Ementa Chi 0.5 Smote &       96.96 &  0.0090 &     96.98 &  0.0086 &      96.96 &  0.0090 &  96.93 &  0.0094 \\
         SVM &          TodosVerbos Chi 0.5 Smote &       96.39 &  0.0070 &     96.58 &  0.0058 &      96.39 &  0.0070 &  96.38 &  0.0074 \\
         SVM &                  TodosVerbos Smote &       96.25 &  0.0076 &     96.45 &  0.0063 &      96.25 &  0.0076 &  96.24 &  0.0080 \\
         SVM &         TodosVerbos Chi 0.25 Smote &       95.76 &  0.0073 &     95.96 &  0.0060 &      95.7