In [1]:
import pandas as pd
from libs import preprocessa, modelos, vo_X, metricas
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss
random_state=26

In [2]:
#Se estiver rodando localmente 
caminhoDados="dados/"

#Esse json é gerado no notebook TrataSegmentosAnotam.ipynb
df  = pd.read_json(caminhoDados+'dfAlvoAnota.json') 
vocabulario = df["textoTotal"]


In [3]:
#filtrar as classes mais relevantes a partir de 0.5% de presença
classesInteresse = [1,3,9,22,24,13,10,4,25]

#União Alt e inc
df =  df[df['idTipoAnotacao'].isin(classesInteresse)]
df.loc[df['idTipoAnotacao'] == 3, 'idTipoAnotacao'] = 1

#dfInteresse.groupby(['idTipoAnotacao','nomeTipoAnotacao']).idSegmento.count()


In [4]:
##Atributos iniciais
y = df['idTipoAnotacao']

dadosOriginais = []

vocabulario = df["textoIntegraLimpo"]
x = preprocessa.vetorizaTFIDF(vocabulario,df["textoIntegraLimpo"].astype(str))
dadosOriginais.append(vo_X.vo_X(x,y,"Segmento"))

vocabulario = df["verbosTextoIntegra"]
x = preprocessa.vetorizaTFIDF(vocabulario,df["verbosTextoIntegra"].astype(str))
dadosOriginais.append(vo_X.vo_X(x,y,"VerbosSegmento"))

vocabulario = df["textoTotal"]
x = preprocessa.vetorizaTFIDF(vocabulario,df["textoTotal"].astype(str))
dadosOriginais.append(vo_X.vo_X(x,y,"Segmento+Ementa"))

vocabulario = df["verbos"]
x = preprocessa.vetorizaTFIDF(vocabulario,df["verbos"].astype(str))
dadosOriginais.append(vo_X.vo_X(x,y,"TodosVerbos"))


In [5]:
## Seleção Atributos
dadosChi = []
for dados in dadosOriginais:
    xChi1 = preprocessa.seleciona(vocabulario,dados.x, y,0.1)
    dadosChi.append(vo_X.vo_X(xChi1,y, dados.estrategia + " Chi 0.1"))
    
    xChi25 = preprocessa.seleciona(vocabulario,dados.x, y,0.25)
    dadosChi.append(vo_X.vo_X(xChi25,y, dados.estrategia + " Chi 0.25"))
    
    xChi50 = preprocessa.seleciona(vocabulario,dados.x, y,0.5)
    dadosChi.append(vo_X.vo_X(xChi50,y, dados.estrategia + " Chi 0.5"))
    
   

In [6]:
#Reamostragem Smote
smote = SMOTE(random_state=random_state)
dadosSmote = []
for dados in dadosOriginais:     
    xs, ys = smote.fit_sample(dados.x,y)
    dadosSmote.append(vo_X.vo_X(xs, ys, dados.estrategia + " Smote"))
    
for dados in dadosChi:     
    xs, ys = smote.fit_sample(dados.x,y)
    dadosSmote.append(vo_X.vo_X(xs, ys, dados.estrategia + " Smote"))

In [7]:
#Reamostragem NearMiss
sampler = NearMiss()
dadosNearMiss = []

for dados in dadosOriginais: 
    xn, yn = sampler.fit_resample(dados.x,y)
    dadosNearMiss.append(vo_X.vo_X(xn, yn, dados.estrategia + " NearMiss"))

for dados in dadosChi:
    xn, yn = sampler.fit_resample(dados.x,y)
    dadosNearMiss.append(vo_X.vo_X(xn, yn, dados.estrategia + " NearMiss"))


In [8]:
listaDados = [*dadosOriginais, *dadosChi, *dadosSmote, *dadosNearMiss]

In [9]:
resultadoGeral = []
for dados in listaDados:
    if dados.estrategia == 'Segmento+Ementa Smote':
        classificadores = modelos.classificadores()
        classificadores.LSVC = True
        #classificadores.MNB = True
        resultadosClassificacao =  classificadores.classificar(dados.x,dados.y)
    for res in resultadosClassificacao:
        res.insert(1, dados.estrategia) 
    resultadoGeral.extend(resultadosClassificacao)   

In [10]:
df_resultado  = pd.DataFrame(resultadoGeral)
df_resultado.columns = ['Modelo','Estratégia',
                        'Acurácia %','DP','Precisão',
                        'DP', 'Revocação','DP', 'F1','DP' ]
df_resultado.loc[df_resultado['Modelo']=='LinearSVC','Modelo']='SVM'
df_resultado.loc[df_resultado['Modelo']=='MultinomialNB','Modelo']='Naive Bayes'
df_resultado.sort_values(['F1'], ascending = False)

Unnamed: 0,Modelo,Estratégia,Acurácia %,DP,Precisão,DP.1,Revocação,DP.2,F1,DP.3
18,SVM,Segmento+Ementa Smote,98.9,0.0084,98.97,0.0069,98.9,0.0084,98.89,0.0086
16,SVM,Segmento Smote,97.58,0.0045,97.83,0.0036,97.58,0.0045,97.61,0.0045
19,SVM,TodosVerbos Smote,96.8,0.0089,97.04,0.0075,96.79,0.0089,96.79,0.0094
28,SVM,Segmento+Ementa Chi 0.5 Smote,94.93,0.0098,95.07,0.0099,94.93,0.0098,94.91,0.0101
27,SVM,Segmento+Ementa Chi 0.25 Smote,93.63,0.0096,93.78,0.0095,93.63,0.0096,93.59,0.0098
31,SVM,TodosVerbos Chi 0.5 Smote,92.92,0.0097,93.27,0.01,92.92,0.0097,92.92,0.01
17,SVM,VerbosSegmento Smote,92.68,0.0078,94.05,0.004,92.68,0.0078,92.88,0.0074
22,SVM,Segmento Chi 0.5 Smote,91.93,0.0022,93.14,0.0022,91.93,0.0022,92.13,0.0021
26,SVM,Segmento+Ementa Chi 0.1 Smote,91.3,0.0099,91.56,0.0086,91.3,0.0099,91.23,0.0097
30,SVM,TodosVerbos Chi 0.25 Smote,90.64,0.0081,91.19,0.0094,90.64,0.0081,90.63,0.0084


In [11]:
df_resultado.to_csv("resultado.csv")

In [12]:
print(df_resultado.sort_values(['F1'], ascending = False).to_latex(index=False))

\begin{tabular}{llrrrrrrrr}
\toprule
Modelo &                         Estratégia &  Acurácia \% &      DP &  Precisão &      DP &  Revocação &      DP &     F1 &      DP \\
\midrule
   SVM &              Segmento+Ementa Smote &       98.90 &  0.0084 &     98.97 &  0.0069 &      98.90 &  0.0084 &  98.89 &  0.0086 \\
   SVM &                     Segmento Smote &       97.58 &  0.0045 &     97.83 &  0.0036 &      97.58 &  0.0045 &  97.61 &  0.0045 \\
   SVM &                  TodosVerbos Smote &       96.80 &  0.0089 &     97.04 &  0.0075 &      96.79 &  0.0089 &  96.79 &  0.0094 \\
   SVM &      Segmento+Ementa Chi 0.5 Smote &       94.93 &  0.0098 &     95.07 &  0.0099 &      94.93 &  0.0098 &  94.91 &  0.0101 \\
   SVM &     Segmento+Ementa Chi 0.25 Smote &       93.63 &  0.0096 &     93.78 &  0.0095 &      93.63 &  0.0096 &  93.59 &  0.0098 \\
   SVM &          TodosVerbos Chi 0.5 Smote &       92.92 &  0.0097 &     93.27 &  0.0100 &      92.92 &  0.0097 &  92.92 &  0.0100 \\
   SVM &

In [57]:
from sklearn.model_selection import cross_val_predict
from sklearn.svm import LinearSVC

modelo = LinearSVC(random_state=26, max_iter=1000)

for dado in listaDados:
    if dado.estrategia == 'Segmento+Ementa Smote':
        y_pred = cross_val_predict(modelo, dado.x,dado.y, cv=10)
        y_real = dado.y
        classes = dado.y.unique()
from libs import metricas
matrix = metricas.matrixConfusa(y_real, y_pred, classes)
matrix  

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear]

Unnamed: 0,24,1,9,22,10,25,13,4
24,13143,0,30,212,0,11,1,0
1,69,12578,516,80,0,147,0,7
9,18,32,13310,0,34,3,0,0
22,16,3,0,13378,0,0,0,0
10,0,0,0,0,13397,0,0,0
25,0,0,0,0,0,13397,0,0
13,0,0,0,0,0,0,13397,0
4,0,0,0,0,0,0,0,13397


In [81]:
for dado in listaDados:
    if dado.estrategia == 'Segmento+Ementa Smote':
        model =  LinearSVC(random_state=26, max_iter=1000, penalty='l2', C=1)
        accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1 =                metricas.calculaMetricasCV(model,dado.x, dado.y)

In [83]:
print( accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1)

98.9 0.0084 98.97 0.0069 98.9 0.0084 98.89 0.0086


In [79]:

for dado in listaDados:
    if dado.estrategia == 'Segmento+Ementa Smote':
        model =  LinearSVC(random_state=26, max_iter=1000, penalty='l2', C=1.5)
        accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1 =                metricas.calculaMetricasCV(model,dado.x, dado.y)

In [80]:
print(accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1)

98.92999999999999 0.0083 99.0 0.0068 98.92999999999999 0.0083 98.92 0.0085


In [84]:
for dado in listaDados:
    if dado.estrategia == 'Segmento+Ementa Smote':
        model =  LinearSVC(random_state=26, max_iter=1000, penalty='l2', C=2)
        accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1 =                metricas.calculaMetricasCV(model,dado.x, dado.y)

In [85]:
print( accuracy,std_acc, precision,std_pre, recall, std_rec,f1,std_f1)

98.95 0.0079 99.02 0.0065 98.95 0.0079 98.94 0.0081
