# Teste de Classificadores com ajuste de peso e modelos de arvore com up/downsampling 

O objetivo é testar os modelos XGBoost e adaBoost sem sampling e a floresta aleatoria e arvore de decisão com sampling, analisar qual deles tem a melhor metrica de recall para o banco de dados com todas as 4 tabelas e qual tem o melhor ponto de cotovelo na curva recall-precision.

## Preparar os dados e criar classe de processamento 

### Importando bibliotecas e dados

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from itertools import islice

from tqdm.auto import tqdm
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords as nltk_stopwords

import xgboost as xgb
from xgboost import plot_tree

from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from enelvo.normaliser import Normaliser
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_denuncia_crime=pd.read_csv('/Users/mhctds/Cidade_Social/base de dados/base_apps_denuncia_crime.csv')
df_base_rocinha=pd.read_csv('/Users/mhctds/Cidade_Social/base de dados/base_rocinha_df.csv')
df_protestos_2013=pd.read_csv('/Users/mhctds/Cidade_Social/base de dados/protestos_2013_df.csv')
df_protestos_PMES=pd.read_csv('/Users/mhctds/Cidade_Social/base de dados/protestos_PMES.csv')

### Criar classe de processamento

In [10]:
class DataProcessing:
    vect=None
    # Junta os dataframes dentro do vetor de dataframes
    def append_data(self,df_vector):
        df_final=df_vector[0]
        df_final=df_final[['text','Total(SUM)','Classe de Violência']]
        for df in islice(df_vector, 1, None) :
            df=df[['text','Total(SUM)','Classe de Violência']]
            df_final=pd.concat([df_final]+[df])
        df_final.reset_index()
        df_final['text']=df_final['text'].astype('str')
        return df_final
    
    #Normalização usando spacy
    def text_normalizer_spacy(self,corpus):
        nlp = spacy.load('pt_core_news_sm', disable=['parser', 'ner'])
        lemm=[]
        for text in tqdm(corpus):
            doc = nlp(text)
            #tokens = [token.lemma_ for token in doc if not token.is_stop]
            tokens = [token.lemma_ for token in doc]
            text= ' '.join(tokens)
            
            lemm.append(text)
        return lemm
    # Normalização usando enelvo e vetorização usando nltk 
    def text_preprocessing_nltk(self,corpus):
        stop_words=list(nltk_stopwords.words('portuguese'))
        norm = Normaliser(tokenizer='readable',sanitize=True)
        lemm=[]
        for texts in corpus:
            lemm.append(norm.normalise(texts))
        if self.vect is None:
            self.vect=TfidfVectorizer(stop_words=stop_words)
            self.vect.fit(corpus)
        processed=self.vect.transform(lemm)
        return processed
    #Vetorização usando nltk
    def text_preprocessing_nltk_no_norm(self,corpus):
        stop_words=list(nltk_stopwords.words('portuguese'))
        if self.vect is None:
            self.vect=TfidfVectorizer(stop_words=stop_words)
            self.vect.fit(corpus)
        processed=self.vect.transform(corpus)
        return processed
    #Mudar target para valor numerico
    def numerical_target(target):
        target.replace('Not Violence',0,inplace=True)
        target.replace('Low',1,inplace=True)
        target.replace('Medium',2,inplace=True)
        target.replace('High',3,inplace=True)
        target.replace('VeryHight',4,inplace=True)
        return target
    #fraction é a fração que vai sobrar do original, deve ser colocado um valor entre 0 e 1
    # Se usado 0.3 por exemplo, perderemos 60% dos registros daquele target, sobrando 30 porcento
    def downsample(self,features, target, fraction,value):
        features_true = features[target == value]
        features_false = features[target != value]
        target_true = target[target == value]
        target_false = target[target != value]

        features_downsampled = pd.concat(
            [features_true.sample(frac=fraction, random_state=12345)]
            + [features_false]
        )
        target_downsampled = pd.concat(
            [target_true.sample(frac=fraction, random_state=12345)]
            + [target_false]
        )

        return features_downsampled, target_downsampled
    # repeat é o numero de vezes que aquele target sera clonado, deve ser um int maior que 1
    def upsample(self,features, target, repeat,value):
        features_true = features[target == value]
        features_false = features[target != value]
        target_true = target[target == value]
        target_false = target[target != value]

        features_upsampled = pd.concat([features_false] + [features_true] * repeat)
        target_upsampled = pd.concat([target_false] + [target_true] * repeat)

        return features_upsampled, target_upsampled

### Unir dataframes e separar em features e target

In [11]:
df_vector=[df_base_rocinha,df_denuncia_crime,df_protestos_2013,df_protestos_PMES]
DataProcess=DataProcessing()
df_final=DataProcess.append_data(df_vector)
df_final.info()
print(df_final.head(5))

<class 'pandas.core.frame.DataFrame'>
Index: 2038 entries, 0 to 503
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   text                 2038 non-null   object 
 1   Total(SUM)           2037 non-null   float64
 2   Classe de Violência  2038 non-null   object 
dtypes: float64(1), object(2)
memory usage: 63.7+ KB
                                                text  Total(SUM)  \
0  RT @tvjornalsbt: "Morte" denuncia a precarieda...         8.0   
1  RT @tvjornalsbt: "Morte" denuncia a precarieda...         8.0   
2  "Morte" denuncia a precariedade da BR-101, na ...         8.0   
3  Terremoto de 7.1 no México. Onde vivo não pass...         7.0   
4  RT @vinigrilo1: Moradores com medo e assustado...         8.0   

  Classe de Violência  
0                High  
1                High  
2                High  
3                High  
4                High  


In [None]:
df_final=df_final.drop_duplicates().reset_index()
features=DataProcess.text_preprocessing_nltk(df_final['text'])
target=df_final['Classe de Violência']
train_data, test_data, train_target, test_target = train_test_split(features, target, test_size=0.3, random_state=12345,shuffle=True,stratify=target)

  (0, 6508)	0.18169683456799995
  (0, 5979)	0.4059322946811416
  (0, 5660)	0.18277276388524277
  (0, 5070)	0.3486059109880911
  (0, 4757)	0.3847748321246903
  (0, 3936)	0.32744844843163967
  (0, 1804)	0.3847748321246903
  (0, 1075)	0.32744844843163967
  (0, 32)	0.3697633735445424
  (1, 6508)	0.1988141019894129
  (1, 5660)	0.19999139229015095
  (1, 5070)	0.38144732298767775
  (1, 4757)	0.42102364027903755
  (1, 3936)	0.35829666145559
  (1, 1804)	0.42102364027903755
  (1, 1075)	0.35829666145559
  (1, 32)	0.4045979845197655
  (2, 6508)	0.1370384881407038
  (2, 6313)	0.2700991801104901
  (2, 5791)	0.2788811032444624
  (2, 5660)	0.13784997023026124
  (2, 4969)	0.2516019583874566
  (2, 4735)	0.20091782567980757
  (2, 4433)	0.29020297130837946
  (2, 4296)	0.2516019583874566
  :	:
  (1754, 2249)	0.16706834099689596
  (1754, 1890)	0.4654480529596111
  (1754, 1480)	0.4411885892965845
  (1755, 5384)	0.18222955621179246
  (1755, 5052)	0.5128196430430895
  (1755, 4119)	0.3574836934201622
  (1755, 2

In [21]:
train_target.info()

<class 'pandas.core.series.Series'>
Index: 1230 entries, 1392 to 1433
Series name: Classe de Violência
Non-Null Count  Dtype 
--------------  ----- 
1230 non-null   object
dtypes: object(1)
memory usage: 19.2+ KB


## Testando Modelos de Classificação

### XGBoost

In [6]:
# Consertar kernel e dependencias 
"""
best_recall=0
recall_list_XGBoost=[]
precision_list_XGBoost=[]
xgb_train_target=DataProcessing.numerical_target(train_target)
xgb_test_target=DataProcessing.numerical_target(test_target)
#Talvez não tenha parametro random_state
#Tem parametro learning rate, talvez vale a pena testar
for estimators in tqdm(range(1,100)):
    for depth in range(1,15):
        model = xgb.XGBClassifier(n_estimators=estimators,max_depth=depth, random_state=12345)
        model.fit(train_data,xgb_train_target)
        prediction=model.predict(test_data)
        recall=recall_score(xgb_test_target,prediction)
        precision=precision_score(xgb_test_target,prediction)
        recall_list_XGBoost.append(recall)
        precision_list_XGBoost.append(precision)
        if best_recall<recall:
            best_depth=depth
            best_recall=recall
            best_estimators=estimators
            best_acc=accuracy_score(test_target,prediction)
            best_f1=f1_score(test_target,prediction)
            best_precision=precision
print('best depth:',best_depth)
print('best number of estimators:',best_estimators)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision) 
"""

"\nbest_recall=0\nrecall_list_XGBoost=[]\nprecision_list_XGBoost=[]\nxgb_train_target=DataProcessing.numerical_target(train_target)\nxgb_test_target=DataProcessing.numerical_target(test_target)\n#Talvez não tenha parametro random_state\n#Tem parametro learning rate, talvez vale a pena testar\nfor estimators in tqdm(range(1,100)):\n    for depth in range(1,15):\n        model = xgb.XGBClassifier(n_estimators=estimators,max_depth=depth, random_state=12345)\n        model.fit(train_data,xgb_train_target)\n        prediction=model.predict(test_data)\n        recall=recall_score(xgb_test_target,prediction)\n        precision=precision_score(xgb_test_target,prediction)\n        recall_list_XGBoost.append(recall)\n        precision_list_XGBoost.append(precision)\n        if best_recall<recall:\n            best_depth=depth\n            best_recall=recall\n            best_estimators=estimators\n            best_acc=accuracy_score(test_target,prediction)\n            best_f1=f1_score(test_targ

### AdaBoost

In [7]:
best_recall=0
recall_list_adaboost=[]
precision_list_adaboost=[]
#Talvez não tenha parametro  ou random_state
#Talvez tenha parametro learning rate, talvez vale a pena testar
for estimators in tqdm(range(1,100)):
    for depth in range(1,10):
        model = AdaBoostClassifier(n_estimators=estimators, random_state=12345)
        model.fit(train_data,train_target)
        prediction=model.predict(test_data)
        recall=recall_score(test_target,prediction,average='weighted')
        precision=precision_score(test_target,prediction,average='weighted',zero_division=0)
        recall_list_adaboost.append(recall)
        precision_list_adaboost.append(precision)
        if best_recall<recall:
            best_model=model
            best_depth=depth
            best_recall=recall
            best_estimators=estimators
            best_acc=accuracy_score(test_target,prediction)
            best_f1=f1_score(test_target,prediction,average='weighted')
            best_precision=precision
print('best depth:',best_depth)
print('best number of estimators:',best_estimators)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision)

100%|██████████| 99/99 [05:37<00:00,  3.41s/it]

best depth: 1
best number of estimators: 1
best recall: 0.7348484848484849
best accuracy: 0.7348484848484849
best f1: 0.670388091440723
best precision: 0.691673670796844





### Floresta Aleatoria sem Sampling

In [8]:
best_recall=0
recall_list_florest=[]
precision_list_florest=[]
for size in tqdm(range(1,50)):
    for depth in range(1,30):
        model=RandomForestClassifier(random_state=123456789,max_depth=depth,n_estimators=size)
        model.fit(train_data,train_target)
        prediction=model.predict(test_data)
        recall=recall_score(test_target,prediction,average='weighted')
        precision=precision_score(test_target,prediction,average='weighted',zero_division=0)
        recall_list_florest.append(recall)
        precision_list_florest.append(precision)
        if best_recall<recall:
            best_depth=depth
            best_recall=recall
            best_size=size
            best_acc=accuracy_score(test_target,prediction)
            best_f1=f1_score(test_target,prediction,average='weighted')
            best_precision=precision
print('best depth:',best_depth)
print('best size:',best_size)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision)

100%|██████████| 49/49 [01:19<00:00,  1.62s/it]

best depth: 26
best size: 14
best recall: 0.821969696969697
best accuracy: 0.821969696969697
best f1: 0.7883088358696742
best precision: 0.7776370756231701





### Arvore de decisão sem Sampling

In [9]:
best_recall=0
recall_list_tree=[]
precision_list_tree=[]
for depth in tqdm(range(1,100)):
    model=DecisionTreeClassifier(random_state=123456789,max_depth=depth)
    model.fit(train_data,train_target)
    prediction=model.predict(test_data)
    recall=recall_score(test_target,prediction,average='weighted')
    precision=precision_score(test_target,prediction,average='weighted',zero_division=0)
    recall_list_tree.append(recall)
    precision_list_tree.append(precision)
    if best_recall<recall:
        best_depth=depth
        best_recall=recall
        best_acc=accuracy_score(test_target,prediction)
        best_f1=f1_score(test_target,prediction,average='weighted')
        best_precision=precision
print('best depth:',best_depth)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision)

100%|██████████| 99/99 [00:04<00:00, 23.63it/s]

best depth: 14
best recall: 0.8238636363636364
best accuracy: 0.8238636363636364
best f1: 0.8099456347364743
best precision: 0.8091923620637237





# Naive-Bayes sem sampling

In [12]:
model = GaussianNB()
model.fit(train_data.toarray(),train_target)
prediction=model.predict(test_data.toarray())
recall=recall_score(test_target,prediction,average='weighted')
acc=accuracy_score(test_target,prediction)
f1=f1_score(test_target,prediction,average='weighted')
print('best recall:',recall)
print('best accuracy:',acc)
print('best f1:',f1)
print('best precision:',precision)

best recall: 0.7708333333333334
best accuracy: 0.7708333333333334
best f1: 0.777362752418327
best precision: 0.8101560719981773


#### Sampling os dados

In [20]:
df_sampled,garbage=train_test_split(df_final.drop_duplicates().reset_index(), test_size=0.3, random_state=12345,shuffle=True,stratify=df_final['Classe de Violência'])
print(df_sampled.info())
features_sampled,target_sampled=DataProcess.upsample(df_sampled['text'], df_sampled['Classe de Violência'], 8,'Low')
features_sampled,target_sampled=DataProcess.upsample(features_sampled,target_sampled, 5,'Medium')
features_sampled,target_sampled=DataProcess.upsample(features_sampled,target_sampled, 5,'High')
features_sampled=DataProcess.text_preprocessing_nltk(features_sampled)

<class 'pandas.core.frame.DataFrame'>
Index: 1230 entries, 1392 to 1433
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   level_0              1230 non-null   int64  
 1   index                1230 non-null   int64  
 2   text                 1230 non-null   object 
 3   Total(SUM)           1229 non-null   float64
 4   Classe de Violência  1230 non-null   object 
dtypes: float64(1), int64(2), object(2)
memory usage: 57.7+ KB
None


### Floresta Aleatoria com Sampling

In [None]:
best_recall=0
recall_list_florest=[]
precision_list_florest=[]
for size in tqdm(range(1,50)):
    for depth in range(1,30):
        model=RandomForestClassifier(random_state=123456789,max_depth=depth,n_estimators=size)
        model.fit(features_sampled,target_sampled)
        prediction=model.predict(test_data)
        recall=recall_score(test_target,prediction,average='weighted')
        precision=precision_score(test_target,prediction,average='weighted',zero_division=0)
        recall_list_florest.append(recall)
        precision_list_florest.append(precision)
        if best_recall<recall:
            best_model=model
            best_depth=depth
            best_recall=recall
            best_size=size
            best_acc=accuracy_score(test_target,prediction)
            best_f1=f1_score(test_target,prediction,average='weighted')
            best_precision=precision
print('best depth:',best_depth)
print('best size:',best_size)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision)

100%|██████████| 49/49 [01:40<00:00,  2.05s/it]

best depth: 27
best size: 7
best recall: 0.8208955223880597
best accuracy: 0.8208955223880597
best f1: 0.799664331117894
best precision: 0.8364852991005266





### Arvore de decisão com Sampling

In [19]:
best_recall=0
recall_list_tree=[]
precision_list_tree=[]
for depth in tqdm(range(1,100)):
    model=DecisionTreeClassifier(random_state=123456789,max_depth=depth)
    model.fit(features_sampled,target_sampled)
    prediction=model.predict(test_data)
    recall=recall_score(test_target,prediction,average='weighted')
    precision=precision_score(test_target,prediction,average='weighted',zero_division=0)
    recall_list_tree.append(recall)
    precision_list_tree.append(precision)
    if best_recall<recall:
        best_model=model
        best_depth=depth
        best_recall=recall
        best_acc=accuracy_score(test_target,prediction)
        best_f1=f1_score(test_target,prediction,average='weighted')
        best_precision=precision
print('best depth:',best_depth)
print('best recall:',best_recall)
print('best accuracy:',best_acc)
print('best f1:',best_f1)
print('best precision:',best_precision)

100%|██████████| 99/99 [00:04<00:00, 22.51it/s]

best depth: 37
best recall: 0.8087121212121212
best accuracy: 0.8087121212121212
best f1: 0.7995784424877928
best precision: 0.803981633985345





# Naive-Bayes com sampling

In [None]:
model = GaussianNB()
model.fit(features_sampled.toarray(),target_sampled)
prediction=model.predict(test_data.toarray())
recall=recall_score(test_target,prediction,average='weighted')
acc=accuracy_score(test_target,prediction)
f1=f1_score(test_target,prediction,average='weighted')
print('best recall:',recall)
print('best accuracy:',acc)
print('best f1:',f1)
print('best precision:',precision)

best recall: 0.8
best accuracy: 0.8
best f1: 0.8060639076344198
best precision: 0.8440598487385921


### Curva Recall-Precision dos modelos

# Conclusões

Por enquanto o melhor modelo entre eles, sem contar o XGBoost, parece ser a arvore de decisão.

Parece valer a pena testar a vetorização com por NILC quando testarmos regressão.

Normalização da uma pequena melhora em alguns dos modelos, mas talvez seja por coincidencia (a seed ser boa para aquele novo formato de matriz especifica)