### QUALIFICAÇÃO

__Bases de dados utilizada:__  
- 20-Newsgroups: composta por 18.846 documentos em 20 classes  [(link)](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_20newsgroups.html).  
- Safety Alerts: elaborada pelo autor composta por 100 documentos em 10 classes.

__Classificadores testados:__  
- Regressão Logística  
- Naive Bayes  
- SVM  
- KNN  
- Árvore de Decisão  
- Random Forest   
- Rede Neural  

In [1]:
# Carregamento do dataset Movie Reviews da biblioteca NLTK

from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split

safety_alerts_dir = r'C:\Users\wander\safetyrank\app\static\safety-alerts-database'
safety_alerts_data = load_files(safety_alerts_dir, shuffle=True)
safety_alerts_X, safety_alerts_y = safety_alerts_data.data, safety_alerts_data.target

safety_alerts_X_train, safety_alerts_X_test, safety_alerts_y_train, safety_alerts_y_test = train_test_split(safety_alerts_X, safety_alerts_y, test_size=0.4, random_state=0)

In [2]:
# Carregamento do dataset 20 Newsgroups

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups

newsgroups_train = fetch_20newsgroups(subset='train')
newsgroups_test = fetch_20newsgroups(subset='test')

newsgroups_X_train = newsgroups_train.data
newsgroups_y_train = newsgroups_train.target
newsgroups_X_test = newsgroups_test.data
newsgroups_y_test = newsgroups_test.target

In [3]:
# Carregamento das bibliotecas comuns do sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [4]:
# Regressão Logística

from sklearn.linear_model import LogisticRegression

text_clf_logistic_regression = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(penalty='l2', 
                                                dual=False, 
                                                tol=0.0001, 
                                                C=1.0, 
                                                fit_intercept=True, 
                                                intercept_scaling=1, 
                                                class_weight=None, 
                                                random_state=None, 
                                                solver='lbfgs', 
                                                max_iter=1000, 
                                                multi_class='multinomial', 
                                                verbose=0, 
                                                warm_start=False, 
                                                n_jobs=None, 
                                                l1_ratio=None)),
                     ])

#text_clf_logistic_regression.fit(X_train, y_train)
#predicted = text_clf_logistic_regression.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [5]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

text_clf_naive_bayes = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=1.0, 
                                           fit_prior=True, 
                                           class_prior=None)),
                     ])

#text_clf_naive_bayes.fit(X_train, y_train)
#predicted = text_clf_naive_bayes.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [6]:
# SVM

from sklearn.svm import LinearSVC

text_clf_svm = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC(penalty='l2', 
                                       loss='squared_hinge', 
                                       dual=True, 
                                       tol=0.0001, 
                                       C=1.0, 
                                       multi_class='ovr', # treina n_classes classificadores tipo one-vs-rest
                                       fit_intercept=True, 
                                       intercept_scaling=1, 
                                       class_weight=None, 
                                       verbose=0, 
                                       random_state=None, 
                                       max_iter=1000)),
                     ])

#text_clf_svm.fit(X_train, y_train)
#predicted = text_clf_svm.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [7]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

text_clf_knn = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=5, 
                                                  weights='uniform', 
                                                  algorithm='auto', 
                                                  leaf_size=30, 
                                                  p=2, 
                                                  metric='minkowski', 
                                                  metric_params=None, 
                                                  n_jobs=None)),
                     ])

#text_clf_knn.fit(X_train, y_train)
#predicted = text_clf_knn.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [8]:
# Árvore de Decisão

from sklearn.tree import DecisionTreeClassifier

text_clf_decision_tree = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', DecisionTreeClassifier(criterion='gini', 
                                                        splitter='best', 
                                                        max_depth=None, 
                                                        min_samples_split=2, 
                                                        min_samples_leaf=1, 
                                                        min_weight_fraction_leaf=0.0, 
                                                        max_features=None, 
                                                        random_state=None, 
                                                        max_leaf_nodes=None, 
                                                        min_impurity_decrease=0.0, 
                                                        min_impurity_split=None, 
                                                        class_weight=None, 
                                                        presort=False)),
                         ])

#text_clf_decision_tree.fit(X_train, y_train)
#predicted = text_clf_decision_tree.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [9]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

text_clf_rf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100, 
                                                    criterion='gini', 
                                                    max_depth=None, 
                                                    min_samples_split=2, 
                                                    min_samples_leaf=1, 
                                                    min_weight_fraction_leaf=0.0, 
                                                    max_features='auto', 
                                                    max_leaf_nodes=None, 
                                                    min_impurity_decrease=0.0, 
                                                    min_impurity_split=None, 
                                                    bootstrap=True, 
                                                    oob_score=False, 
                                                    n_jobs=None, 
                                                    random_state=None, 
                                                    verbose=0, 
                                                    warm_start=False, 
                                                    class_weight=None)),
                     ])

#text_clf_rf.fit(X_train, y_train)
#predicted = text_clf_rf.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [10]:
# Rede Neural (encapsulado dentro da classe base do scikit-learn) conforme dica do Prof. Boltd
# %tensorflow_version 2.x
import tensorflow as tf
from sklearn.base import BaseEstimator, ClassifierMixin

class RedeNeural(BaseEstimator, ClassifierMixin):
    """ Classe customizada com base no sklearn """
    def __init__(self):
        pass
  
    def fit(self, X_train, y_train):
        """ Estrutura da rede neural adaptada de Kowsari (2019) """
        nFeatures = X_train.shape[1]
        nClasses = np.unique(y_train).shape[0]
        nNos = 256
        nCamadas = 4
        dropout=0.5
        
        # O TensorFlow 2.0 ainda não suporta a matriz esparsa do scikit-learn:
        # Failed to find data adapter that can handle input: <class 'scipy.sparse.csr.csr_matrix'>
        # Então converti X_train para array
        X_train = X_train.toarray()

        self.model = tf.keras.models.Sequential()    

        self.model.add(tf.keras.layers.Dense(nNos,input_dim=nFeatures,activation='relu'))
        self.model.add(tf.keras.layers.Dropout(dropout))

        for _ in range(0, nCamadas):
            self.model.add(tf.keras.layers.Dense(nNos,input_dim=nNos,activation='relu'))
            self.model.add(tf.keras.layers.Dropout(dropout))

        self.model.add(tf.keras.layers.Dense(nClasses, activation='softmax'))

        self.model.compile(optimizer='adam',
                      loss='sparse_categorical_crossentropy',
                      metrics=['accuracy'])       

        self.model.fit(X_train, y_train, epochs=5, verbose=0)

    def predict(self, X_test):
        # O TensorFlow 2.0 ainda não suporta a matriz esparsa do scikit-learn:
        # Failed to find data adapter that can handle input: <class 'scipy.sparse.csr.csr_matrix'>
        # Então converti X_test para array
        X_test = X_test.toarray()
        
        predictions = self.model.predict(X_test)
        return np.argmax(predictions,axis=1)


text_clf_ann = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RedeNeural()),
                     ])

#text_clf_ann.fit(X_train, y_train)
#predicted = text_clf_ann.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [11]:
# Classe avaliadora de performance dos classificadores (com base no exemplo do Prof. Boldt)

from sklearn.model_selection import KFold

class PerformanceEvaluator():
    """ Classe avaliadora de performance dos classificadores (com base no exemplo do Prof. Boldt) 
        Utiliza validação cruzada para o treinamento.
    """
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.kf = KFold(n_splits=5)
    
    def score(self, clf):
        scores = []
        for train, validate in self.kf.split(self.X_train):
            clf.fit(self.X_train[train],self.y_train[train])
            scores.append(clf.score(self.X_train[validate],self.y_train[validate]))
        return np.mean(scores), np.std(scores)
    
    def treinar(self, clfs):
        print("TREINAMENTO (com validação cruzada) - - - - - -")
        print(f'{"":>20}  Média \t Desvio Padrão')
        for name,clf in clfs:
            score_mean, score_std = self.score(clf)
            print(f'{name:>20}: {score_mean:.4f} \t {score_std:.4f}')

    def testar(self, clfs, X_test, y_test):
        # Testa os classificadores em dados de teste (não vistos no treinamento)
        print("TESTE- - - - - - - - - - - - - -- - - - - - - -")
        for name,clf in clfs:
            score = clf.score(X_test, y_test)
            print(f'{name:>20}: {score:.4f}')

In [13]:
# Avaliação de todos os classificadores

clfs = [
    ('Logistic Regression', text_clf_logistic_regression),
    ('Naive Bayes', text_clf_naive_bayes),
    ('SVM', text_clf_svm),
    ('KNN', text_clf_knn),
    ('Decision Tree', text_clf_decision_tree),
    ('Random Forest', text_clf_rf),
    ('ANN', text_clf_ann),
]

newsgroups_pe = PerformanceEvaluator(np.array(newsgroups_X_train), np.array(newsgroups_y_train))
safety_alerts_pe = PerformanceEvaluator(np.array(safety_alerts_X_train), np.array(safety_alerts_y_train))

### RESULTADOS

#### Dataset SAFETY-ALERTS

In [14]:
%%time
# Treina os classificadores usando validação cruzada na base de Alertas de Seguranca
safety_alerts_pe.treinar(clfs)

# Testa os classificadores em dados de teste (não vistos no treinamento)
safety_alerts_pe.testar(clfs, np.array(safety_alerts_X_test), np.array(safety_alerts_y_test))

TREINAMENTO (com validação cruzada) - - - - - -
                      Média 	 Desvio Padrão
 Logistic Regression: 0.2591 	 0.1472
         Naive Bayes: 0.1909 	 0.1560
                 SVM: 0.4712 	 0.2084
                 KNN: 0.4333 	 0.2226
       Decision Tree: 0.4470 	 0.1220
       Random Forest: 0.3121 	 0.2087
                 ANN: 0.0515 	 0.0422
TESTE- - - - - - - - - - - - - -- - - - - - - -
 Logistic Regression: 0.2500
         Naive Bayes: 0.1500
                 SVM: 0.6500
                 KNN: 0.4750
       Decision Tree: 0.4500
       Random Forest: 0.3000
                 ANN: 0.0750
Wall time: 6.49 s


#### Dataset 20-NEWSGROUPS

In [15]:
%%time
# Treina os classificadores usando validação cruzada na base 20NEWSGROUPS
newsgroups_pe.treinar(clfs)

# Testa os classificadores em dados de teste (não vistos no treinamento)
newsgroups_pe.testar(clfs, np.array(newsgroups_X_test), np.array(newsgroups_y_test))

TREINAMENTO (com validação cruzada) - - - - - -
                      Média 	 Desvio Padrão
 Logistic Regression: 0.8940 	 0.0057
         Naive Bayes: 0.8393 	 0.0122
                 SVM: 0.9264 	 0.0028


KeyboardInterrupt: 