# EnCompIF 2020

## __SafetyRank__: comparação de técnicas de aprendizado de máquina para classificação de alertas de segurança industriais

__Bases de dados utilizada:__  
- Safety Alerts: elaborada pelo autor composta por 80 documentos em 8 classes (pasta anexa).

__Classificadores experimentados:__  
- Regressão Logística  
- Naive Bayes  
- SVM  
- KNN  
- Árvore de Decisão  
- Random Forest   

In [23]:
# Carregamento do dataset
import numpy as np
import pandas as pd
from sklearn.datasets import load_files
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Diretorio da base
safety_alerts_dir = './safety-alerts-database/'

safety_alerts_data = load_files(safety_alerts_dir)

safety_alerts_X, safety_alerts_y = safety_alerts_data.data, safety_alerts_data.target

safety_alerts_X_train, safety_alerts_X_test, safety_alerts_y_train, safety_alerts_y_test = train_test_split(safety_alerts_X, safety_alerts_y, test_size=0.4)

safety_alerts_y_test

array([3, 3, 0, 1, 1, 0, 3, 6, 6, 5, 1, 2, 1, 3, 4, 4, 0, 5, 2, 5, 1, 6,
       1, 3, 2, 2, 5, 4])

In [25]:
# Carregamento das bibliotecas comuns do sklearn

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import metrics

In [26]:
# Regressão Logística

from sklearn.linear_model import LogisticRegression

text_clf_logistic_regression = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression(penalty='l2', 
                                                dual=False, 
                                                tol=0.0001, 
                                                C=1.0, 
                                                fit_intercept=True, 
                                                intercept_scaling=1, 
                                                class_weight=None, 
                                                random_state=None, 
                                                solver='lbfgs', 
                                                max_iter=1000, 
                                                multi_class='multinomial', 
                                                verbose=0, 
                                                warm_start=False, 
                                                n_jobs=None, 
                                                l1_ratio=None)),
                     ])

#text_clf_logistic_regression.fit(X_train, y_train)
#predicted = text_clf_logistic_regression.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [27]:
# Naive Bayes

from sklearn.naive_bayes import MultinomialNB

text_clf_naive_bayes = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=1.0, 
                                           fit_prior=True, 
                                           class_prior=None)),
                     ])

#text_clf_naive_bayes.fit(X_train, y_train)
#predicted = text_clf_naive_bayes.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [28]:
# SVM

from sklearn.svm import LinearSVC

text_clf_svm = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LinearSVC(penalty='l2', 
                                       loss='squared_hinge', 
                                       dual=True, 
                                       tol=0.0001, 
                                       C=1.0, 
                                       multi_class='ovr', # treina n_classes classificadores tipo one-vs-rest
                                       fit_intercept=True, 
                                       intercept_scaling=1, 
                                       class_weight=None, 
                                       verbose=0, 
                                       random_state=None, 
                                       max_iter=1000)),
                     ])

#text_clf_svm.fit(X_train, y_train)
#predicted = text_clf_svm.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [29]:
# KNN

from sklearn.neighbors import KNeighborsClassifier

text_clf_knn = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', KNeighborsClassifier(n_neighbors=5, 
                                                  weights='uniform', 
                                                  algorithm='auto', 
                                                  leaf_size=30, 
                                                  p=2, 
                                                  metric='minkowski', 
                                                  metric_params=None, 
                                                  n_jobs=None)),
                     ])

#text_clf_knn.fit(X_train, y_train)
#predicted = text_clf_knn.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [30]:
# Árvore de Decisão

from sklearn.tree import DecisionTreeClassifier

text_clf_decision_tree = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                         ('tfidf', TfidfTransformer()),
                         ('clf', DecisionTreeClassifier(criterion='gini', 
                                                        splitter='best', 
                                                        max_depth=None, 
                                                        min_samples_split=2, 
                                                        min_samples_leaf=1, 
                                                        min_weight_fraction_leaf=0.0, 
                                                        max_features=None, 
                                                        random_state=None, 
                                                        max_leaf_nodes=None, 
                                                        min_impurity_decrease=0.0, 
                                                        min_impurity_split=None, 
                                                        class_weight=None, 
                                                        presort=False)),
                         ])

#text_clf_decision_tree.fit(X_train, y_train)
#predicted = text_clf_decision_tree.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [31]:
# Random Forest

from sklearn.ensemble import RandomForestClassifier

text_clf_rf = Pipeline([('vect', CountVectorizer(analyzer='word', max_df=0.95, min_df=2,stop_words= 'english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', RandomForestClassifier(n_estimators=100, 
                                                    criterion='gini', 
                                                    max_depth=None, 
                                                    min_samples_split=2, 
                                                    min_samples_leaf=1, 
                                                    min_weight_fraction_leaf=0.0, 
                                                    max_features='auto', 
                                                    max_leaf_nodes=None, 
                                                    min_impurity_decrease=0.0, 
                                                    min_impurity_split=None, 
                                                    bootstrap=True, 
                                                    oob_score=False, 
                                                    n_jobs=None, 
                                                    random_state=None, 
                                                    verbose=0, 
                                                    warm_start=False, 
                                                    class_weight=None)),
                     ])

#text_clf_rf.fit(X_train, y_train)
#predicted = text_clf_rf.predict(X_test)
#print(metrics.classification_report(y_test, predicted))

In [32]:
# Classe avaliadora de performance dos classificadores (com base no exemplo do Prof. Boldt)

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


class PerformanceEvaluator():
    """ Classe avaliadora de performance dos classificadores (com base no exemplo do Prof. Boldt) 
        Utiliza validação cruzada para o treinamento.
    """
    def __init__(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train
        self.kf = KFold(n_splits=4)
    
    def score(self, clf):
        scores = []
        for train, validate in self.kf.split(self.X_train):
            clf.fit(self.X_train[train],self.y_train[train])
            scores.append(clf.score(self.X_train[validate],self.y_train[validate]))
        return np.mean(scores), np.std(scores)
    
    def treinar(self, clfs):
        print("TREINAMENTO (com validação cruzada) - - - - - -")
        print(f'{"":>20}  Média \t Desvio Padrão')
        for name,clf in clfs:
            score_mean, score_std = self.score(clf)
            print(f'{name:>20}: {score_mean:.4f} \t {score_std:.4f}')

    def testar(self, clfs, X_test, y_test):
        # Testa os classificadores em dados de teste (não vistos no treinamento)
        print("TESTE- - - - - - - - - - - - - -- - - - - - - -")
        for name,clf in clfs:
            y_pred = clf.predict(X_test)
            score = clf.score(X_test, y_test)
            print(f'{name:>20}: {score:.4f}')
            c = confusion_matrix(y_test, y_pred)
            print(c)
            print(classification_report(y_test, y_pred))

In [33]:
# Avaliação de todos os classificadores

clfs = [
    ('Logistic Regression', text_clf_logistic_regression),
    ('Naive Bayes', text_clf_naive_bayes),
    ('SVM', text_clf_svm),
    ('KNN', text_clf_knn),
    ('Decision Tree', text_clf_decision_tree),
    ('Random Forest', text_clf_rf)
]

#newsgroups_pe = PerformanceEvaluator(np.array(newsgroups_X_train), np.array(newsgroups_y_train))
safety_alerts_pe = PerformanceEvaluator(np.array(safety_alerts_X_train), np.array(safety_alerts_y_train))

### RESULTADOS

#### Treinamento e Teste

In [34]:
%%time
# Treina os classificadores usando validação cruzada na base de Alertas de Seguranca
safety_alerts_pe.treinar(clfs)

# Testa os classificadores em dados de teste (não vistos no treinamento)
safety_alerts_pe.testar(clfs, np.array(safety_alerts_X_test), np.array(safety_alerts_y_test))

TREINAMENTO (com validação cruzada) - - - - - -
                      Média 	 Desvio Padrão
 Logistic Regression: 0.4773 	 0.0982
         Naive Bayes: 0.3318 	 0.0775
                 SVM: 0.6682 	 0.1450
                 KNN: 0.6432 	 0.0360




       Decision Tree: 0.5227 	 0.1570
       Random Forest: 0.4295 	 0.1448
TESTE- - - - - - - - - - - - - -- - - - - - - -
 Logistic Regression: 0.4286
[[3 0 0 0 0 0 0]
 [0 0 0 0 0 0 6]
 [1 0 3 0 0 0 0]
 [4 0 0 0 1 0 0]
 [1 0 0 0 1 0 1]
 [0 0 0 0 0 3 1]
 [0 0 1 0 0 0 2]]
              precision    recall  f1-score   support

           0       0.33      1.00      0.50         3
           1       0.00      0.00      0.00         6
           2       0.75      0.75      0.75         4
           3       0.00      0.00      0.00         5
           4       0.50      0.33      0.40         3
           5       1.00      0.75      0.86         4
           6       0.20      0.67      0.31         3

    accuracy                           0.43        28
   macro avg       0.40      0.50      0.40        28
weighted avg       0.36      0.43      0.36        28

         Naive Bayes: 0.3214
[[3 0 0 0 0 0 0]
 [0 0 0 0 0 0 6]
 [1 0 2 0 0 0 1]
 [5 0 0 0 0 0 0]
 [2 0 0 0 0 0 1]
 [0 0 0 0 0 2 2]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
