Carregando Dataset:

In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

Definindo funções para construção de treino e testes do dataset:

In [2]:
def get_data():
    data = fetch_20newsgroups(
        subset='all', 
        shuffle=True, 
        remove=('headers', 'footers', 'quotes'))
    return data

In [3]:
def prepare_datasets(corpus, labels, test_data_proportion=0.3):
    train_X, test_X, train_Y, test_Y = train_test_split(
        corpus, 
        labels,
        test_size=0.33,
        random_state=42)
    return train_X, test_X, train_Y, test_Y

In [4]:
def remove_empty_docs(corpus, labels):
    filtered_corpus = []
    filtered_labels = []
    for doc, label in zip(corpus, labels):
        if doc.strip():
            filtered_corpus.append(doc)
            filtered_labels.append(label)
    return filtered_corpus, filtered_labels

Analisando dados do meu dataset como seus documentos e classes:

In [5]:
dataset = get_data()
classes = dataset.target_names

corpus, labels = dataset.data, dataset.target 
corpus, labels = remove_empty_docs(corpus, labels)

In [6]:
print('Amostra do Documento:', corpus[10])
print('Indice da Classe:',labels[10])
print('classe do indice atual', dataset.target_names[labels[10]])

('Amostra do Documento:', u'the blood of the lamb.\n\nThis will be a hard task, because most cultures used most animals\nfor blood sacrifices. It has to be something related to our current\npost-modernism state. Hmm, what about used computers?\n\nCheers,\nKent')
('Indice da Classe:', 19)
('classe do indice atual', 'talk.religion.misc')


Preparando dados de treino e teste do meu dataset:

In [7]:
train_corpus, test_corpus, train_labels, test_labels = prepare_datasets(
                                                            corpus, 
                                                            labels, 
                                                            test_data_proportion=0.3)

Importando funcionalidades de feature extractor:

In [8]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [9]:
vect_bow = CountVectorizer()
vect_tfidf = TfidfVectorizer()

Extraindo os recursos de Bag Of Words e TF-IDF dos meus docs e classes de teste e treino

In [10]:
#Bag of Words:
X_train_bow = vect_bow.fit_transform(train_corpus)
X_test_bow= vect_bow.transform(test_corpus)
#TF-IDF
X_train_tfidf = vect_tfidf.fit_transform(train_corpus)
X_test_tfidf = vect_tfidf.transform(test_corpus)

Depois de extrair os recusros, crio uma função para avaliar nossa classificação de acordo com 
as metricas: accuary, precision, recall e f1score:

In [11]:
from sklearn import metrics
import numpy as np

In [12]:
def get_metrics(true_labels, predicted_labels):
    print ('Accuracy:', np.round(metrics.accuracy_score(true_labels,predicted_labels),2))
    print ('Precision:', np.round(metrics.precision_score(true_labels,predicted_labels,average='weighted'),2))
    print ('Recall:', np.round(metrics.recall_score(true_labels,predicted_labels,average='weighted'),2))
    print ('F1 Score:', np.round(metrics.f1_score(true_labels,predicted_labels,average='weighted'),2))
    print("----------------------------------------------------------------")

Função para treinar o modelo com o classificador desejado, realizar predição e resgatar as metricas:

In [13]:
def train_predict_evaluate_model(classifier,train_features, train_labels,test_features, test_labels):  
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features) 
    # avaliar o desempenho de previsão do modelo 
    get_metrics(true_labels=test_labels,predicted_labels=predictions)
    return predictions  

Importando classificadores:

In [14]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

mnb = MultinomialNB()
svm = SGDClassifier(loss='hinge', max_iter=1000, tol=0.001)

Naive Bayes e SVM com recursos do bow:

In [15]:
mnb_bow_predictions = train_predict_evaluate_model(classifier=mnb,
                                           train_features=X_train_bow,
                                           train_labels=train_labels,
                                           test_features=X_test_bow,
                                           test_labels=test_labels)

('Accuracy:', 0.62)
('Precision:', 0.71)
('Recall:', 0.62)
('F1 Score:', 0.6)
----------------------------------------------------------------


In [16]:
svm_bow_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=X_train_bow,
                                           train_labels=train_labels,
                                           test_features=X_test_bow,
                                           test_labels=test_labels)  

('Accuracy:', 0.66)
('Precision:', 0.68)
('Recall:', 0.66)
('F1 Score:', 0.66)
----------------------------------------------------------------


Naive Bayes e SVM com recursos do TFIDF:

In [17]:
mnb_tfidf_predictions = train_predict_evaluate_model(classifier=mnb,
                                           train_features=X_train_tfidf,
                                           train_labels=train_labels,
                                           test_features=X_test_tfidf,
                                           test_labels=test_labels)

('Accuracy:', 0.68)
('Precision:', 0.78)
('Recall:', 0.68)
('F1 Score:', 0.67)
----------------------------------------------------------------


In [18]:
svm_tfidf_predictions = train_predict_evaluate_model(classifier=svm,
                                           train_features=X_train_tfidf,
                                           train_labels=train_labels,
                                           test_features=X_test_tfidf,
                                           test_labels=test_labels)

('Accuracy:', 0.77)
('Precision:', 0.77)
('Recall:', 0.77)
('F1 Score:', 0.77)
----------------------------------------------------------------


Analisando documentos que foram classificados incorretamente

In [19]:
import pandas as pd

In [20]:
cm = metrics.confusion_matrix(test_labels, svm_tfidf_predictions)
print(pd.DataFrame(cm, index=range(0,20), columns=range(0,20)))

     0    1    2    3    4    5    6    7    8    9    10   11   12   13   14  \
0   144    2    0    1    0    2    2    2    6    1    7    5    5    6    6   
1     0  226    9    5    6   17    6    0    3    1    1    2    3    5    7   
2     1   15  222   19   13   16    6    2    0    1    1    1    7    4    3   
3     1   12   21  217   17    5   11    3    0    3    1    2    4    1    0   
4     0    6    5   17  228    3    4    3    2    2    0    3   12    2    2   
5     0   23   19    2    0  275    1    0    0    0    0    0    1    2    1   
6     0    1    5   12   12    2  273   11    3    4    0    1    9    2    2   
7     2    4    2    2    4    2   12  242   19    2    2    0    9    3    2   
8     2    1    2    2    2    0    5   26  262    5    1    2    2    1    2   
9     1    0    2    0    1    2    5    3    2  286   11    0    0    2    2   
10    0    0    0    0    1    1    0    2    1    7  280    1    1    1    0   
11    3    4    4    3    1 

In [21]:
class_names = dataset.target_names

print (class_names[0], '->', class_names[15])
print (class_names[18], '->', class_names[16]) 
print (class_names[19], '->', class_names[15])

('alt.atheism', '->', 'soc.religion.christian')
('talk.politics.misc', '->', 'talk.politics.guns')
('talk.religion.misc', '->', 'soc.religion.christian')
