![](https://www.kdnuggets.com/wp-content/uploads/text-analysis-acme2.jpg)

# Classificação de texto

 - Tarefa muito importante da área de NLP;
 - Atribui uma categoria a um texto com base no assunto de que ele trata;
 - Identificação de Spam, classificação de comentários em fóruns de discussões, análise de sentimento em tweets, etc;
 
# Problema
 - Dada a review de um filme, dizer se ela é positiva ou negativa;
 
 
    
  


## Carregando o dataset

In [None]:
import pandas as pd

# Load only the first 2500 rows of the CSV to keep the preprocessing and
# training steps below fast.
dataset = pd.read_csv("../datasets/imdb-dataset.csv", nrows=2500)

# Notebook cell output: preview the first rows of the loaded frame.
dataset.head()

In [None]:
# Print column 0 of the first row (presumably the raw review text; confirm
# against the CSV schema).
print(dataset.iloc[0, 0])

In [None]:
# Print column 0 of the fourth row as a second sample.
print(dataset.iloc[3, 0])

In [None]:
# Count how often each value occurs in column 1, the column that is later
# mapped from "negative"/"positive" to 0/1.
dataset.iloc[:, 1].value_counts()

## Pré-processamento

In [None]:
# Remoção de acentos

import unidecode

def utf8_to_ascii(text):
    """Return *text* transliterated by ``unidecode.unidecode``.

    Used by the preprocessing step to strip accents before tokenization.
    """
    ascii_text = unidecode.unidecode(text)
    return ascii_text

In [None]:
# Remoção de tags HTML (<div>, <p>, <h1>, <br>)

import re

def delete_html_nodes(text):
    """Return *text* with HTML tags such as ``<br />`` or ``<p>`` removed.

    Only the tags themselves are deleted; the text between two tags is kept.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The string with every ``<...>`` span removed.
    """
    # `[^>]+` stops at the first closing bracket. The previous greedy `.+`
    # ran from the first `<` to the LAST `>` on the line, which also deleted
    # all the review text sitting between two tags.
    return re.sub(r"<[^>]+>", "", text)

In [None]:
# Tokenização "I thought this was" -> ["I", "thought", "this", "was"]

import spacy

def tokenize(corpus, deacc=True, trim_html=True, header="review"):
    """Tokenize every document of a DataFrame column with spaCy.

    Args:
        corpus: DataFrame holding the documents in the column named *header*.
        deacc: When true, pass each document through ``utf8_to_ascii`` first.
        trim_html: When true, pass each document through
            ``delete_html_nodes`` before tokenizing.
        header: Name of the column that contains the text.

    Returns:
        A list with one entry per row; each entry is the list of spaCy
        tokens produced for that row's document.
    """
    # NOTE(review): requires the "en_core_web_md" model to be installed.
    nlp = spacy.load("en_core_web_md")

    def _clean(document):
        # Apply the optional text clean-ups in the same order as before:
        # accents first, then HTML tags.
        if deacc:
            document = utf8_to_ascii(document)
        if trim_html:
            document = delete_html_nodes(document)
        return document

    # Iterate the text column directly (no per-row Series from iterrows) and
    # let spaCy process the documents in batches instead of one call each.
    cleaned = (_clean(document) for document in corpus[header])
    return [list(spacy_doc) for spacy_doc in nlp.pipe(cleaned)]

In [None]:
# Remoção de stop words (a, an, as, and, at, both, by, for, to)

def remove_stop_words(corpus):
    """Drop stop-word tokens from every document.

    Args:
        corpus: Iterable of documents, each an iterable of tokens exposing
            an ``is_stop`` attribute.

    Returns:
        A list with one list per document, containing only the tokens whose
        ``is_stop`` is falsy, in their original order.
    """
    return [
        [word for word in doc if not word.is_stop]
        for doc in corpus
    ]

In [None]:
# Lematização

def lemmatize(corpus, remove_punct=True, remove_digits=True):
    """Turn each tokenized document into a string of space-joined lemmas.

    Args:
        corpus: Iterable of documents, each an iterable of tokens exposing
            ``lemma_``, ``is_punct`` and ``is_digit``.
        remove_punct: When true, skip tokens whose ``is_punct`` is truthy.
        remove_digits: When true, skip tokens whose ``is_digit`` is truthy.

    Returns:
        A list with one string per document: the lemmas of the kept tokens
        joined by single spaces.
    """

    def _is_kept(word):
        # Punctuation is checked before digits, as in the original loop.
        if remove_punct and word.is_punct:
            return False
        return not (remove_digits and word.is_digit)

    result = []
    for doc in corpus:
        lemmas = [word.lemma_ for word in doc if _is_kept(word)]
        result.append(" ".join(lemmas))
    return result

In [None]:
# Tokenize every review, stripping accents and HTML tags first.
tokens = tokenize(
        dataset,
        deacc=True,
        trim_html=True)

In [None]:
# Filter the stop-word tokens out of each tokenized review.
no_stop_words = remove_stop_words(tokens)

In [None]:
# Build one string of lemmas per review, dropping punctuation and digit tokens.
preprocessed_corpus = lemmatize(
        no_stop_words,
        remove_punct=True,
        remove_digits=True)

In [None]:
# Notebook cell output: column 0 of the third row, for comparison with its
# preprocessed version in the next cell.
dataset.iloc[2, 0]

In [None]:
# Notebook cell output: the same (third) review after preprocessing.
preprocessed_corpus[2]

In [None]:
# Encode the values of column 1 as integers: "negative" -> 0, "positive" -> 1.
labels = dataset.iloc[:, 1].map({"negative": 0, "positive": 1})

In [None]:
# Notebook cell output: shape of the encoded label series.
labels.shape

In [None]:
# Notebook cell output: the encoded labels.
labels

## Extração de características

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer (default settings) on the preprocessed reviews.
# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split further down, so the fitted vocabulary and IDF weights also
# reflect the test reviews; consider fitting on the training portion only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(preprocessed_corpus)

In [None]:
# Convert every preprocessed review into its TF-IDF feature vector.
features_dataset = tfidf_vectorizer.transform(preprocessed_corpus)

In [None]:
# Notebook cell output: shape of the TF-IDF feature matrix.
features_dataset.shape

In [None]:
from sklearn.model_selection import train_test_split

# Split features and labels into train and test parts without shuffling, so
# the rows keep their original order.
# NOTE(review): random_state only controls shuffling, so it has no effect
# while shuffle=False; confirm whether an unshuffled split is intended.
X_train, X_test, y_train, y_test = train_test_split(features_dataset, labels, shuffle=False, random_state=42)

In [None]:
# Print the shapes of the training features and training labels.
print(X_train.shape, y_train.shape)

In [None]:
# Print the shapes of the test features and test labels.
print(X_test.shape, y_test.shape)

## Criação do modelo

In [None]:
from sklearn.svm import SVC

# Create the SVM classifier with probability estimates enabled.
# NOTE(review): only predict() is called in the cells shown here; confirm
# probability=True is needed, since it makes fitting slower.
svm_model = SVC(probability=True)

In [None]:
# Train the classifier on the training split.
svm_model.fit(X_train, y_train)

In [None]:
# Predict a label for every review in the test split.
predictions = svm_model.predict(X_test)

In [None]:
# Notebook cell output: the predicted labels.
predictions

## Avaliação de desempenho do modelo

In [None]:
from sklearn.metrics import accuracy_score, recall_score

try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    # plot_confusion_matrix was removed in scikit-learn 1.2. Provide a thin
    # stand-in with the same call signature so the plotting cell further
    # down keeps working on current releases.
    from sklearn.metrics import ConfusionMatrixDisplay

    def plot_confusion_matrix(estimator, X, y_true, **kwargs):
        """Plot the confusion matrix of *estimator* on ``(X, y_true)``."""
        return ConfusionMatrixDisplay.from_estimator(
            estimator, X, y_true, **kwargs
        )

# Accuracy of the predictions against the true test labels.
accuracy = accuracy_score(y_test, predictions)

In [None]:
# Accuracy = (TP + TN) / (TP + FP + TN + FN)

print("Accurácia", accuracy*100, "%")

In [None]:
# Recall = TP / (TP + FN)

recall = recall_score(y_test, predictions)

In [None]:
# Print the recall as a percentage with two decimal places.
print("Recall {:.2f}%".format(recall*100))

In [None]:
# Plot the confusion matrix of the trained model on the test split.
plot_confusion_matrix(svm_model, X_test, y_test)

In [None]:
# Notebook cell output: the predicted labels once more.
predictions

## Exercícios

1. Executar os passos do notebook novamente, porém para o dataset com todas as reviews (50.000), e verificar de que forma isso influencia o desempenho do modelo (bastante custoso; fazer no [google colab](https://colab.research.google.com/)).


2. Extrair as características utilizando o BoW e verificar se o desempenho do novo modelo é melhor que o do modelo atual.

## Referências

[[1]](https://developers.google.com/machine-learning/guides/text-classification) Text Classification Guide – Google Developers

[[2]](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn-svm-svc) Documentação do scikit-learn sobre o SVM