# Classificação de Texto e Análise de Sentimento

In [15]:
import pandas as pd
import time

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn import svm

import nltk
import re
import string
import unicodedata

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

## Exemplo básico

In [3]:
# Toy corpus: four short course reviews, each paired with its sentiment label.
reviews = [
    ('Sobre postech? Eu gostei muito do postech da FIAP', 'positivo'),
    ('O postech da FIAP pode melhorar, não gostei muito', 'negativo'),
    ('Foi muito importante para meu desenvolvimento', 'positivo'),
    ('Poderia ser mais técnico. Não gostei', 'negativo'),
]
df_fiap = pd.DataFrame(reviews, columns=['text', 'class'])

df_fiap.head()

Unnamed: 0,text,class
0,Sobre postech? Eu gostei muito do postech da FIAP,positivo
1,"O postech da FIAP pode melhorar, não gostei muito",negativo
2,Foi muito importante para meu desenvolvimento,positivo
3,Poderia ser mais técnico. Não gostei,negativo


Usando Bag of Words para tokenizar e vetorizar um texto

In [5]:
# Bag of Words: learn the unigram vocabulary and count term occurrences
# for every document in a single pass.
vect = CountVectorizer(ngram_range=(1, 1))
text_vect = vect.fit_transform(df_fiap['text'])

# One row per document, one column per vocabulary term.
pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,da,desenvolvimento,do,eu,fiap,foi,gostei,importante,mais,melhorar,meu,muito,não,para,pode,poderia,postech,ser,sobre,técnico
0,1,0,1,1,1,0,1,0,0,0,0,1,0,0,0,0,2,0,1,0
1,1,0,0,0,1,0,1,0,0,1,0,1,1,0,1,0,1,0,0,0
2,0,1,0,0,0,1,0,1,0,0,1,1,0,1,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,1,0,1,0,1


Agora vamos usar TF-IDF para vetorizar o texto

In [7]:
# TF-IDF: same unigram vocabulary as before, but each count is reweighted
# by the inverse document frequency of its term.
vect = TfidfVectorizer(ngram_range=(1, 1), use_idf=True)
text_vect = vect.fit_transform(df_fiap['text'])

# One row per document, one column per vocabulary term.
pd.DataFrame(text_vect.toarray(), columns=vect.get_feature_names_out())

Unnamed: 0,da,desenvolvimento,do,eu,fiap,foi,gostei,importante,mais,melhorar,meu,muito,não,para,pode,poderia,postech,ser,sobre,técnico
0,0.287039,0.0,0.364073,0.364073,0.287039,0.0,0.232383,0.0,0.0,0.0,0.0,0.232383,0.0,0.0,0.0,0.0,0.574078,0.0,0.364073,0.0
1,0.342426,0.0,0.0,0.0,0.342426,0.0,0.277223,0.0,0.0,0.434323,0.0,0.277223,0.342426,0.0,0.434323,0.0,0.342426,0.0,0.0,0.0
2,0.0,0.430037,0.0,0.0,0.0,0.430037,0.0,0.430037,0.0,0.0,0.430037,0.274487,0.0,0.430037,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.284626,0.0,0.445922,0.0,0.0,0.0,0.35157,0.0,0.0,0.445922,0.0,0.445922,0.0,0.445922


## Classificação

In [2]:
def normalize_accents(text):
    """Strip diacritics from ``text`` and drop every non-ASCII character.

    The string is NFKD-decomposed (so an accented letter becomes a base
    letter followed by a combining mark), encoded to ASCII while ignoring
    anything that cannot be represented, and decoded back to ``str``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ASCII', 'ignore')
    return ascii_bytes.decode('utf-8')

def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character replaced by a space."""
    # Map each punctuation character to a single blank so neighbouring
    # words stay separated instead of being glued together.
    blanks = dict.fromkeys(string.punctuation, " ")
    return text.translate(str.maketrans(blanks))

def normalize_str(text):
    """Lowercase ``text``, blank out punctuation, strip accents and tidy whitespace.

    Returns the words of the cleaned string joined by single spaces, with
    no leading or trailing whitespace.
    """
    cleaned = normalize_accents(remove_punctuation(text.lower()))
    # split() with no argument breaks on any run of whitespace and discards
    # empty pieces, so re-joining already collapses repeated spaces.
    return " ".join(cleaned.split())

# Stop-word sets keyed by language name. Filled on first use so the NLTK
# corpus is read once per language instead of once per processed document.
_STOP_WORDS_CACHE = {}


def _stop_words_for(lenguage):
    """Return the cached set of stop words for ``lenguage``, loading it on first use."""
    if lenguage not in _STOP_WORDS_CACHE:
        words = set(nltk.corpus.stopwords.words(lenguage))
        # Tokens are ASCII-folded by normalize_str before the lookup, so an
        # accented stop word would never match; also keep the folded spelling.
        words |= {normalize_accents(word) for word in words}
        _STOP_WORDS_CACHE[lenguage] = words
    return _STOP_WORDS_CACHE[lenguage]


def preprocessing_text(text, lenguage="english"):
    """Clean a raw string into a space-separated sequence of content tokens.

    Steps, in order: normalize the string with ``normalize_str``, delete
    digit characters, tokenize with ``word_tokenize``, drop stop words of
    the given language, and drop tokens of two characters or fewer.

    Parameters
    ----------
    text : object
        The value to clean. Anything that is not a ``str`` is not processed.
    lenguage : str, default "english"
        Name of the NLTK stop-word list to use.

    Returns
    -------
    str or None
        The remaining tokens joined by single spaces, or ``None`` when
        ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None

    stop_words = _stop_words_for(lenguage)
    text = normalize_str(text)
    # Digits are removed character by character, so "covid19" becomes "covid".
    text = "".join(ch for ch in text if not ch.isdigit())
    tokens = word_tokenize(text)
    kept = [tok for tok in tokens if len(tok) > 2 and tok not in stop_words]
    return " ".join(kept)

In [4]:
# Load only the headline and its category label.
# Categories: b = business, t = science and technology, e = entertainment, m = health
df = pd.read_csv("../data/nlp/uci-news-aggregator.csv", usecols=['TITLE', 'CATEGORY'])
df = df[['TITLE', 'CATEGORY']]  # fix the column order independently of the file layout

# Seed the shuffle so a fresh "Restart & Run All" yields the same row order,
# and therefore the same train/test split further down.
df = shuffle(df, random_state=42).reset_index(drop=True)
df.shape

(422419, 2)

In [5]:
# Preview the first ten shuffled headlines together with their category codes.
df.head(10)

Unnamed: 0,TITLE,CATEGORY
0,What you need to know before markets open,b
1,Honest Co. Attracts $70 Million Investment,b
2,"Skin cancer rates, Canadians' sun behaviour co...",m
3,"Kanye West really, really doesn't want to go o...",e
4,Oracle Corporation (ORCL) news: Oracle largely...,b
5,"Hummus Recalled From Target, Trader Joe's Over...",m
6,Ebola cases could exceed 20000: WHO,m
7,Microsoft E3 roundup: Xbox One doesn't mark th...,t
8,Clooney's Pet Pig Was His Baby: Ex-girlfriend,e
9,"After five months, jurors for Madoff aides fin...",b


In [7]:
# Clean every headline with the default (English) settings and keep the
# result next to the raw title.
df['Title_Treated'] = df['TITLE'].map(preprocessing_text)

In [10]:
# Show the first headline before and after preprocessing.
first_raw = df.loc[0, 'TITLE']
first_treated = df.loc[0, 'Title_Treated']
print('Antes: ', first_raw, '\n')
print('Depois: ', first_treated)

Antes:  What you need to know before markets open 

Depois:  need know markets open


In [8]:
# Features are the cleaned headlines; targets are the category codes.
X = df.loc[:, 'Title_Treated'].values
y = df.loc[:, 'CATEGORY'].values

# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# itself repeatable for a given row order.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [11]:
# Learn the vocabulary from the training headlines only and turn them into
# a sparse count matrix in one pass. Lowercasing is switched off because
# the headlines were already lowercased during preprocessing.
vect = CountVectorizer(lowercase=False)
X_train = vect.fit_transform(X_train)
X_train

<295693x43908 sparse matrix of type '<class 'numpy.int64'>'
	with 1955459 stored elements in Compressed Sparse Row format>

In [14]:
# Encode the held-out headlines with the vocabulary fitted on the training
# set (transform only, no refit).
X_test = vect.transform(X_test)
X_test

<126726x43908 sparse matrix of type '<class 'numpy.int64'>'
	with 831284 stored elements in Compressed Sparse Row format>

In [16]:
# Linear-kernel support vector classifier. The stray positional `n_j` after
# the keyword argument was a SyntaxError, and SVC exposes no n_jobs-style
# parameter, so it is simply dropped.
# NOTE(review): the recorded run took ~53 minutes; sklearn's LinearSVC is
# the usual faster choice for large sparse text data, but it is a different
# estimator, so the model is left unchanged here.
clf = svm.SVC(kernel='linear')

# Time only the fit; start_time/end_time are reused by the next cell.
start_time = time.time()
clf.fit(X_train, y_train)
end_time = time.time()
print('tempo decorrido: ',end_time-start_time, 'segundos')

# Predict categories for the held-out headlines.
y_pred = clf.predict(X_test)
tempo decorrido:  3181.1094467639923 segundos


In [17]:
# Render the elapsed training time as H:MM:SS.ffffff
import datetime

sec = end_time - start_time
elapsed = datetime.timedelta(seconds=sec)
print(elapsed)

0:53:01.109447


In [18]:
from sklearn import metrics

# Fraction of held-out headlines whose predicted category matches the label.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9432792007954169
