<a href="https://colab.research.google.com/github/thiagorainmaker77/nlp_ufg/blob/main/nlp_ufg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import nltk
import gensim
from functools import partial
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



from sklearn.model_selection import train_test_split


from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier




from sklearn.metrics import accuracy_score


# processamento de texto

In [2]:
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
uri_train  = 'https://raw.githubusercontent.com/thiagorainmaker77/nlp_ufg/main/train.csv'
df_train = pd.read_csv(uri_train).drop(columns=['Created At', 
                                                'Geo Coordinates.latitude', 
                                                'Geo Coordinates.longitude', 
                                                'User Location',  
                                                'Username', 
                                                'User Screen Name', 
                                                'Retweet Count', 
                                                'Observação', 
                                                'Id'])


In [4]:
df_train

Unnamed: 0,Text,Classificacao
0,Dois são detidos ao tentar jogar celulares e d...,Positivo
1,me matan esas minas q cambian 554 veces su fot...,Neutro
2,Líderes de motim em presídio de Minas Gerais s...,Positivo
3,#Mídia: Press Release from Business Wire : Di...,Neutro
4,Vacinação contra febre amarela é intensificada...,Positivo
...,...,...
6554,Rio faz bloqueio contra febre amarela em munic...,Positivo
6555,Governador Fernando Pimentel entrega 401 veícu...,Positivo
6556,Secretaria de Educação faz reformulações para ...,Positivo
6557,E governo ainda quer indenizar a família dos b...,Neutro


In [5]:
stopwords = nltk.corpus.stopwords.words('portuguese')
for i in stopwords :
    df_train = df_train.replace(to_replace=r'\b%s\b'%i, value="",regex=True)

In [6]:

def replaceAll(text):
  text = re.sub(r'(\\u[0-9A-Fa-f]+)',r'', text)       
  text = re.sub(r'[^\x00-\x7f]',r'',text)
  text = re.sub(r"(\.)\1+", ' multiStop ', text)
  text = re.sub(r"(\?)\1+", ' multiQuestion ', text)
  text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','url',text)
  text = re.sub(r'#([^\s]+)', r'\1', text)
  text = re.sub(r"(\!)\1+", ' multiExclamation ', text)
  text = re.sub('@[^\s]+','atUser',text)

  return text



In [7]:
for index, row in df_train.iterrows():
  row['Text'] = replaceAll(row['Text'])


In [8]:
tf = CountVectorizer()
tf_X = tf.fit_transform(df_train['Text'])

tfidf = TfidfVectorizer()
tfidf_X = tfidf.fit_transform(df_train['Text'])

#Classificação monolítica

In [9]:
X_train, X_test, y_train, y_test = train_test_split( df_train['Text'],  df_train['Classificacao'], test_size=0.33, random_state=42)

results = pd.DataFrame(columns=['Técnica', 'TF', 'TFIDF'])


In [10]:
def desempenho(predict):
    score = accuracy_score(y_test,predict)
    return round(score, 3)

## MNB

In [11]:
clf_mnb_tf = Pipeline([
            ('ext', tf),
            ('cl',   MultinomialNB())])

clf_mnb_tf.fit(X_train, y_train)
mnb_tf = desempenho(clf_mnb_tf.predict(X_test))



clf_mnb_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   MultinomialNB())])

clf_mnb_tfidf.fit(X_train, y_train)
mnb_tfidf = desempenho(clf_mnb_tfidf.predict(X_test))

In [12]:
 results = results.append({
    'Técnica': 'MultinomialNB', 
    'TF': mnb_tf, 
    'TFIDF': mnb_tfidf
    }, ignore_index=True)
 

## RandomForestClassifier

In [13]:
clf_rf_tf = Pipeline([
            ('ext', tf),
            ('cl',   RandomForestClassifier(random_state=42))])

clf_rf_tf.fit(X_train, y_train)
rf_tf = desempenho(clf_rf_tf.predict(X_test))



clf_rf_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   RandomForestClassifier(random_state=42))])

clf_rf_tfidf.fit(X_train, y_train)
rf_tfidf = desempenho(clf_rf_tfidf.predict(X_test))

In [14]:
 results = results.append({
    'Técnica': 'RandomForestClassifier', 
    'TF': rf_tf, 
    'TFIDF': rf_tfidf
    }, ignore_index=True)
 

## MLP

In [15]:
clf_mlp_tf = Pipeline([
            ('ext', tf),
            ('cl',   MLPClassifier(random_state=42))])

clf_mlp_tf.fit(X_train, y_train)
mlp_tf = desempenho(clf_mlp_tf.predict(X_test))



clf_mlp_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   MLPClassifier(random_state=42))])

clf_mlp_tfidf.fit(X_train, y_train)
mlp_tfidf = desempenho(clf_mlp_tfidf.predict(X_test))

In [16]:
 results = results.append({
    'Técnica': 'MLP', 
    'TF': mlp_tf, 
    'TFIDF': mlp_tfidf
    }, ignore_index=True)

##LogisticRegression

In [17]:
clf_lr_tf = Pipeline([
            ('ext', tf),
            ('cl',   LogisticRegression(random_state=42))])

clf_lr_tf.fit(X_train, y_train)
lr_tf = desempenho(clf_lr_tf.predict(X_test))



clf_lr_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   LogisticRegression(random_state=42))])

clf_lr_tfidf.fit(X_train, y_train)
lr_tfidf = desempenho(clf_lr_tfidf.predict(X_test))

In [18]:
 results = results.append({
    'Técnica': 'LogisticRegression', 
    'TF': lr_tf, 
    'TFIDF': lr_tfidf
    }, ignore_index=True)

##Perceptron

In [19]:
clf_per_tf = Pipeline([
            ('ext', tf),
            ('cl',   Perceptron(random_state=42))])

clf_per_tf.fit(X_train, y_train)
pr_tf = desempenho(clf_per_tf.predict(X_test))



clf_per_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   Perceptron(random_state=42))])

clf_per_tfidf.fit(X_train, y_train)
pr_tfidf = desempenho(clf_per_tfidf.predict(X_test))

In [20]:
 results = results.append({
    'Técnica': 'Perceptron', 
    'TF': pr_tf, 
    'TFIDF': pr_tfidf
    }, ignore_index=True)

## ExtraTreesClassifier

In [21]:
clf_ext_tf = Pipeline([
            ('ext', tf),
            ('cl',   ExtraTreesClassifier(random_state=42))])

clf_ext_tf.fit(X_train, y_train)
ex_tf = desempenho(clf_ext_tf.predict(X_test))



clf_ext_tfidf = Pipeline([
            ('tfidf', tfidf),
            ('cl',   ExtraTreesClassifier(random_state=42))])

clf_ext_tfidf.fit(X_train, y_train)
ex_tfidf = desempenho(clf_ext_tfidf.predict(X_test))

In [22]:
 results = results.append({
    'Técnica': 'ExtraTreesClassifier', 
    'TF': ex_tf, 
    'TFIDF': ex_tfidf
    }, ignore_index=True)

In [26]:
cls = {
    'ext_tf': clf_ext_tf, 
    'ext_tfidf': clf_ext_tfidf,
    'per_tfidf': clf_per_tfidf, 
    'per_tf': clf_per_tf, 
    'lr_tf': clf_lr_tf, 
    'lr_tfidf': clf_lr_tfidf, 
    'mlp_tf': clf_mlp_tf, 
    'mlp_tfidf': clf_mlp_tfidf, 
    'rf_tf': clf_rf_tf, 
    'rf_tfidf': clf_rf_tfidf, 
    'mnb_tf': clf_mnb_tf, 
    'mnb_tfidf': clf_mnb_tfidf} 


## Analise de viabilidade de combinação

In [50]:
results

Unnamed: 0,Técnica,TF,TFIDF
0,MultinomialNB,0.944,0.942
1,RandomForestClassifier,0.959,0.957
2,MLP,0.956,0.952
3,LogisticRegression,0.958,0.955
4,Perceptron,0.956,0.952
5,ExtraTreesClassifier,0.957,0.959
