<a href="https://colab.research.google.com/github/thiagorainmaker77/nlp_ufg/blob/main/nlp_ufg.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [160]:
import pandas as pd
import re
import nltk
import gensim
from functools import partial
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer



from sklearn.model_selection import train_test_split


from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.ensemble import ExtraTreesClassifier




from sklearn.metrics import accuracy_score


# Processamento de texto

In [161]:
# Download the NLTK resources used by this notebook. The 'stopwords' corpus
# is read when the Portuguese stop words are stripped from the tweets.
# NOTE(review): 'punkt' is not referenced by any visible cell — confirm it is
# still needed before removing it.
nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [162]:
# Location of the labelled training tweets.
uri_train = 'https://raw.githubusercontent.com/thiagorainmaker77/nlp_ufg/main/train.csv'

# Columns discarded right after loading; they are not used by the classifiers.
train_unused_columns = [
    'Created At',
    'Geo Coordinates.latitude',
    'Geo Coordinates.longitude',
    'User Location',
    'Username',
    'User Screen Name',
    'Retweet Count',
    'Observação',
    'Id',
]

df_train = pd.read_csv(uri_train)
df_train = df_train.drop(columns=train_unused_columns)


In [163]:
df_train

Unnamed: 0,Text,Classificacao
0,Dois são detidos ao tentar jogar celulares e d...,Positivo
1,me matan esas minas q cambian 554 veces su fot...,Neutro
2,Líderes de motim em presídio de Minas Gerais s...,Positivo
3,#Mídia: Press Release from Business Wire : Di...,Neutro
4,Vacinação contra febre amarela é intensificada...,Positivo
...,...,...
6554,Rio faz bloqueio contra febre amarela em munic...,Positivo
6555,Governador Fernando Pimentel entrega 401 veícu...,Positivo
6556,Secretaria de Educação faz reformulações para ...,Positivo
6557,E governo ainda quer indenizar a família dos b...,Neutro


In [164]:
# Strip Portuguese stop words from the tweet text.
#
# The previous version ran one regex pass over the *whole* DataFrame per stop
# word, which is slow and also rewrote the label column. All stop words are
# now combined into a single whole-word alternation applied to 'Text' only.
# Matching stays case-sensitive, as before.
# NOTE(review): capitalised stop words (e.g. at the start of a sentence) are
# therefore kept — confirm whether that is intended.
# NOTE: this rebinds the name `stopwords` (imported from nltk.corpus at the
# top of the notebook) to a plain list, exactly as the original cell did.
stopwords = nltk.corpus.stopwords.words('portuguese')

stopword_pattern = r'\b(?:%s)\b' % '|'.join(re.escape(word) for word in stopwords)
df_train = df_train.assign(
    Text=df_train['Text'].replace(to_replace=stopword_pattern, value="", regex=True)
)

In [165]:

# Ordered (pattern, replacement) pairs applied by replaceAll. Every pattern is
# a raw string so that regex escapes such as \. and \s are not parsed as
# (invalid) Python string escapes, which recent interpreters warn about.
# The order is significant: e.g. repeated dots are collapsed before URLs are
# detected, and hashtags are unwrapped before repeated '!' are collapsed.
_TWEET_SUBSTITUTIONS = (
  (r'(\\u[0-9A-Fa-f]+)', r''),                    # literal backslash-u escape text
  (r'[^\x00-\x7f]', r''),                         # every non-ASCII character
  (r"(\.)\1+", ' multiStop '),                    # two or more consecutive dots
  (r"(\?)\1+", ' multiQuestion '),                # two or more consecutive '?'
  (r'((www\.[^\s]+)|(https?://[^\s]+))', 'url'),  # web addresses
  (r'#([^\s]+)', r'\1'),                          # '#tag' -> 'tag'
  (r"(\!)\1+", ' multiExclamation '),             # two or more consecutive '!'
  (r'@[^\s]+', 'atUser'),                         # user mentions
)


def replaceAll(text):
  """Normalise one tweet for vectorisation.

  Applies, in order: removal of literal backslash-u hex escape text, removal
  of all non-ASCII characters (accented letters are dropped, not
  transliterated), and replacement of repeated punctuation, URLs, hashtags
  and user mentions with placeholder tokens.

  Args:
    text: the tweet as a ``str``.

  Returns:
    The normalised string.

  Raises:
    TypeError: if ``text`` is not a string (raised by ``re.sub``).
  """
  for pattern, replacement in _TWEET_SUBSTITUTIONS:
    text = re.sub(pattern, replacement, text)
  return text



In [166]:
# Apply replaceAll to every training tweet.
# The previous loop assigned to the `row` objects yielded by iterrows(); those
# may be copies, so the cleaned text was not reliably written back to df_train.
# Building the column explicitly guarantees the cleaned text is stored.
df_train = df_train.assign(Text=df_train['Text'].map(replaceAll))


In [167]:
# Raw term-count vectorizer, fitted here on every training tweet.
# NOTE(review): `tf_X` / `tfidf_X` are not read by any visible cell, and the
# same `tf` / `tfidf` instances are reused as the first step of each Pipeline,
# whose fit() refits them on X_train only.
tf = CountVectorizer()
tf_X = tf.fit_transform(df_train['Text'])

# TF-IDF vectorizer fitted on the same text.
tfidf = TfidfVectorizer()
tfidf_X = tfidf.fit_transform(df_train['Text'])

# Classificação monolítica

In [168]:
# Hold out 33% of the labelled tweets; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df_train['Text'],
    df_train['Classificacao'],
    test_size=0.33,
    random_state=42,
)

# Accuracy table: one row per technique, one column per feature representation.
results = pd.DataFrame(columns=['Técnica', 'TF', 'TFIDF'])


In [169]:
def desempenho(predict):
    """Return the accuracy of `predict` against the notebook-level `y_test`.

    The score comes from `accuracy_score` and is rounded to three decimals.
    """
    return round(accuracy_score(y_test, predict), 3)

## MNB

In [170]:
# Multinomial Naive Bayes on raw term counts.
clf_mnb_tf = Pipeline(steps=[('ext', tf), ('cl', MultinomialNB())]).fit(X_train, y_train)
mnb_tf = desempenho(clf_mnb_tf.predict(X_test))

# Multinomial Naive Bayes on TF-IDF weights.
clf_mnb_tfidf = Pipeline(steps=[('tfidf', tfidf), ('cl', MultinomialNB())]).fit(X_train, y_train)
mnb_tfidf = desempenho(clf_mnb_tfidf.predict(X_test))

In [171]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. Re-running this cell appends the row again.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'MultinomialNB',
    'TF': mnb_tf,
    'TFIDF': mnb_tfidf
    }])], ignore_index=True)
 

## RandomForestClassifier

In [172]:
# Random forest on raw term counts (seeded for reproducibility).
clf_rf_tf = Pipeline(
    steps=[('ext', tf), ('cl', RandomForestClassifier(random_state=42))]
).fit(X_train, y_train)
rf_tf = desempenho(clf_rf_tf.predict(X_test))

# Random forest on TF-IDF weights.
clf_rf_tfidf = Pipeline(
    steps=[('tfidf', tfidf), ('cl', RandomForestClassifier(random_state=42))]
).fit(X_train, y_train)
rf_tfidf = desempenho(clf_rf_tfidf.predict(X_test))

In [173]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. Re-running this cell appends the row again.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'RandomForestClassifier',
    'TF': rf_tf,
    'TFIDF': rf_tfidf
    }])], ignore_index=True)
 

## MLP

In [174]:
# Multi-layer perceptron on raw term counts; no TF-IDF variant is trained here.
clf_mlp_tf = Pipeline(
    steps=[('ext', tf), ('cl', MLPClassifier(random_state=42))]
).fit(X_train, y_train)
mlp_tf = desempenho(clf_mlp_tf.predict(X_test))




In [175]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. The MLP has no TF-IDF score, hence the '-' placeholder.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'MLP',
    'TF': mlp_tf,
    'TFIDF': '-'
    }])], ignore_index=True)

## LogisticRegression

In [176]:
# Logistic regression on raw term counts.
clf_lr_tf = Pipeline(
    steps=[('ext', tf), ('cl', LogisticRegression(random_state=42))]
).fit(X_train, y_train)
lr_tf = desempenho(clf_lr_tf.predict(X_test))

# Logistic regression on TF-IDF weights.
clf_lr_tfidf = Pipeline(
    steps=[('tfidf', tfidf), ('cl', LogisticRegression(random_state=42))]
).fit(X_train, y_train)
lr_tfidf = desempenho(clf_lr_tfidf.predict(X_test))

In [177]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. Re-running this cell appends the row again.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'LogisticRegression',
    'TF': lr_tf,
    'TFIDF': lr_tfidf
    }])], ignore_index=True)

## Perceptron

In [178]:
# Perceptron on raw term counts.
clf_per_tf = Pipeline(
    steps=[('ext', tf), ('cl', Perceptron(random_state=42))]
).fit(X_train, y_train)
pr_tf = desempenho(clf_per_tf.predict(X_test))

# Perceptron on TF-IDF weights.
clf_per_tfidf = Pipeline(
    steps=[('tfidf', tfidf), ('cl', Perceptron(random_state=42))]
).fit(X_train, y_train)
pr_tfidf = desempenho(clf_per_tfidf.predict(X_test))

In [179]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. Re-running this cell appends the row again.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'Perceptron',
    'TF': pr_tf,
    'TFIDF': pr_tfidf
    }])], ignore_index=True)

## ExtraTreesClassifier

In [180]:
# Extra-trees ensemble on raw term counts.
clf_ext_tf = Pipeline(
    steps=[('ext', tf), ('cl', ExtraTreesClassifier(random_state=42))]
).fit(X_train, y_train)
ex_tf = desempenho(clf_ext_tf.predict(X_test))

# Extra-trees ensemble on TF-IDF weights.
clf_ext_tfidf = Pipeline(
    steps=[('tfidf', tfidf), ('cl', ExtraTreesClassifier(random_state=42))]
).fit(X_train, y_train)
ex_tfidf = desempenho(clf_ext_tfidf.predict(X_test))

In [181]:
 # DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
 # to add a row. Re-running this cell appends the row again.
 results = pd.concat([results, pd.DataFrame([{
    'Técnica': 'ExtraTreesClassifier',
    'TF': ex_tf,
    'TFIDF': ex_tfidf
    }])], ignore_index=True)

In [182]:
# Registry of every fitted pipeline, keyed by "<model>_<feature type>".
cls = dict([
    ('ext_tf', clf_ext_tf),
    ('ext_tfidf', clf_ext_tfidf),
    ('per_tfidf', clf_per_tfidf),
    ('per_tf', clf_per_tf),
    ('lr_tf', clf_lr_tf),
    ('lr_tfidf', clf_lr_tfidf),
    ('mlp_tf', clf_mlp_tf),
    ('rf_tf', clf_rf_tf),
    ('rf_tfidf', clf_rf_tfidf),
    ('mnb_tf', clf_mnb_tf),
    ('mnb_tfidf', clf_mnb_tfidf),
])


## Análise de viabilidade de combinação

In [183]:
results

Unnamed: 0,Técnica,TF,TFIDF
0,MultinomialNB,0.944,0.942
1,RandomForestClassifier,0.959,0.957
2,MLP,0.956,-
3,LogisticRegression,0.958,0.955
4,Perceptron,0.956,0.952
5,ExtraTreesClassifier,0.957,0.959


In [184]:
# Class-probability estimates on the hold-out split from three of the fitted
# pipelines (logistic regression on counts, extra trees on TF-IDF, naive Bayes
# on TF-IDF). Each frame gets default integer column labels.
proba_lr = pd.DataFrame(clf_lr_tf.predict_proba(X_test))

proba_ext = pd.DataFrame(clf_ext_tfidf.predict_proba(X_test))

proba_mnb = pd.DataFrame(clf_mnb_tfidf.predict_proba(X_test))

In [185]:
# Side-by-side concatenation of the three probability frames; the integer
# column labels repeat once per model, and column order is what identifies
# each feature.
prob = pd.concat([proba_lr, proba_ext, proba_mnb], axis=1)

In [186]:
prob

Unnamed: 0,0,1,2,0.1,1.1,2.1,0.2,1.2,2.2
0,0.998889,0.001097,0.000014,1.00,0.0,0.00,0.999988,0.000011,0.000001
1,0.000790,0.000962,0.998248,0.02,0.0,0.98,0.004093,0.002818,0.993089
2,0.001500,0.002994,0.995506,0.00,0.0,1.00,0.001645,0.003188,0.995167
3,0.999174,0.000722,0.000104,1.00,0.0,0.00,0.999926,0.000055,0.000019
4,0.000550,0.003083,0.996367,0.00,0.0,1.00,0.001244,0.004903,0.993853
...,...,...,...,...,...,...,...,...,...
2160,0.000079,0.000089,0.999832,0.00,0.0,1.00,0.000356,0.000504,0.999140
2161,0.998889,0.001097,0.000014,1.00,0.0,0.00,0.999988,0.000011,0.000001
2162,0.001172,0.010487,0.988341,0.00,0.0,1.00,0.000838,0.003248,0.995914
2163,0.833360,0.137253,0.029388,1.00,0.0,0.00,0.773022,0.139802,0.087177


In [187]:
# Base-model probabilities next to the true hold-out label.
# reset_index(drop=True) discards y_test's original row labels so the rows
# line up positionally with `prob`. The previous version dropped the 'index'
# column with inplace=True and assigned the (None) return value back to `rs`.
rs = pd.concat([prob, y_test.reset_index(drop=True)], axis=1)

In [188]:
# Meta-classifier: an MLP fitted on the base models' concatenated class
# probabilities for X_test, with y_test as the target.
# NOTE(review): the stacked model is fitted on the entire hold-out split, so
# no visible cell measures its accuracy on unseen labelled data — confirm
# this is intended.
mlp = MLPClassifier(random_state=42)
mlp.fit(prob, y_test)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_fun=15000, max_iter=200,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=42, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=False,
              warm_start=False)

# Classificação Final

In [226]:


# Location of the unlabelled tweets to classify.
uri_teste = 'https://raw.githubusercontent.com/thiagorainmaker77/nlp_ufg/main/test.csv'

# Columns discarded right after loading; 'Id' is kept because it is written
# to the output file next to the predicted label.
teste_unused_columns = [
    'Created At',
    'Geo Coordinates.latitude',
    'Geo Coordinates.longitude',
    'User Location',
    'Username',
    'User Screen Name',
    'Retweet Count',
    'Observação',
]

df_teste = pd.read_csv(uri_teste)
df_teste = df_teste.drop(columns=teste_unused_columns)


In [227]:
df_teste

Unnamed: 0,Text,Id
0,RT @JDanieldf: Pedindo para que MG reaja? Reag...,3568
1,Homem que matou ex-mulher e jogou corpo em cis...,1323
2,"New post: ""Três adolescentes são apreendidos p...",7976
3,RT @AnaPaulaVolei: Mais 2 helicópteros!!A cara...,2408
4,"RT @UOLNoticias: Custaram R$ 21,8 milhões: Mes...",4435
...,...,...
1635,RT @ivo123zarate3: Me hace mal ver en instagra...,3536
1636,@PMMG190 - Militares da 22ª Cia prendem autore...,6881
1637,Cadeia em Manaus tem 4 mortos; Estados pedem a...,627
1638,Reforma da Previdência será feita no primeiro ...,2165


In [228]:

# Strip Portuguese stop words from the tweets to classify.
#
# The previous version ran one regex pass over the *whole* DataFrame per stop
# word. All stop words are now combined into a single whole-word alternation
# applied to 'Text' only. Matching stays case-sensitive, as before.
# NOTE(review): capitalised stop words are therefore kept — confirm whether
# that is intended.
stopwords = nltk.corpus.stopwords.words('portuguese')

stopword_pattern = r'\b(?:%s)\b' % '|'.join(re.escape(word) for word in stopwords)
df_teste = df_teste.assign(
    Text=df_teste['Text'].replace(to_replace=stopword_pattern, value="", regex=True)
)

In [230]:
# Apply replaceAll to every tweet to classify.
# The previous loop assigned to the `row` objects yielded by iterrows(); for
# this frame (text plus integer 'Id') each row is a copy, so the cleaned text
# never reached df_teste. Building the column explicitly stores it.
df_teste = df_teste.assign(Text=df_teste['Text'].map(replaceAll))

In [231]:
# Class-probability estimates for the unlabelled tweets, from the same three
# pipelines (and in the same order) used to build the meta-classifier's
# training features.
lr_test = pd.DataFrame(clf_lr_tf.predict_proba(df_teste['Text']))

ext_test = pd.DataFrame(clf_ext_tfidf.predict_proba(df_teste['Text']))

mnb_test = pd.DataFrame(clf_mnb_tfidf.predict_proba(df_teste['Text']))

In [232]:
# Column order must match the frame the meta-classifier was fitted on:
# logistic regression, extra trees, naive Bayes.
prob_test = pd.concat([lr_test, ext_test, mnb_test], axis=1)

In [233]:
# Final labels from the meta-classifier.
predictions = mlp.predict(prob_test)

In [257]:

# Wrap the predicted labels in a Series (default integer index, no name).
ser = pd.Series(predictions)

In [259]:
# Pair each tweet Id with its predicted label; rows are aligned on the index.
# NOTE(review): `ser` has no name, so the label column is labelled 0 — confirm
# the header the consumer of the CSV expects.
res_predictions = pd.concat([df_teste['Id'], ser] , axis=1)

In [263]:
# Write the Id/label pairs to 'file.csv' in the working directory, without the row index.
res_predictions.to_csv('file.csv', index= False)