# Aplicando Transformers en Análisis de Sentimientos.

**Investigadores**: <br>
  Dr. Ramón Zatarain Cabada<br>
  Dra. María Lucía Barrón Estrada<br>
  M.C. Víctor Manuel Bátiz Beltrán

**Corpus**: SentiText


### Descripción general
Usaremos el dataset SentiText.



### Pasos iniciales
Instalamos e importamos las bibliotecas a utilizar.

In [None]:
!pip install emoji

In [None]:
import re
#import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import SnowballStemmer
import unicodedata
from collections import Counter
from wordcloud import WordCloud
from gensim.utils import simple_preprocess
import gensim
from sklearn.model_selection import train_test_split
import spacy
import pickle
import warnings
warnings.filterwarnings('ignore')
#import seaborn as sns
#from sklearn.metrics import confusion_matrix
#import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import emoji
import keras
from keras import backend as K
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
print('Listo')

## 1. Cargando el dataset

### Descargando el corpus desde el sitio Web de PersonApp.

La primera celda de código fue necesaria para poder usar el modo GPU, ya que sin ella se producía un error de codificación (encoding).

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return the fixed encoding name "UTF-8".

    ``do_setlocale`` is accepted but never read, so the result is the same
    for every call.
    """
    forced_encoding = "UTF-8"
    return forced_encoding
locale.getpreferredencoding = getpreferredencoding

In [None]:
def corpus_download(path, url):
  """Download ``url`` with ``wget`` and write the result to ``path``.

  The body is an IPython ``!`` shell escape, so this function only works
  when the cell is run by IPython/Jupyter. ``{url}`` and ``{path}`` are
  interpolated into the shell command line as-is.
  """
  # --no-check-certificate: wget skips TLS certificate validation for this download.
  !wget --no-check-certificate \
     {url} \
     -O {path}

In [None]:
corpus_download("SentiText.csv","https://person-app-itc.web.app/corpus/SentiText.csv")

In [None]:
data = pd.read_csv("SentiText.csv")

### Exploración de los datos

In [None]:
data.head()

In [None]:
len(data)

We convert the labels to a numerical representation: 0 = negative and 1 = positive.

In [None]:
# Map the string labels to integers: 'negativo' -> 0 and 'positivo' -> 1.
# NOTE(review): values other than these two are presumably left unchanged by replace() — confirm the corpus has no other labels.
data['Label'] = data['Label'].replace({'negativo':0, 'positivo':1})

## 2. Limpieza de datos (Data cleaning)

In [None]:
data.head()

In [None]:
print(data.dtypes)


In [None]:
#Check if we have null fields
data.isnull().sum()

In [None]:
# In case we have null texts, replace them with a placeholder string.
# The filled column is assigned back to `data` instead of calling
# fillna(inplace=True) on the `data["Text"]` selection: that form is chained
# assignment, which recent pandas versions warn about and which does not
# update `data` when Copy-on-Write is enabled.
data["Text"] = data["Text"].fillna("Sin texto")

### A continuación realizaremos los siguientes pasos:

1. Separar el texto en Tokens
2. Convertir palabras a minúsculas
3. Expandir contracciones
4. Remover urls, correos, saltos de línea
5. Eliminar caracteres repetidos
6. Eliminar nuevas líneas y pestañas
7. Remover saltos de línea
8. Remover comillas simples
9. Eliminar comas " , "
10. Remover números
11. Remover Caracteres no alfanuméricos
12. Eliminar guiones entre palabras
13. Eliminar los guiones dobles y triples
14. Eliminar espacios en blanco (al principio, final y espacios dobles)
15. Eliminar stop words
16. Realizar stemming/lematización  
17. Remover signos de puntuación
18. Destokenizar


In [None]:
# Patterns are compiled once and reused on every call.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_EMAIL_RE = re.compile(r'\S*@\S*\s?')
_WHITESPACE_RE = re.compile(r'\s+')
_DIGITS_RE = re.compile(r'\d+')
# Every character that is NOT a lowercase ASCII letter or a lowercase Spanish
# accented letter. Each letter is listed explicitly: separators such as "," or
# "-" inside a character class would be kept as literals or create ranges.
_NON_LETTER_RE = re.compile(r'[^a-záéíóúüñ]')


def depurar_datos(data):
    """Clean a raw text string down to lowercase Spanish words.

    Steps, in order: remove URLs, remove e-mail-like tokens, collapse
    whitespace (including line breaks), lowercase, drop single quotes,
    drop digits, replace every remaining non-letter character (punctuation,
    commas, hyphens, symbols) with a space, and finally normalise spacing.

    Args:
        data: The text to clean; must be a ``str``.

    Returns:
        The cleaned text, with words separated by single spaces and no
        leading or trailing whitespace. An empty string if nothing is left.
    """
    # Remove URLs.
    data = _URL_RE.sub('', data)

    # Remove e-mail-like tokens (anything containing "@").
    data = _EMAIL_RE.sub('', data)

    # Turn line breaks/tabs/space runs into a single space.
    data = _WHITESPACE_RE.sub(' ', data)

    # Lowercase first so the letter whitelist below only needs lowercase forms.
    data = data.lower()

    # Remove single quotes without inserting a space ("it's" -> "its").
    data = data.replace("'", "")

    # Remove numbers.
    data = _DIGITS_RE.sub('', data)

    # Replace everything that is not a letter with a space.
    data = _NON_LETTER_RE.sub(' ', data)

    # split()/join trims both ends and collapses repeated spaces in one step.
    return " ".join(data.split())

In [None]:
def process_text0(text):
    """Strip https links and @mentions from ``text`` and spell out emojis.

    A space is appended after every character in the two emoji code-point
    ranges below before ``emoji.demojize`` is applied. Double spaces are then
    replaced by single spaces in one ``str.replace`` pass.
    """
    without_links_or_mentions = re.sub(r'https://\S+|\B@\w+\b', '', text)
    spaced_emojis = re.sub(
        r'([\U0001F300-\U0001F64F\U0001F680-\U0001F6FF])',
        r'\1 ',
        without_links_or_mentions,
    )
    demojized = emoji.demojize(spaced_emojis)
    return demojized.replace("  ", " ")

def process_text(sentence, norm_user=True, norm_hashtag=True, separate_characters=True):
    """Normalise one text sample: lowercase, mask users/URLs, spell out emojis.

    Args:
        sentence: Value to process; it is converted with ``str()`` first.
        norm_user: When equal to True, every ``@name`` becomes ``@usuario``.
        norm_hashtag: When equal to True, URL-like tokens become ``url``
            (despite its name, this flag gates the URL replacement).
        separate_characters: When equal to True, the listed punctuation
            characters are surrounded by spaces.

    Returns:
        The processed string after whitespace collapsing and
        ``emoji.demojize``.
    """
    normalized = str(sentence).lower()

    # `== True` means only values equal to True (True, 1) enable each step.
    if norm_user == True:
        normalized = re.sub(r'\@\w+','@usuario', normalized)
    if norm_hashtag == True:
        normalized = re.sub(r"http\S+|www\S+|https\S+", 'url', normalized, flags=re.MULTILINE)

    if separate_characters == True:
        # (pattern, replacement) pairs, applied one after another in this order.
        separators = (
            (r":", " : "),
            (r",", " , "),
            (r"\.", " . "),
            (r"!", " ! "),
            (r"¡", " ¡ "),
            (r"“", " “ "),
            (r"'", " ' "),
            (r"”", " ” "),
            (r"\(", " ( "),
            (r"\)", " ) "),
            (r"\?", " ? "),
            (r"\¿", " ¿ "),
        )
        for pattern, spaced in separators:
            normalized = re.sub(pattern, spaced, normalized)

    # Collapse whitespace runs before demojizing.
    normalized = re.sub(r'\s+', ' ', normalized, flags=re.I)
    return emoji.demojize(normalized)

In [None]:
clean_data = data.copy()
clean_data['Text'] = clean_data['Text'].apply(process_text)

In [None]:
clean_data.head()

### Eliminando las palabras que no aportan valor (stopwords)

In [None]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
nltk.download('stopwords')
print(stopwords.words('spanish'))

In [None]:
stop_words = set(stopwords.words('spanish'))

In [None]:
def remove_stopwords(text):
  """Tokenize ``text`` and return it without the tokens found in ``stop_words``.

  The surviving tokens are joined back together with single spaces.
  """
  kept_tokens = (token for token in word_tokenize(text) if token not in stop_words)
  return " ".join(kept_tokens)

In [None]:
remove_stopwords('el que tiene tienda la debe atender')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(remove_stopwords)

### Lematización

In [None]:
#https://spacy.io/models/es
#We'll use Spacy for Lematization
!python -m spacy download es_core_news_sm

In [None]:
import spacy
import es_core_news_sm
nlp = es_core_news_sm.load()

In [None]:
def lematize(text):
    """Run ``text`` through the ``nlp`` pipeline and return its lemmas.

    Each token is replaced by its ``lemma_`` attribute; the lemmas are joined
    with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
lematize('yo soy muy feliz con mi familia')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(lematize)

### Retirando elementos de puntuación y acentos (Punctuation Cleaning)



In [None]:
def cleaning_punct(text):
  """Tokenize ``text`` with gensim's ``simple_preprocess`` and re-join the tokens.

  ``deacc=True`` is passed so accent marks are stripped from the tokens.
  NOTE(review): punctuation removal and any token-length filtering presumably
  come from simple_preprocess's own tokenization defaults — confirm against
  the gensim documentation.
  """
  tokens = gensim.utils.simple_preprocess(str(text), deacc=True)
  cleaned = " ".join(tokens)
  return cleaned

In [None]:
cleaning_punct('mi méxico querido qué fantástico')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(cleaning_punct)

## 3. Construcción del modelo

In [None]:
#clases = ['Negative','Positive']

### Transformers

Pasos iniciales

In [None]:
!pip install transformers==4.24.0
!pip install simpletransformers==0.63.11

In [None]:
# NOTE(review): the previous cell pins transformers==4.24.0 and
# simpletransformers==0.63.11, while this cell installs without a version
# constraint — confirm which of the two installs is intended and run only that one.
!pip install transformers
!pip install simpletransformers

In [None]:
!pip install emoji
# install simpletransformers
#!pip install simpletransformers

# check installed version
#!pip freeze | grep simpletransformers
# simpletransformers==0.28.2

In [None]:
pip show simpletransformers

### Cargando los modelos preentrenados

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
import logging # Import the logging module

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
#logging.basicConfig(level=logging.INFO)
#transformers_logger = logging.getLogger("transformers")
#transformers_logger.setLevel(logging.WARNING)

In [None]:
# Work on a copy so clean_data keeps its original column names.
clean_data2 = clean_data.copy()
# Rename 'Text' -> 'text' and 'Label' -> 'labels'
# (presumably the column names simpletransformers expects — TODO confirm).
clean_data2.rename(columns = {'Text':'text','Label':'labels'}, inplace = True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set.
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs — confirm whether reproducible metrics are needed.
train_df, test_df = train_test_split(clean_data2, test_size=0.20)

# Report the (rows, columns) shape of each partition.
print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

In [None]:
# Optional model configuration
# NOTE(review): model_args is not passed to ClassificationModel below (only
# train_args is) — confirm whether it is still needed.
model_args = ClassificationArgs(num_train_epochs=1)

# Training options handed to the model as a plain dict.
train_args ={"reprocess_input_data": True,
             "fp16":False,
             "num_train_epochs": 1, # A single epoch is used to keep training time short
             "overwrite_output_dir": True}

# Create a ClassificationModel with two output labels.
# NOTE(review): 'bert-base-uncased' is presumably the English BERT checkpoint,
# while the text was cleaned with Spanish stopwords and a Spanish spaCy model —
# confirm this is the intended checkpoint.
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=2,
    args=train_args
)

### Entrenamos el modelo

In [None]:
# Train the model
model.train_model(train_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Evaluate the model on the held-out test set.
# The extra keyword arguments are scikit-learn metric callables; presumably each
# result is stored in `result` under its keyword name (the notebook reads
# result['acc'] afterwards) — confirm against the simpletransformers docs.
result, model_outputs, wrong_predictions = model.eval_model(test_df,f1=f1_score, acc=accuracy_score, rc=recall_score, pcs=precision_score)

In [None]:
result['acc']

### Probando el modelo

In [None]:
from sklearn.metrics import recall_score
from sklearn import metrics

In [None]:
test_df.head()

In [None]:
test = test_df['text'].to_numpy().tolist()
y = test_df['labels'].to_numpy().tolist()
print(test[0:10])
print(y[0:10])
print(len(test))
print(len(y))

In [None]:
predictions_test = model.predict(test)

In [None]:
# Compare the true labels `y` with the first element returned by model.predict
# (presumably the predicted labels — confirm against the simpletransformers docs).
# Recall, F1 and precision are macro-averaged over the classes.
test_recall = metrics.recall_score(y, predictions_test[0], average='macro')
test_f1 = metrics.f1_score(y, predictions_test[0], average='macro')
test_precision = metrics.precision_score(y, predictions_test[0], average='macro')
test_accuracy = metrics.accuracy_score(y, predictions_test[0])

In [None]:
print("Metrics results:")
print(f"Accuracy: {test_accuracy}")
print(f"F1: {test_f1}")
print(f"Precision: {test_precision}")
print(f"Recall: {test_recall}")