# Aplicando Transformers en Análisis de Sentimientos.

**Investigadores**: <br>
  Dr. Ramón Zatarain Cabada<br>
  Dra. María Lucía Barrón Estrada<br>
  M.C. Víctor Manuel Bátiz Beltrán

**Corpus**: SentiText


### General description

We'll use the SentiText dataset.

This is the first version; it follows the same approach used in the first experiments with PersonText.

### First step
We install and import libraries we will be using.

In [None]:
!pip install emoji

In [None]:
import re
#import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import SnowballStemmer
import unicodedata
from collections import Counter
from wordcloud import WordCloud
from gensim.utils import simple_preprocess
import gensim
from sklearn.model_selection import train_test_split
import spacy
import pickle
import warnings
warnings.filterwarnings('ignore')
#import seaborn as sns
#from sklearn.metrics import confusion_matrix
#import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import emoji
import keras
from keras import backend as K
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
print('Listo')

## 1. Loading the dataset

### Descargando el corpus desde el sitio Web de PersonApp.

La primera celda de código fue necesaria para poder usar el modo GPU, ya que sin ella se producía un error de encoding.

In [None]:
import locale
def getpreferredencoding(do_setlocale=True):
    """Return "UTF-8" unconditionally.

    Stand-in for ``locale.getpreferredencoding``; ``do_setlocale`` is accepted
    only to keep the same signature and is ignored.
    """
    return "UTF-8"


# Monkey-patch the locale module so every caller that asks for the preferred
# encoding gets UTF-8.
locale.getpreferredencoding = getpreferredencoding

In [None]:
def corpus_download(path, url):
  # Download `url` and write it to the local file `path` by shelling out to
  # wget through the IPython `!` escape ({url} and {path} are interpolated
  # into the command line).
  # NOTE(review): --no-check-certificate turns off TLS certificate
  # verification, and both arguments are interpolated unquoted, so only call
  # this with trusted values.
  !wget --no-check-certificate \
     {url} \
     -O {path}

In [None]:
corpus_download("SentiText.csv","https://person-app-itc.web.app/corpus/SentiText.csv")

In [None]:
data = pd.read_csv("SentiText.csv")

### Exploración de los datos

In [None]:
data.head()

In [None]:
len(data)

We change the labels to a numerical representation: 0 = negative and 1 = positive.

In [None]:
data['Label'] = data['Label'].replace({'negativo':0, 'positivo':1})

## 2. Data cleaning

In [None]:
data.head()

In [None]:
print(data.dtypes)


In [None]:
#Check if we have null fields
data.isnull().sum()

In [None]:
# Replace missing texts with a placeholder so the cleaning steps always
# receive a string. The result is assigned back to the column instead of
# calling fillna(inplace=True) on the selected Series: the chained in-place
# form is deprecated in pandas 2.x and does not update `data` under
# copy-on-write.
data["Text"] = data["Text"].fillna("Sin texto")

### A continuación realizaremos los siguientes pasos:

1. Separar el texto en Tokens
2. Convertir palabras a minúsculas
3. Expandir contracciones
4. Remover urls, correos, saltos de línea
5. Eliminar caracteres repetidos
6. Eliminar nuevas líneas y pestañas
7. Remover saltos de línea
8. Remover comillas simples
9. Eliminar comas " , "
10. Remover números
11. Remover Caracteres no alfanuméricos
12. Eliminar guiones entre palabras
13. Eliminar los guiones dobles y triples
14. Eliminar espacios en blanco (al principio, final y espacios dobles)
15. Eliminar stop words
16. Realizar stemming/Lematizacion  
17. Remover signos de puntuación
18. Destokenizar


In [None]:
def depurar_datos(data):
    """Normalize a raw text string before tokenization.

    Steps, in order: remove URLs and e-mail addresses, lowercase, remove
    apostrophes and digits, replace every character that is not a Spanish
    letter with a space, and collapse whitespace.

    Args:
        data: The text to clean (a single string, despite the name).

    Returns:
        The cleaned lowercase string, with single spaces between words and
        no leading or trailing whitespace.
    """
    # Remove URLs.
    data = re.sub(r'https?://\S+|www\.\S+', '', data)

    # Remove e-mail addresses (raw string: '\S' is an invalid escape otherwise).
    data = re.sub(r'\S*@\S*\s?', '', data)

    # Lowercase first so the letter whitelist only needs lowercase entries.
    data = data.lower()

    # Remove apostrophes without inserting a space.
    data = data.replace("'", "")

    # Remove digits without inserting a space.
    data = re.sub(r'\d+', '', data)

    # Replace anything that is not a Spanish letter with a space. The letters
    # are listed explicitly: inside a character class, ',' and '-' are literal
    # characters or range operators, not separators, so the previous pattern
    # kept commas, hyphens and unintended accented ranges.
    data = re.sub(r"[^a-záéíóúüñ]", " ", data)

    # Trim both ends and collapse runs of whitespace (including line breaks).
    return " ".join(data.split())

In [None]:
def process_text0(text):
    """Remove links and @mentions from ``text`` and convert emojis to text.

    Args:
        text: The raw string to process.

    Returns:
        The string without http/https links or @mentions, with emojis
        replaced by the output of ``emoji.demojize`` and runs of spaces
        collapsed to a single space.
    """
    # Remove links (http as well as https) and @mentions.
    text = re.sub(r'https?://\S+|\B@\w+\b', '', text)
    # Add a space after each emoji in these ranges so consecutive emojis are
    # still separated once they are replaced by their textual form.
    text = re.sub(r'([\U0001F300-\U0001F64F\U0001F680-\U0001F6FF])', r'\1 ', text)
    text = emoji.demojize(text)
    # Collapse every run of spaces; a single replace("  ", " ") pass leaves
    # runs of three or more spaces only partially collapsed.
    text = re.sub(r' {2,}', ' ', text)
    return text

def process_text(sentence, norm_user=True, norm_hashtag=True, separate_characters=True):
    """Normalize one text sample for the sentiment models.

    Args:
        sentence: The sample to process; it is converted with ``str()`` first.
        norm_user: If truthy, replace every ``@name`` mention with
            ``@usuario``.
        norm_hashtag: If truthy, replace every URL (anything starting with
            ``http`` or ``www``) with the token ``url``. Despite its name this
            flag does not touch hashtags; the name is kept for compatibility.
        separate_characters: If truthy, surround the punctuation marks
            ``: , . ! ¡ “ ' ” ( ) ? ¿`` with spaces.

    Returns:
        The lowercase string with whitespace collapsed and emojis replaced by
        the output of ``emoji.demojize``.
    """
    sentence = str(sentence).lower()

    # Normalize user mentions and URLs to fixed placeholder tokens.
    if norm_user:
        sentence = re.sub(r'\@\w+', '@usuario', sentence)
    if norm_hashtag:
        # 'http\S+' already covers https links, so no separate alternative
        # is needed for them.
        sentence = re.sub(r"http\S+|www\S+", 'url', sentence)

    # Pad each punctuation mark with spaces in one pass. This is equivalent to
    # substituting the marks one by one, because every replacement only adds
    # spaces around a single character.
    if separate_characters:
        sentence = re.sub(r"([:,.!¡“'”()?¿])", r" \1 ", sentence)

    # Collapse any run of whitespace into a single space.
    sentence = re.sub(r'\s+', ' ', sentence)

    # Convert emojis to text last, after the punctuation has been padded.
    sentence = emoji.demojize(sentence)

    return sentence

In [None]:
clean_data = data.copy()
clean_data['Text'] = clean_data['Text'].apply(process_text)

In [None]:
clean_data.head()

## Removing stopwords

In [None]:
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [None]:
nltk.download('stopwords')
print(stopwords.words('spanish'))

In [None]:
stop_words = set(stopwords.words('spanish'))

In [None]:
def remove_stopwords(text):
  """Return ``text`` without the words found in the module-level ``stop_words``.

  The text is tokenized with NLTK's ``word_tokenize`` and the remaining
  tokens are joined with single spaces.

  Args:
      text: The string to filter.

  Returns:
      The space-joined tokens that are not in ``stop_words``.
  """
  # The corpus is Spanish, so use the Spanish model instead of the default
  # English one.
  word_tokens = word_tokenize(text, language='spanish')
  return " ".join(word for word in word_tokens if word not in stop_words)

In [None]:
remove_stopwords('el que tiene tienda la debe atender')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(remove_stopwords)

## Lematización

In [None]:
#https://spacy.io/models/es
#We'll use Spacy for Lematization
!python -m spacy download es_core_news_sm

In [None]:
import spacy
import es_core_news_sm
nlp = es_core_news_sm.load()

In [None]:
def lematize(text):
    """Return ``text`` with every token replaced by its lemma.

    Runs the module-level spaCy pipeline ``nlp`` over ``text`` and joins the
    ``lemma_`` of each token with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
lematize('yo soy muy feliz con mi familia')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(lematize)

## Punctuation Cleaning



In [None]:
def cleaning_punct(text):
  """Tokenize ``text`` with gensim's ``simple_preprocess`` and re-join it.

  The input is converted with ``str()`` and the resulting tokens are joined
  with single spaces.
  """
  # NOTE(review): per the gensim documentation, deacc=True strips accent
  # marks (punctuation is dropped by the tokenization itself) and tokens
  # shorter than 2 or longer than 15 characters are discarded by default;
  # confirm that losing accents and one-letter words is intended.
  return " ".join(gensim.utils.simple_preprocess(str(text), deacc=True))

In [None]:
cleaning_punct('mi méxico querido qué fantástico')

In [None]:
clean_data['Text'] = clean_data['Text'].apply(cleaning_punct)

### Codificación de las etiquetas

Como las etiquetas son categóricas (0 = negativo, 1 = positivo), necesitamos convertirlas a vectores one-hot de tipo float que nuestro modelo pueda entender. Para lograr esta tarea, implementaremos el método to_categorical de Keras.

In [None]:
labels = np.array(clean_data['Label'])


In [None]:
labels = tf.keras.utils.to_categorical(labels, 2, dtype="float32")

In [None]:
len(labels)

In [None]:
print(labels[102:110])

### Secuenciado de datos y separación en matrices

Implementaremos el tokenizador de Keras así como su método pad_sequences para transformar nuestros textos en secuencias de enteros de igual longitud (una matriz 2D de muestras × longitud máxima); de lo contrario nuestras redes neuronales no podrán ser entrenadas con ellos.

The `Tokenizer` class vectorizes a text corpus by turning each text into either a sequence of integers (each integer being the index of a token in a dictionary) or a vector; here we use the integer-sequence form.

In [None]:
#Se ocupa para cargar pad_sequences
!pip install Keras-Preprocessing

In [None]:
from keras.models import Sequential
from keras import layers
#from keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import Tokenizer
#from keras.preprocessing.sequence import pad_sequences
from keras_preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint


In [None]:

# Fit the vocabulary on the cleaned corpus, with no cap on its size.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(clean_data['Text'])
word_index = tokenizer.word_index
# Vocabulary size used as the embedding input dimension (entries + 1).
total_unique_words = len(word_index) + 1
max_words = total_unique_words

# Encode every text as a list of word indices and pad all of them to the
# length of the longest one.
sequences = tokenizer.texts_to_sequences(clean_data['Text'])
max_seq_length = max(map(len, sequences))
max_len = max_seq_length
texts = pad_sequences(sequences, maxlen=max_len)

print(texts[14])

In [None]:
print("Unique words: ",total_unique_words)
print("Max length: ",max_seq_length)

In [None]:
print(texts)
print(labels)

### We divide the dataset

We will use 80% for training and validation and the remaining 20% will be used to test the models.

In [None]:
# Hold out 20% of the samples as the test set; the remaining 80% is kept for
# training and validation. random_state=0 keeps the split reproducible.
X_train_original, X_test, y_train_original, y_test = train_test_split(texts,labels, test_size=0.20, random_state=0)
print(len(X_train_original),len(X_test),len(y_train_original),len(y_test))

We divided the 80% selected for training into training and validation data sets (80% and 20%).

In [None]:
# Split the 80% kept for training into training (80%) and validation (20%) sets.
X_train, X_val, y_train, y_val = train_test_split(X_train_original,y_train_original, test_size=0.20, random_state=0)
print (len(X_train),len(X_val),len(y_train),len(y_val))

## Construcción del modelo

### Perceptrón Multicapa

---



In [None]:
# Multilayer perceptron over learned word embeddings.
model = Sequential()
model.add(layers.Embedding(max_words, 128, input_length=max_len))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
# Two output units, one per sentiment class (the labels are one-hot encoded).
model.add(layers.Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# val_accuracy improves when it increases, so early stopping must track its
# maximum. The callback is defined but not passed to fit().
earlyStopping = keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=2, mode='max')
# Save the full model every time val_accuracy improves.
mcp = ModelCheckpoint("bestmodel_mlp.h5", monitor="val_accuracy", save_best_only=True, save_weights_only=False, verbose=1)
# Validate on the validation split. The test set is reserved for the final
# evaluation, so the checkpoint selection does not leak into the reported
# metrics.
model_history = model.fit(X_train, y_train, verbose=1,
                          validation_data=(X_val, y_val), epochs=50,
                          callbacks=[mcp], batch_size=64, shuffle=True)
# To enable early stopping, pass callbacks=[earlyStopping, mcp] instead.

### RNN

In [None]:
# Simple recurrent baseline: embedding -> SimpleRNN -> softmax over 2 classes.
model0 = Sequential([
    layers.Embedding(input_dim=max_words, output_dim=64, input_length=max_len),
    layers.SimpleRNN(64),
    layers.Dense(2, activation='softmax'),
])



In [None]:
model0.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Save the full model whenever val_accuracy improves, so the best epoch is
# not lost during training.
checkpoint0 = ModelCheckpoint("best_model0.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto', save_freq='epoch', save_weights_only=False)
# Validate on the validation split. The test set is reserved for the final
# evaluation, so the checkpoint selection does not leak into the reported
# metrics.
history = model0.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[checkpoint0])

### LSTM Sencilla

In [None]:
# Single-layer LSTM: embedding -> LSTM (input dropout 0.5) -> softmax over 2 classes.
model1 = Sequential()
model1.add(layers.Embedding(input_dim=max_words, input_length=max_len, output_dim=64))
model1.add(layers.LSTM(64, dropout=0.5))
model1.add(layers.Dense(2, activation='softmax'))

model1.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Save the full model whenever val_accuracy improves, so the best epoch is
# not lost during training.
checkpoint1 = ModelCheckpoint("bestmodel_lstmsimple.h5", monitor='val_accuracy', verbose=2, save_best_only=True, mode='auto', save_freq='epoch', save_weights_only=False)
# Validate on the validation split. The test set is reserved for the final
# evaluation, so the checkpoint selection does not leak into the reported
# metrics.
history = model1.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[checkpoint1])

### LSTM Bidireccional

In [None]:
# Bidirectional LSTM: embedding -> BiLSTM (input dropout 0.5) -> softmax over 2 classes.
model2 = Sequential()
model2.add(layers.Embedding(input_dim=max_words, input_length=max_len, output_dim=64))
model2.add(layers.Bidirectional(layers.LSTM(64, dropout=0.5)))
model2.add(layers.Dense(2, activation='softmax'))
model2.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])
# Save the full model whenever val_accuracy improves, so the best epoch is
# not lost during training.
checkpoint2 = ModelCheckpoint("bestmodel_lstmbidir.hdf5", monitor='val_accuracy', verbose=1, save_best_only=True, mode='auto', save_freq='epoch', save_weights_only=False)
# Validate on the validation split. The test set is reserved for the final
# evaluation, so the checkpoint selection does not leak into the reported
# metrics.
history = model2.fit(X_train, y_train, epochs=10, validation_data=(X_val, y_val), callbacks=[checkpoint2])

### Modelo Convolucional 1D

 Según la teoría tiende a sobreajustarse extremadamente rápido en conjuntos de datos pequeños.

In [None]:
#from keras import regularizers
#model3 = Sequential()
#model3.add(layers.Embedding(max_words, 40, input_length=max_len))
#model3.add(layers.Conv1D(20, 6, activation='relu',kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),bias_regularizer=regularizers.l2(2e-3)))
#model3.add(layers.MaxPooling1D(5))
#model3.add(layers.Conv1D(20, 6, activation='relu',kernel_regularizer=regularizers.l1_l2(l1=2e-3, l2=2e-3),bias_regularizer=regularizers.l2(2e-3)))
#model3.add(layers.GlobalMaxPooling1D())
#model3.add(layers.Dense(2,activation='softmax'))
#model3.compile(optimizer='rmsprop',loss='categorical_crossentropy',metrics=['acc'])
#metric = 'val_acc'
#checkpoint3 = ModelCheckpoint("best_model3.hdf5", monitor=metric, verbose=2,save_best_only=True, mode='auto', period=1,save_weights_only=False)
#checkpoint3 = ModelCheckpoint("bestmodel_cnn1d.hdf5", monitor=metric, verbose=2,save_best_only=True, mode='auto',save_weights_only=False)

#history = model3.fit(X_train, y_train, epochs=100,validation_data=(X_test, y_test),callbacks=[checkpoint3])

#metric = 'val_accuracy'
#ModelCheckpoint(filepath=r"C:\Users\reda.elhail\Desktop\checkpoints\{}".format(Name), monitor=metric,
                    #verbose=2, save_best_only=True, mode='max')]

### Rendimiento


Probamos el modelo con el conjunto de datos de prueba.

In [None]:
from sklearn.metrics import recall_score
from sklearn import metrics

#### Multilayer Perceptron

In [None]:
# Load the best checkpoint saved during training. The file name must match
# the one given to ModelCheckpoint in the MLP training cell, which is
# "bestmodel_mlp.h5" (not ".hdf5").
best_model_mlp = keras.models.load_model("bestmodel_mlp.h5")
# Predict class probabilities and round them to 0/1 so they can be compared
# with the one-hot encoded y_test.
predicts_perceptron = best_model_mlp.predict(X_test)
predicts_mlp = np.around(predicts_perceptron, decimals=0)
# Macro-averaged metrics on the test set.
mlp_recall = metrics.recall_score(y_test, predicts_mlp, average='macro')
mlp_f1 = metrics.f1_score(y_test, predicts_mlp, average='macro')
mlp_precision = metrics.precision_score(y_test, predicts_mlp, average='macro')
mlp_accuracy = metrics.accuracy_score(y_test, predicts_mlp)
# Print the metric values. The former "Rasgo evaluado" line was removed
# because it printed `rasgo`, a name this notebook never defines.
print(f"MLP Accuracy: {mlp_accuracy}")
print(f"MLP F1: {mlp_f1}")
print(f"MLP Precision: {mlp_precision}")
print(f"MLP Recall: {mlp_recall}")

### RNN

In [None]:
# Reload the best RNN checkpoint written during training.
best_model_rnn = keras.models.load_model("best_model0.hdf5")
# Round the predicted probabilities to 0/1 so they can be compared with the
# one-hot encoded y_test.
predicts_rnn = np.around(best_model_rnn.predict(X_test), decimals=0)
# Macro-averaged metrics on the test set.
rnn_accuracy = metrics.accuracy_score(y_test, predicts_rnn)
rnn_f1 = metrics.f1_score(y_test, predicts_rnn, average='macro')
rnn_precision = metrics.precision_score(y_test, predicts_rnn, average='macro')
rnn_recall = metrics.recall_score(y_test, predicts_rnn, average='macro')
# Report the metrics, one per line.
print(
    f"RNN Accuracy: {rnn_accuracy}",
    f"RNN F1: {rnn_f1}",
    f"RNN Precision: {rnn_precision}",
    f"RNN Recall: {rnn_recall}",
    sep="\n",
)

#### LSTM Sencilla

In [None]:
# Reload the best LSTM checkpoint written during training.
best_model_lstm = keras.models.load_model("bestmodel_lstmsimple.h5")
# Keep the raw probabilities, then round them to 0/1 so they can be compared
# with the one-hot encoded y_test.
predicts_ls = best_model_lstm.predict(X_test)
predicts_lstm = np.around(predicts_ls, decimals=0)
# Macro-averaged metrics on the test set.
lstm_accuracy = metrics.accuracy_score(y_test, predicts_lstm)
lstm_f1 = metrics.f1_score(y_test, predicts_lstm, average='macro')
lstm_precision = metrics.precision_score(y_test, predicts_lstm, average='macro')
lstm_recall = metrics.recall_score(y_test, predicts_lstm, average='macro')
# Report the metrics, one per line.
print(
    f"LSTM Accuracy: {lstm_accuracy}",
    f"LSTM F1: {lstm_f1}",
    f"LSTM Precision: {lstm_precision}",
    f"LSTM Recall: {lstm_recall}",
    sep="\n",
)

#### LSTM Bidireccional

In [None]:
# Reload the best bidirectional LSTM checkpoint written during training.
best_model_lstmbidir = keras.models.load_model("bestmodel_lstmbidir.hdf5")
# Keep the raw probabilities, then round them to 0/1 so they can be compared
# with the one-hot encoded y_test.
predicts_lsbidir = best_model_lstmbidir.predict(X_test)
predicts_lstmbidir = np.around(predicts_lsbidir, decimals=0)
# Macro-averaged metrics on the test set.
lstmbidir_accuracy = metrics.accuracy_score(y_test, predicts_lstmbidir)
lstmbidir_f1 = metrics.f1_score(y_test, predicts_lstmbidir, average='macro')
lstmbidir_precision = metrics.precision_score(y_test, predicts_lstmbidir, average='macro')
lstmbidir_recall = metrics.recall_score(y_test, predicts_lstmbidir, average='macro')
# Report the metrics, one per line.
print(
    f"LSTM Bidirectional Accuracy: {lstmbidir_accuracy}",
    f"LSTM Bidirectional F1: {lstmbidir_f1}",
    f"LSTM Bidirectional Precision: {lstmbidir_precision}",
    f"LSTM Bidirectional Recall: {lstmbidir_recall}",
    sep="\n",
)

#### CNN 1D

In [None]:
#Cargamos el mejor modelo almacenado
#best_model_cnn = keras.models.load_model("bestmodel_cnn1d.hdf5")
#Calculamos las predicciones
#predicts_cnn = best_model_cnn.predict(X_test)
#predicts_cnn1d = np.around(predicts_cnn, decimals=0)
#Realizamos la evaluación y obtenemos las métricas
#cnn_recall = metrics.recall_score(y_test, predicts_cnn1d, average='macro')
#cnn_f1 = metrics.f1_score(y_test, predicts_cnn1d, average='macro')
#cnn_precision = metrics.precision_score(y_test, predicts_cnn1d, average='macro')
#cnn_accuracy = metrics.accuracy_score(y_test, predicts_cnn1d)
#Imprimimos los valores de las métricas
#print(f"Rasgo evaluado: {rasgo}")
#print(f"CNN Accuracy: {cnn_accuracy}")
#print(f"CNN F1: {cnn_f1}")
#print(f"CNN Precision: {cnn_precision}")
#print(f"CNN Recall: {cnn_recall}")

### Validación de los modelos

In [None]:
#Cargamos uno de los modelos
modelos = ['','','','','']
best_model = keras.models.load_model("bestmodel_lstmsimple.h5")

In [None]:
test_loss, test_acc = best_model.evaluate(X_test, y_test, verbose=2)
print('Model accuracy: ',test_acc)

In [None]:
predictions = best_model.predict(X_test)

In [None]:
print(predictions)
print(y_test)

###  Probando uno de los modelos

In [None]:
clases = ['Negative','Positive']

In [None]:
# Correct examples: 62 = No, 101 = Neutral; 10, 30, 45, 171, 212 = Yes;
# Wrong examples: 60
# NOTE(review): the labels above (No / Neutral / Yes) do not match this
# notebook's Negative / Positive classes; presumably carried over from
# another experiment, confirm before relying on these indices.
# Pick one cleaned sample and show its text and its numeric label.
indice_prueba = 100
texto_prueba = clean_data["Text"][indice_prueba]
print(texto_prueba)
print(clean_data["Label"][indice_prueba])

In [None]:
# Encode the sample with the fitted tokenizer and pad it to the length used
# for training.
sequence = tokenizer.texts_to_sequences([texto_prueba])
test = pad_sequences(sequence, maxlen=max_len)
# Take the argmax of the raw probabilities. Rounding first can turn a
# near-tie such as [0.4999, 0.5] into [0, 0], which makes argmax return
# class 0 even though class 1 scored higher.
clases[best_model.predict(test).argmax(axis=1)[0]]

### Matriz de Confusión
Revisaremos la matriz de confusión, para entender mejor la capacidad de clasificación y generalización. Vamos a trazarla.

In [None]:
from sklearn.metrics import confusion_matrix
# Compare true and predicted class indices (first argument: true labels).
# argmax is applied to the raw probabilities; rounding them first can zero
# out a near-tie row and bias it towards class 0.
matrix = confusion_matrix(y_test.argmax(axis=1), predictions.argmax(axis=1))

In [None]:
# pyplot is imported here because its top-of-file import is commented out.
import matplotlib.pyplot as plt
import seaborn as sns

# The task has two classes, so the matrix is 2x2 and needs two labels; the
# previous three labels (Neutro / No / Sí) did not fit its shape.
class_names = ['Negative', 'Positive']
conf_matrix = pd.DataFrame(matrix, index=class_names, columns=class_names)
# Normalize each row so its cells sum to 1. DataFrame.div with axis=0
# divides row-wise; indexing a Series with [:, np.newaxis] is not supported
# by pandas 2.x.
conf_matrix = conf_matrix.astype('float').div(conf_matrix.sum(axis=1), axis=0)
plt.figure(figsize=(15, 15))
sns.heatmap(conf_matrix, annot=True, annot_kws={"size": 15})

### Transformers

Initial Setup

In [None]:
!pip install transformers==4.24.0
!pip install simpletransformers==0.63.11

In [None]:
# NOTE(review): this cell repeats the installs of the previous cell without
# the version pins (transformers==4.24.0, simpletransformers==0.63.11);
# confirm which set of versions is intended and keep only one of the cells.
!pip install transformers
!pip install simpletransformers

In [None]:
!pip install emoji
# install simpletransformers
#!pip install simpletransformers

# check installed version
#!pip freeze | grep simpletransformers
# simpletransformers==0.28.2

In [None]:
pip show simpletransformers

### Load pretrained models

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs



In [None]:
# `logging` is not imported anywhere else in the notebook, so import it here.
import logging

# Show INFO messages in general, but limit the "transformers" logger to
# warnings and above.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
clean_data2 = clean_data.copy()
clean_data2.rename(columns = {'Text':'text','Label':'labels'}, inplace = True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. random_state is fixed (same seed
# as the earlier splits) so the transformer results are reproducible.
train_df, test_df = train_test_split(clean_data2, test_size=0.20, random_state=0)

print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

In [None]:
# Training configuration: always re-tokenize the input, train in full
# precision for 3 epochs and allow overwriting the output directory.
train_args = {"reprocess_input_data": True,
              "fp16": False,
              "num_train_epochs": 3,
              "overwrite_output_dir": True}

# Build the args object from the same dict, so there is one source of truth.
# Previously `model_args` was created with num_train_epochs=1 and never used.
model_args = ClassificationArgs(**train_args)

# Create a ClassificationModel with two output labels.
# NOTE(review): 'bert-base-uncased' is presumably an English-only checkpoint
# while the corpus is Spanish; confirm whether a multilingual or Spanish
# checkpoint should be used instead.
model = ClassificationModel(
    'bert',
    'bert-base-uncased',
    num_labels=2,
    args=model_args
)

In [None]:
# Train the model
model.train_model(train_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test_df,f1=f1_score, acc=accuracy_score, rc=recall_score, pcs=precision_score)

In [None]:
result

### Model testing

In [None]:
from sklearn.metrics import recall_score
from sklearn import metrics

In [None]:
test_df.head()

In [None]:
test = test_df['text'].to_numpy().tolist()
y = test_df['labels'].to_numpy().tolist()
print(test[0:10])
print(y[0:10])
print(len(test))
print(len(y))

In [None]:
predictions_test = model.predict(test)

In [None]:
test_recall = metrics.recall_score(y, predictions_test[0], average='macro')
test_f1 = metrics.f1_score(y, predictions_test[0], average='macro')
test_precision = metrics.precision_score(y, predictions_test[0], average='macro')
test_accuracy = metrics.accuracy_score(y, predictions_test[0])

In [None]:
print("Metrics results:")
print(f"Accuracy: {test_accuracy}")
print(f"F1: {test_f1}")
print(f"Precision: {test_precision}")
print(f"Recall: {test_recall}")