# Aplicando Transformers en Reconocimiento de Personalidad.

**Investigadores**: <br>
  Dr. Ramón Zatarain Cabada<br>
  Dra. María Lucía Barrón Estrada<br>
  M.C. Víctor Manuel Bátiz Beltrán

# PersonText

En este cuaderno se muestra el uso de Transformers con el corpus PersonText.

**Referencias**:

- Bátiz Beltrán, V. M., Zatarain Cabada, R., & Barrón Estrada, M. L. (2024). Creation of a Corpus in Spanish for the recognition of personality traits. Computación y Sistemas, Volumen 28, No. 3, 2024. ISSN: 2007-9737. Artículo disponible en línea: https://cys.cic.ipn.mx/ojs/index.php/CyS/article/view/4619/3734

In [None]:
!pip install emoji

In [None]:
!pip install gensim

In [None]:
import re
#import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk import SnowballStemmer
import unicodedata
from collections import Counter
from wordcloud import WordCloud
from gensim.utils import simple_preprocess
import gensim
from sklearn.model_selection import train_test_split
import spacy
import pickle
import warnings
warnings.filterwarnings('ignore')
#import seaborn as sns
#from sklearn.metrics import confusion_matrix
#import matplotlib.pyplot as plt
import tensorflow as tf
import keras
import numpy as np
import pandas as pd
import emoji
import keras
from keras import backend as K
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
print('Listo')

# Procedimiento

## 1. Carga del conjunto de Datos

## Descarga del Corpus

In [None]:
def corpus_download(path, url):
  # Download the resource at `url` and store it in the local file `path`.
  #
  # Runs `wget` through the IPython `!` shell escape; `{url}` and `{path}` are
  # substituted from the function arguments, and `-O` names the output file.
  # NOTE(review): `--no-check-certificate` disables TLS certificate
  # verification for the download — confirm it is still required for the host.
  !wget --no-check-certificate \
     {url} \
     -O {path}

In [None]:
# Download every corpus variant into its local CSV file (local name, remote URL).
for local_csv, remote_url in (
    ("persontext.csv", "https://catalabs.mx/datasets/persontext/corpus_persontext_v1.csv"),
    ("persontextV2.csv", "https://catalabs.mx/datasets/persontext/corpus_persontext_v2.csv"),
    ("mypersonality_persontext.csv", "https://catalabs.mx/datasets/persontext/corpus_mypersonality_esp.csv"),
):
    corpus_download(local_csv, remote_url)

## Selección del Corpus a utilizar.

In [None]:
# Maps each selectable corpus name to the local CSV file it is loaded from.
corpus_dict = {
    'PersonText' : 'persontext.csv',
    'PersonTextV2' : 'persontextV2.csv',
    'MyPersonalityPT' : 'mypersonality_persontext.csv'
}

# Name of the corpus to load; must be a key of `corpus_dict`. The trailing
# `#@param` annotation lists the same three choices (presumably rendered as a
# notebook form field — keep it on this line).
corpus_seleccionado = 'PersonText' #@param ['PersonText','PersonTextV2','MyPersonalityPT']
# Read the CSV of the selected corpus into a DataFrame.
corpus = pd.read_csv(corpus_dict[corpus_seleccionado])


## Exploración del Corpus

In [None]:
corpus.tail()

Dejamos solo las columnas con los datos que vamos a trabajar.

In [None]:
# Candidate label columns (presumably one per personality trait); the numbers
# below are the list indices used to pick one.
#                    0                     1                       2                      3                       4
rasgos = ['presenta_apertura','presenta_responsabilidad','presenta_sociabilidad','presenta_amabilidad','presenta_neuroticismo']

#rasgos = ['presenta_apertura', 'presenta_neuroticismo']

rasgo = rasgos[0]  # Only one trait is used as the label column.

# Keep only the text column and the selected label column. `.copy()` makes the
# result an independent DataFrame, so assigning to its columns afterwards never
# targets a selection of `corpus`.
corpus_limpio = corpus[['texto', rasgo]].copy()


Visualizamos el Corpus resultante.

In [None]:
corpus_limpio.tail()

Reemplazamos las etiquetas "Sí" y "No" por "1" y "0" respectivamente.

In [None]:
# Encode the textual labels as integers: 'Sí' and 'y' become 1, 'No' and 'n' become 0.
replacement_mapping = {
    'Sí': 1,
    'No': 0,
    'y': 1,
    'n': 0,
}
corpus_limpio[rasgo] = corpus_limpio[rasgo].replace(to_replace=replacement_mapping)

In [None]:
corpus_limpio.tail()

## 2. Limpieza de los Datos

In [None]:
# Punctuation marks that are padded with a space on each side so they end up
# as stand-alone tokens. The translation table is built once and lets
# `str.translate` pad all of them in a single pass over the text.
_SEPARATED_CHARS = ":,.!¡“'”()?¿"
_SEPARATION_TABLE = str.maketrans({char: f" {char} " for char in _SEPARATED_CHARS})


def process_text(sentence, norm_user=True, norm_hashtag=True, separate_characters=True):
    """Normalize one raw text sample.

    Steps, in order: convert to ``str`` and lowercase; optionally replace
    user mentions and URLs with placeholders; optionally pad punctuation
    with spaces; collapse whitespace runs to a single space; convert emoji
    with ``emoji.demojize``.

    Args:
        sentence: Value to normalize; it is passed through ``str()`` first,
            so non-string values are accepted.
        norm_user: When truthy, every ``@name`` mention becomes ``@usuario``.
        norm_hashtag: When truthy, every token starting with ``http`` or
            ``www`` becomes ``url``. Despite its name, this flag controls URL
            replacement; hashtags are not touched.
        separate_characters: When truthy, each of ``: , . ! ¡ “ ' ” ( ) ? ¿``
            is surrounded by spaces.

    Returns:
        The normalized string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    sentence = str(sentence).lower()

    # Normalize user mentions and URLs.
    if norm_user:
        sentence = re.sub(r'@\w+', '@usuario', sentence)
    if norm_hashtag:
        # `http\S+` already covers `https...`, so no separate alternative is needed.
        sentence = re.sub(r"http\S+|www\S+", 'url', sentence)

    # Separate punctuation from the neighbouring words.
    if separate_characters:
        sentence = sentence.translate(_SEPARATION_TABLE)

    # Collapse runs of whitespace (including those added by the padding above).
    sentence = re.sub(r'\s+', ' ', sentence)

    # Emoji conversion runs after the punctuation padding, as in the original order.
    return emoji.demojize(sentence)

In [None]:
clean_data = corpus_limpio.copy()
clean_data['texto'] = clean_data['texto'].apply(process_text)

### Eliminando las palabras que no aportan valor (stopwords)

In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')
print(stopwords.words('spanish'))

In [None]:
stop_words = set(stopwords.words('spanish'))

In [None]:
def remove_stopwords(text):
  """Return `text` without the tokens contained in `stop_words`.

  The text is split with `word_tokenize`; tokens that are members of
  `stop_words` are dropped and the rest are joined with single spaces.
  """
  return " ".join(
      token for token in word_tokenize(text) if token not in stop_words
  )

In [None]:
remove_stopwords('el que tiene tienda la debe atender')

In [None]:
clean_data['texto'] = clean_data['texto'].apply(remove_stopwords)

### Lematización

In [None]:
#https://spacy.io/models/es
# We'll use spaCy for lemmatization
!python -m spacy download es_core_news_sm

In [None]:
import spacy
import es_core_news_sm
nlp = es_core_news_sm.load()

In [None]:
def lematize(text):
    """Return `text` with every token replaced by its lemma.

    The text is processed with the `nlp` pipeline and the `lemma_` of each
    resulting token is joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

In [None]:
lematize('yo soy muy feliz con mi familia')

In [None]:
# Lemmatize the text column. The column is named 'texto' (as in the other
# preprocessing steps); 'Text' does not exist in `clean_data` and raised KeyError.
clean_data['texto'] = clean_data['texto'].apply(lematize)

### Retirando elementos de puntuación y acentos (Punctuation Cleaning)

In [None]:
def cleaning_punct(text):
  """Return `text` re-assembled from the tokens of gensim's `simple_preprocess`.

  The value is converted to `str`, tokenized with `deacc=True` (which asks
  gensim to strip accent marks), and the tokens are joined with single spaces.
  NOTE(review): `simple_preprocess` presumably also lowercases and drops
  punctuation and tokens outside its default length limits — confirm against
  the gensim documentation.
  """
  return " ".join(gensim.utils.simple_preprocess(str(text), deacc=True))

In [None]:
cleaning_punct('mi méxico querido qué fantástico')

In [None]:
clean_data['texto'] = clean_data['texto'].apply(cleaning_punct)

## 3. Inicialización de Modelo y utilidades

Selección de Modelos



In [None]:
# Maps each selectable pretrained checkpoint to a short label (presumably used
# to name results; the `_ml` suffix marks the multilingual checkpoints).
modelsNames = {
    'bert-base-multilingual-cased' : 'BERT_b_C_ml',
    'bert-base-multilingual-uncased': 'BERT_b_U_ml',
    'distilbert-base-uncased' : 'distilBERT_b_U',
    'roberta-base' : 'roBERTa_b',
    'dccuchile/bert-base-spanish-wwm-cased' : 'BETO_b_C'
}

# Checkpoint to fine-tune; must be a key of `modelsNames`. Keep the `#@param`
# annotation on this line.
modelo_seleccionado = "dccuchile/bert-base-spanish-wwm-cased" #@param ['bert-base-multilingual-cased', 'bert-base-multilingual-uncased','distilbert-base-uncased','roberta-base','dccuchile/bert-base-spanish-wwm-cased']

Definimos la conversión de etiquetas.

In [None]:
id2label = {0: "No", 1: "Si"}
label2id = {"No": 0, "Si": 1}

## Transformers

In [None]:
!pip install transformers==4.24.0
!pip install simpletransformers==0.63.11

### Cargando los modelos preentrenados

In [None]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
import logging # Import the logging module

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
clean_data2 = clean_data.copy()
clean_data2.rename(columns = {'texto':'text',rasgo:'labels'}, inplace = True)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for evaluation. The fixed `random_state` makes
# the split, and therefore the reported metrics, reproducible across runs.
train_df, test_df = train_test_split(clean_data2, test_size=0.20, random_state=42)

print('train shape: ',train_df.shape)
print('test shape: ',test_df.shape)

In [None]:
# Optional model configuration
model_args = ClassificationArgs(num_train_epochs=1)

train_args ={"reprocess_input_data": True,
             "fp16":False,
             "num_train_epochs": 1, # Usaremos una época por cuestiones de tiempo
             "overwrite_output_dir": True}

# Create a ClassificationModel
model = ClassificationModel(
    'bert',
    modelo_seleccionado,
    #'bert-base-uncased',
    num_labels=2,
    args=train_args
)

## Entrenamos el modelo

In [None]:
# Train the model
model.train_model(train_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [None]:
# Evaluate the model
result, model_outputs, wrong_predictions = model.eval_model(test_df,f1=f1_score, acc=accuracy_score, rc=recall_score, pcs=precision_score)

In [None]:
print(f" Exactitud (Accuracy): {result['acc']}")
print(f" F1-Score: {result['f1']}")
print(f" Recall: {result['rc']}")
print(f" Precisión: {result['pcs']}")

### Probando el modelo

In [None]:
from sklearn.metrics import recall_score
from sklearn import metrics

In [None]:
# Class names indexed by numeric label: index 0 is 'No', index 1 is 'Sí'.
clases = ['No','Sí']

In [None]:
#corpus_limpio.texto[0]
corpus_limpio.head()

In [None]:
# Vamos a usar un diccionario para crear el dataset de prueba
# Frases:
datos = {
    'text': ['Sigo sin entender porque la música ahorita actualmente es tan explicita por qué todo está de la manera de fácilmente lo que se dedica alguien o algo del tipo de música todos podemos hablar de él pero okey pero ya entendimos el pedo pero la necesidad de hacerlo es otro rollo'],
    'labels': [0]
}

# Crear un DataFrame a partir del diccionario
df = pd.DataFrame(datos)

In [None]:
df.head()

In [None]:
test = df['text'].to_numpy().tolist()
y = df['labels'].to_numpy().tolist()
print(test[0])
print(y[0])
print(len(test))
print(len(y))

In [None]:
predictions_test = model.predict(test)

In [None]:
#Accediendo la clase elegida por el modelo
print(clases[predictions_test[0][0]])

In [None]:
# Derive the class of the first sample from the per-class scores returned as
# the second element of `model.predict`. The sample is indexed explicitly so
# the argmax stays within that sample's scores; an argmax over the whole
# (samples x classes) array is only a valid class index for a single sample.
print(clases[np.argmax(predictions_test[1][0])])

In [None]:
test_recall = metrics.recall_score(y, predictions_test[0], average='macro')
test_f1 = metrics.f1_score(y, predictions_test[0], average='macro')
test_precision = metrics.precision_score(y, predictions_test[0], average='macro')
test_accuracy = metrics.accuracy_score(y, predictions_test[0])

In [None]:
print("Metrics results:")
print(f"Accuracy: {test_accuracy}")
print(f"F1: {test_f1}")
print(f"Precision: {test_precision}")
print(f"Recall: {test_recall}")