In [1]:
#!pip install transformers

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from transformers import BertTokenizer, BertConfig, TFBertForSequenceClassification
import tensorflow as tf

from sklearn.model_selection import train_test_split

pd.options.display.max_colwidth = None
%matplotlib inline

In [2]:
# Name of the pretrained transformer checkpoint used throughout the notebook
nombre_modelo = "bert-base-multilingual-uncased"

In [3]:
# Emotion labels; the main loop reads one CSV file (data/df_<label>.csv) per entry.
sent_list = [
    'anger',
    'anticipation',
    'disgust',
    'fear',
    'joy',
    'love',
    'optimism',
    'pessimism',
    'sadness',
    'surprise',
    'trust',
]
sent_list

['anger',
 'anticipation',
 'disgust',
 'fear',
 'joy',
 'love',
 'optimism',
 'pessimism',
 'sadness',
 'surprise',
 'trust']

## Limpieza de datos

In [5]:
import re
import string

# Precompiled patterns, applied by clean_text() in this order.
pattern1 = re.compile(r'@\w+')  # user mentions (\w already includes "_")
# URLs: consume everything up to the next whitespace so that characters such
# as "-", "?", "=", "&" or "%" do not leave URL fragments behind in the text.
pattern2 = re.compile(r'https?://\S+')
pattern3 = re.compile(r'#\w+')  # hashtags
pattern4 = re.compile('[{}]+'.format(re.escape(string.punctuation)))  # runs of ASCII punctuation

def clean_text(text):
    """Normalise a tweet before tokenisation.

    Mentions, URLs and hashtags are replaced by the placeholder words
    'mención', 'URL' and 'hashtag'. Every run of ASCII punctuation is then
    replaced by a single space and surrounding whitespace is stripped.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (e.g. the NaN that pandas
        produces for an empty CSV cell) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text. It is '' when nothing but punctuation/whitespace
        was left, so callers can drop empty tweets with a plain `!= ''` test.
    """
    if not isinstance(text, str):
        # Missing values cannot be cleaned; report them as empty tweets.
        return ''

    text = pattern1.sub('mención', text)
    text = pattern2.sub('URL', text)
    text = pattern3.sub('hashtag', text)
    text = pattern4.sub(' ', text)

    # Punctuation-only tweets would otherwise survive as ' ' instead of ''.
    return text.strip()

## Fine-tuning con BERT

In [9]:
# Build the classification model: pretrained BERT with a 2-label output head.
config = BertConfig.from_pretrained(
    nombre_modelo,
    hidden_dropout_prob=0.1,
    num_labels=2,
)
model = TFBertForSequenceClassification.from_pretrained(
    nombre_modelo,
    config=config,
)

# Freeze the BERT encoder; the summary below then reports only the
# classifier parameters as trainable.
model.bert.trainable = False

# Sparse categorical loss/accuracy computed on the raw logits (from_logits=True).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

# Learning rates recommended for Adam: 5e-5, 3e-5, 2e-5.
# NOTE(review): those values are usually quoted for full fine-tuning; with the
# encoder frozen a larger rate may be worth trying - confirm experimentally.
learning_rate = 2e-5
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    epsilon=1e-08,
)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  167356416 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 167,357,954
Trainable params: 1,538
Non-trainable params: 167,356,416
_________________________________________________________________


In [12]:
# Main loop: train and evaluate one binary classifier per emotion in sent_list.
tokenizer = BertTokenizer.from_pretrained(nombre_modelo)

# Training settings are loop-invariant, so they are defined once up front.
batch_size = 8
epochs = 5


def build_classifier():
    """Return a newly initialised, compiled BERT classifier with a frozen encoder.

    Uses the same settings as the model-definition cell (2 labels, dropout 0.1,
    Adam with learning rate 2e-5, sparse categorical cross-entropy on logits).
    A new model is created for every emotion so that the classification head
    and the optimizer state learned for one emotion do not leak into the next.
    """
    clf_config = BertConfig.from_pretrained(nombre_modelo, hidden_dropout_prob=0.1, num_labels=2)
    classifier = TFBertForSequenceClassification.from_pretrained(nombre_modelo, config=clf_config)
    classifier.bert.trainable = False  # only the classification head is trained
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5, epsilon=1e-08),
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')],
    )
    return classifier


for data in sent_list:
    ### PREPARE THE DATASET ---------------------------------------------------------------
    # Drop missing tweets, clean the text and discard tweets that end up empty.
    df = pd.read_csv("data/df_"+data+".csv").dropna(subset=['Tweet'])
    df = df.assign(Tweet=df['Tweet'].apply(clean_text))
    # Compare the stripped text: a tweet made only of punctuation is cleaned
    # to whitespace, which a plain `!= ''` test would keep.
    df = df[df['Tweet'].str.strip() != '']

    # Target column for the emotion being trained (previously hard-coded to
    # `anger` for every emotion).
    # Assumes each CSV has a column named after its emotion holding integer
    # class ids (0/1) - TODO confirm against the data files.
    Y = df[data]

    # Train / test split
    X_train_tweets, X_test_tweets, Y_train, Y_test = train_test_split(df.Tweet, Y, test_size=0.3, random_state=42)
    print(X_train_tweets.shape, Y_train.shape)
    print(X_test_tweets.shape, Y_test.shape)

    ## PREPARE THE MODEL INPUTS -----------------------------------------------------------
    # The padded length is derived from the training set only; test sequences
    # are padded or truncated to that same length.
    train_encodings = tokenizer(X_train_tweets.to_list(), truncation=True, padding=True, return_tensors="tf")
    MAX_SEQUENCE_LENGTH = train_encodings['input_ids'].shape[1]
    test_encodings = tokenizer(X_test_tweets.to_list(), truncation=True, padding='max_length', max_length=MAX_SEQUENCE_LENGTH, return_tensors="tf")

    train_dataset = tf.data.Dataset.from_tensor_slices((
        dict(train_encodings),
        Y_train.to_numpy()
    ))
    test_dataset = tf.data.Dataset.from_tensor_slices((
        dict(test_encodings),
        Y_test.to_numpy()
    ))

    ## TRAINING ----------------------------------------------------------------------------
    print("Training Sentiment: "+data)
    # Start every emotion from a fresh classification head and optimizer.
    model = build_classifier()
    # The datasets are already batched, so `batch_size` is not passed to fit().
    history = model.fit(
        train_dataset.batch(batch_size),
        epochs=epochs,
        validation_data=test_dataset.batch(batch_size),
    )

    ## VISUALISE THE TRAINING ---------------------------------------------------------------
    # Training and validation accuracy per epoch
    fig, ax = plt.subplots()
    ax.plot(history.history['accuracy'])
    ax.plot(history.history['val_accuracy'])
    ax.set_title('Fine-tuning BERT:'+data)
    ax.set_ylabel('Accuracy')
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
    plt.show()
    plt.close(fig)  # one figure per emotion; release it once displayed
    
    

(2492,) (2492,)
(1069,) (1069,)
Training Sentiment: anger
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


NameError: name 'plt' is not defined