In [95]:
# Install deps
# !python -m spacy download en_core_web_md
# !pip install torch torchvision
# !pip install transformers
# !pip3 install nltk emoji==0.6.0

In [1]:
#dataset
import pandas as pd

# utils
from sklearn.utils import shuffle

#visualize
import matplotlib.pyplot as plt
%matplotlib inline

# modeling
import spacy
from transformers import AutoTokenizer, TFAutoModel
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np

In [2]:
def reader_df(topic):
  path_train = "cleaned_df/stance_" + topic + "_train_cleaned.csv"
  path_test = "cleaned_df/stance_" + topic + "_test_cleaned.csv"
  path_val = "cleaned_df/stance_" + topic + "_validation_cleaned.csv"
  df_train = pd.read_csv(path_train)
  df_val = pd.read_csv(path_val)
  df_test = pd.read_csv(path_test)

  X_train = df_train.loc[:, 'text'].values
  y_train = df_train.loc[:, 'label'].values

  X_test = df_test.loc[:, 'text'].values
  y_test = df_test.loc[:, 'label'].values

  X_val = df_val.loc[:, 'text'].values
  y_val = df_val.loc[:, 'label'].values

  return X_train, X_test, y_train, y_test, X_val, y_val

We define the metrics we want to call:

In [3]:
def get_metrics(y_trues, y_preds, verbose=True):

  recall = recall_score(y_trues, y_preds, average='weighted') * 100
  precision = precision_score(y_trues, y_preds, average='weighted') * 100

  if verbose:
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')

  return recall, precision

In [4]:
# generate the batches

def get_batches(X_train, y_train, tokenizer, batch_size, max_length):
    """
    Objective: from features and labels yield a random batch of batch_size of (features, labels),
               each time we reached all data we shuffle again the (features, labels) 
               and we do it again (infinite loop)

    Inputs:
        - X_train, np.array: the texts (features)
        - y_train, np.array: the labels
        - tokenizer, transformers.tokenization_distilbert.DistilBertTokenizer: the tokenizer of the model
        - batch_size, int: the size of the batch we yield
        - max_length, int: the input shape of the data
    Outputs: (generator)
        - inputs, np.array : two arrays one with ids from the tokenizer, and the masks associated with the padding
        - targets, np.array: the label array of the associated inputs
    """
    X_train, y_train = shuffle(X_train, y_train, random_state=11)

    i, j = 0, 0

    while i > -1:

        if (len(X_train) - j*batch_size) < batch_size:
            j = 0
            X_train, y_train = shuffle(X_train, y_train, random_state=11)

        sentences = X_train[j*batch_size: (j+1) * batch_size]
        targets = y_train[j*batch_size: (j+1) * batch_size, :]
        j += 1

        input_ids, input_masks = [],[]

        # see if puting following before the loop may improve the training in time and RAM used
        inputs = tokenizer.batch_encode_plus(list(sentences), add_special_tokens=True, max_length=max_length, 
                                            padding='max_length',  return_attention_mask=True,
                                            return_token_type_ids=True, truncation=True,
                                             return_tensors="np")

        ids = np.asarray(inputs['input_ids'], dtype='int32')
        masks = np.asarray(inputs['attention_mask'], dtype='int32')

        #till here and use the same shuffle on ids, masks instead of X_train

        inputs = [ids, masks] 

        yield inputs, targets

### Stance: Feminist

In [7]:
X_train, X_test, y_train, y_test, X_val, y_val = reader_df("feminist")

+ **DistilBERT** model

In [14]:
checkpoint="distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModel.from_pretrained(checkpoint)

batch_size=64
max_length=64
rate = 0.5
num_labels = 3
input_ids_in = tf.keras.layers.Input(shape=(max_length,), name=f'input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(max_length,), name=f'masked_token', dtype='int32') 

embedding_layer = model(input_ids_in, attention_mask=input_masks_in)[0][:,0,:]
output_layer = tf.keras.layers.Dropout(rate, name='do_layer')(embedding_layer)
weight_initializer = tf.keras.initializers.GlorotNormal(seed=42)
output = tf.keras.layers.Dense(num_labels, activation='softmax')(output_layer)
bert_model = tf.keras.Model(inputs=[input_ids_in, input_masks_in], outputs = output)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading tf_model.h5:   0%|          | 0.00/363M [00:00<?, ?B/s]

Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_transform', 'activation_13', 'vocab_projector', 'vocab_layer_norm']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


In the next chunk of code:


`OneHotEncoder(handle_unknown='ignore')` creates a OneHotEncoder object with the parameter handle_unknown set to 'ignore'. This means that if the encoder encounters a new category in y_train during the encoding process, it will ignore it instead of raising an error.

`y_train.reshape(-1, 1)` reshapes the 1-dimensional y_train array into a 2-dimensional array with a single column. This is required by the fit_transform() method of the OneHotEncoder object.

`enc.fit_transform()` fits the encoder on the reshaped y_train data and transforms it into a one-hot encoded matrix. The resulting matrix is a sparse matrix representation of the one-hot encoded data.

`.toarray()` converts the sparse matrix into a dense numpy array.

`_y_train` is assigned the dense numpy array containing the one-hot encoded representation of y_train.

In [22]:
#train the model

enc = OneHotEncoder(handle_unknown='ignore')
_y_train = enc.fit_transform(y_train.reshape(-1, 1)).toarray()



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1b4a9d70ac0>

We train the model:

In [None]:
steps_per_epoch = int(len(X_train) / batch_size)

batches = get_batches(X_train, _y_train, tokenizer, batch_size, max_length)

bert_model.compile(optimizer=Adam(2e-5),
                   metrics=[tf.keras.metrics.Recall(), 
                   tf.keras.metrics.Precision()],
                   loss=tf.keras.losses.CategoricalCrossentropy())

bert_model.fit(batches, epochs=10, steps_per_epoch=steps_per_epoch)

And evaluate on the test set:

In [32]:
inputs = tokenizer.batch_encode_plus(list(X_test), 
                                     add_special_tokens=True, max_length=max_length, 
                                    padding='max_length',  return_attention_mask=True,
                                    return_token_type_ids=True, truncation=True)

input_test = [np.asarray(inputs['input_ids'], dtype='int32'), 
              np.asarray(inputs['attention_mask'], dtype='int32')]
y_preds = bert_model.predict(input_test)
y_preds = np.argmax(y_preds, axis=1)
r_bert, p_bert = get_metrics(y_test, y_preds, verbose=True)

Precision: 55.93
Recall: 56.69


We have pretty poor results. Why?

BERT is a pre-trained language model that has been shown to achieve state-of-the-art performance on a wide range of natural language processing tasks. However, BERT was pre-trained on a large corpus of general text, which may not be representative of the language used in tweets.

Tweets are known to have unique characteristics that can make them more challenging to classify compared to other types of text. For example, tweets are often shorter, contain a lot of noise (such as typos and slang), and can have complex grammatical structures that are not found in more formal writing.

Additionally, the use of hashtags, emojis, and other special characters in tweets can make it difficult for BERT to understand the context and sentiment of the tweet. Pre-processing and cleaning the tweets can help to mitigate some of these issues, but there is still a limit to the effectiveness of this approach.

To address these challenges, researchers have developed specialized versions of BERT for use with social media data. For example, BERTweet is a variant of BERT that has been trained specifically on tweets and has been shown to outperform generic BERT on tweet classification tasks.

+ **BERTweet**

Trying BERTweet. For that we should work with the raw data, since BERTweet includes its own tokenizer method.

In [10]:
def raw_reader(topic):
    path_X_train = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/train_text.txt"
    path_X_test = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/test_text.txt"
    path_X_val = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/val_text.txt"
    path_y_train = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/train_labels.txt"
    path_y_test = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/test_labels.txt"
    path_y_val = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/stance/" + topic + "/val_labels.txt"

    X_train = pd.read_table(path_X_train, header=None)
    X_test = pd.read_table(path_X_test, header=None)
    X_val = pd.read_table(path_X_val, header=None)
    y_train = pd.read_table(path_y_train, header=None)
    y_test = pd.read_table(path_y_test, header=None)
    y_val = pd.read_table(path_y_val, header=None)

    X_train = X_train.rename(columns={0: "text"})
    X_test = X_test.rename(columns={0: "text"})
    y_train = y_train.rename(columns={0: "label"})
    y_test = y_test.rename(columns={0: "label"})
    X_val = X_val.rename(columns={0: "text"})
    y_val = y_val.rename(columns={0: "label"})

    return X_train, X_test, X_val, y_train, y_test, y_val
    



In [13]:
# Import necessary libraries
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


# Load pre-trained BERTweet tokenizer and model
tokenizer = transformers.AutoTokenizer.from_pretrained("vinai/bertweet-base", normalization=True)
model = transformers.AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=3)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.decoder.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSe

In [17]:
# Define function to preprocess tweet data
def preprocess_tweet(tweet):
    # Tokenize tweet text
    tokens = tokenizer.encode(tweet, add_special_tokens=True)
    
    # Truncate or pad token IDs to fixed length
    max_len = 64
    pad_token_id = tokenizer.pad_token_id
    if len(tokens) > max_len:
        tokens = tokens[:max_len-1] + [tokens[-1]]
    else:
        tokens = tokens + [pad_token_id] * (max_len - len(tokens))
    
    # Create attention mask
    attention_mask = [1 if token != pad_token_id else 0 for token in tokens]
    
    return tokens, attention_mask

In [15]:

# Define function to train and evaluate model
def train_and_evaluate(X_train, X_test, y_train, y_test, model, num_epochs):
    # Define optimizer and learning rate scheduler
    optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
    total_steps = len(X_train) * num_epochs
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
    
    # Define loss function
    loss_fn = torch.nn.CrossEntropyLoss()
    
    # Convert data to DataLoader objects
    train_dataset = torch.utils.data.TensorDataset(torch.tensor([preprocess_tweet(tweet)[0] for tweet in X_train['text']]),
                                                   torch.tensor(y_train['label'].values))
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
    
    test_dataset = torch.utils.data.TensorDataset(torch.tensor([preprocess_tweet(tweet)[0] for tweet in X_test['text']]),
                                                  torch.tensor(y_test['label'].values))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32)

    test_preds = []
    test_targets = []
    
    # Train model
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        i=0
        for inputs, labels in train_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
            loss = loss_fn(outputs.logits, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        
        # Evaluate model on test set
        model.eval()
        predictions = []
        targets = []
        test_loss = 0
        with torch.no_grad():
            for inputs, labels in test_loader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                outputs = model(inputs, attention_mask=(inputs != tokenizer.pad_token_id))
                loss = loss_fn(outputs.logits, labels)
                test_loss += loss.item()
                predictions.extend
                predicted_labels = torch.argmax(outputs.logits, dim=1)
                predictions.extend(predicted_labels.cpu().numpy())
                targets.extend(labels.cpu().numpy())
            test_loss /= len(test_loader)

            # Store predicted and target values for computing metrics
            test_preds.extend(predictions)
            test_targets.extend(targets)
        
        # Print epoch-level metrics
        print('Epoch [{}/{}], Train Loss: {:.4f}, Test Loss: {:.4f}'.format(epoch+1, num_epochs, train_loss, test_loss))
        
        # Print evaluation metrics
        precision, recall, f1_score, _ = precision_recall_fscore_support(targets, predictions, average='weighted')
        print('Precision: {:.4f}, Recall: {:.4f}, F1 Score: {:.4f}'.format(precision, recall, f1_score))

    # Compute precision, recall, F1-score
    test_precision, test_recall, test_f1, _ = precision_recall_fscore_support(test_targets, test_preds, average='weighted')
    return test_precision, test_recall, test_f1

In [16]:
# Define device (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

Now we run the model on our different *stance* datasets:

+ Feminist

In [11]:
# Import raw datasets (stance feminist)

X_train, X_test, X_val, y_train, y_test, y_val = raw_reader("feminist")

In [19]:
test_precision, test_recall, test_f1 = train_and_evaluate(X_train=X_train, X_test=X_test, y_test=y_test, y_train=y_train, model=model, num_epochs=5)

Epoch [1/5], Train Loss: 0.8903, Test Loss: 0.7370
Precision: 0.7597, Recall: 0.7692, F1 Score: 0.6972


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/5], Train Loss: 0.5966, Test Loss: 0.4844
Precision: 0.7923, Recall: 0.8402, F1 Score: 0.8047


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/5], Train Loss: 0.3902, Test Loss: 0.4342
Precision: 0.8013, Recall: 0.8580, F1 Score: 0.8286


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [4/5], Train Loss: 0.3185, Test Loss: 0.5790
Precision: 0.7903, Recall: 0.8284, F1 Score: 0.8047


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [5/5], Train Loss: 0.2690, Test Loss: 0.5286
Precision: 0.7914, Recall: 0.8402, F1 Score: 0.8140


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
print('Precision Stance feminist:'+str(test_precision))
print('Recall Stance feminist:'+str(test_recall))
print('F1 Stance feminist:'+str(test_f1))


Precision Stance feminist:0.7701838731802633
Recall Stance feminist:0.8272189349112427
F1 Stance feminist:0.7963254340425556


+ Climate

In [18]:
# Import raw datasets (stance climate)

X_train, X_test, X_val, y_train, y_test, y_val = raw_reader("climate")

In [87]:
test_precision, test_recall, test_f1 = train_and_evaluate(X_train=X_train, X_test=X_test, y_test=y_test, y_train=y_train, model=model, num_epochs=5)

Epoch [1/5], Train Loss: 1.2727, Test Loss: 0.5541
Precision: 0.7566, Recall: 0.8047, F1 Score: 0.7626


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [2/5], Train Loss: 0.3957, Test Loss: 0.4439
Precision: 0.7891, Recall: 0.8462, F1 Score: 0.8162


  _warn_prf(average, modifier, msg_start, len(result))


Epoch [3/5], Train Loss: 0.1789, Test Loss: 0.5018
Precision: 0.8549, Recall: 0.8462, F1 Score: 0.8180
Epoch [4/5], Train Loss: 0.0621, Test Loss: 0.7016
Precision: 0.8306, Recall: 0.8343, F1 Score: 0.8322
Epoch [5/5], Train Loss: 0.0765, Test Loss: 1.1228
Precision: 0.8135, Recall: 0.7988, F1 Score: 0.8001


In [88]:
print('Precision Stance climate:'+str(test_precision))
print('Recall Stance climate:'+str(test_recall))
print('F1 Stance climate:'+str(test_f1))


Precision:0.8106807671546118
Recall:0.8260355029585799
F1:0.810752187522876


+ Abortion

In [21]:
# Import raw datasets (stance abortion)

X_train, X_test, X_val, y_train, y_test, y_val = raw_reader("abortion")

In [22]:
test_precision, test_recall, test_f1 = train_and_evaluate(X_train=X_train, X_test=X_test, y_test=y_test, y_train=y_train, model=model, num_epochs=5)

Epoch [1/5], Train Loss: 1.0179, Test Loss: 0.9576
Precision: 0.6602, Recall: 0.5679, F1 Score: 0.5754
Epoch [2/5], Train Loss: 0.6347, Test Loss: 0.9178
Precision: 0.6936, Recall: 0.5786, F1 Score: 0.5964
Epoch [3/5], Train Loss: 0.4556, Test Loss: 0.8351
Precision: 0.7298, Recall: 0.6214, F1 Score: 0.6394
Epoch [4/5], Train Loss: 0.2959, Test Loss: 1.0240
Precision: 0.7546, Recall: 0.6321, F1 Score: 0.6500
Epoch [5/5], Train Loss: 0.1664, Test Loss: 1.1508
Precision: 0.7496, Recall: 0.6786, F1 Score: 0.6927


In [23]:
print('Precision Stance abortion:'+str(test_precision))
print('Recall Stance abortion:'+str(test_recall))
print('F1 Stance abortion:'+str(test_f1))

Precision Stance abortion:0.7099296754389114
Recall Stance abortion:0.6157142857142858
F1 Stance abortion:0.6332141118265852


+ Atheism

In [24]:
# Import raw datasets (stance abortion)

X_train, X_test, X_val, y_train, y_test, y_val = raw_reader("atheism")

In [25]:
test_precision, test_recall, test_f1 = train_and_evaluate(X_train=X_train, X_test=X_test, y_test=y_test, y_train=y_train, model=model, num_epochs=5)

Epoch [1/5], Train Loss: 0.9734, Test Loss: 0.5306
Precision: 0.7753, Recall: 0.7909, F1 Score: 0.7724
Epoch [2/5], Train Loss: 0.3772, Test Loss: 0.5884
Precision: 0.8151, Recall: 0.7682, F1 Score: 0.7810
Epoch [3/5], Train Loss: 0.2031, Test Loss: 0.8595
Precision: 0.8117, Recall: 0.7273, F1 Score: 0.7462
Epoch [4/5], Train Loss: 0.1186, Test Loss: 0.9208
Precision: 0.8203, Recall: 0.7591, F1 Score: 0.7743
Epoch [5/5], Train Loss: 0.1334, Test Loss: 0.9100
Precision: 0.8241, Recall: 0.7455, F1 Score: 0.7650


In [26]:
print('Precision Stance atheism:'+str(test_precision))
print('Recall Stance atheism:'+str(test_recall))
print('F1 Stance atheism:'+str(test_f1))

Precision Stance atheism:0.7976145935853368
Recall Stance atheism:0.7581818181818182
F1 Stance atheism:0.7706104850836318
