# Emotion Regression: How angry are you?

In [1]:
!pip install emoji
!pip install simpletransformers
!pip install sentencepiece



## Data Preparation

In [2]:
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Define data paths
train_path = "/content/drive/MyDrive/NLP/ex5/EI-reg-En-anger-train.txt"
test_path = "/content/drive/MyDrive/NLP/ex5/2018-EI-reg-En-anger-test-gold.txt"

# Load the training and test datasets
train_dataset = pd.read_csv(train_path, sep="\t")
test_dataset = pd.read_csv(test_path, sep="\t")

# Check unique values in the 'Affect Dimension' column for training data
unique_values_train = train_dataset['Affect Dimension'].unique()
print("Unique values in 'Affect Dimension' for training data:", unique_values_train)

Unique values in 'Affect Dimension' for training data: ['anger']


In [5]:
# Drop unnecessary columns for both datasets
columns_to_drop = ['ID', 'Affect Dimension']
train_dataset = train_dataset.drop(columns=columns_to_drop)
test_dataset = test_dataset.drop(columns=columns_to_drop)

# Display datasets
print("Training Dataset:")
print(train_dataset.head())

print("\nTest Dataset:")
print(test_dataset.head())

Training Dataset:
                                               Tweet  Intensity Score
0  @xandraaa5 @amayaallyn6 shut up hashtags are c...            0.562
1  it makes me so fucking irate jesus. nobody is ...            0.750
2         Lol Adam the Bull with his fake outrage...            0.417
3  @THATSSHAWTYLO passed away early this morning ...            0.354
4  @Kristiann1125 lol wow i was gonna say really?...            0.438

Test Dataset:
                                               Tweet  Intensity Score
0  @PageShhh1 I know you mean well but I'm offend...            0.734
1  Let go of resentment, it will hold you back, d...            0.422
2  No, I'm not 'depressed because of the weather,...            0.663
3  #AmarnathTerrorAttack  Muslims are killing eve...            0.703
4  Prepare to suffer the sting of Ghost Rider's p...            0.719


## Data Preprocessing

In [6]:
import re
import emoji
from textblob import TextBlob
from sklearn.model_selection import train_test_split

In [7]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.compile('<.*?>').sub('', text)  # Remove HTML tags
    text = re.compile('https?://\S+|www\.\S+').sub('', text)  # Remove URLs
    text = re.compile('@\S+').sub('', text)  # Remove user IDs
    return text

# Function to preprocess the dataset
def preprocess(df):
    df['Tweet'] = df['Tweet'].apply(preprocess_text)
    # df['Tweet'] = df['Tweet'].apply(TextBlob)
    df['Tweet'] = df['Tweet'].apply(emoji.demojize)
    return df

# Preprocess the training and test datasets
train_dataset = preprocess(train_dataset)
test_dataset = preprocess(test_dataset)

# Display the preprocessed datasets
print("Training Dataset:")
print(train_dataset.head())

print("\nTest Dataset:")
print(test_dataset.head())

Training Dataset:
                                               Tweet  Intensity Score
0                shut up hashtags are cool #offended            0.562
1  it makes me so fucking irate jesus. nobody is ...            0.750
2         lol adam the bull with his fake outrage...            0.417
3   passed away early this morning in a fast and ...            0.354
4   lol wow i was gonna say really?! haha have yo...            0.438

Test Dataset:
                                               Tweet  Intensity Score
0      i know you mean well but i'm offended. prick.            0.734
1  let go of resentment, it will hold you back, d...            0.422
2  no, i'm not 'depressed because of the weather,...            0.663
3  #amarnathterrorattack  muslims are killing eve...            0.703
4  prepare to suffer the sting of ghost rider's p...            0.719


In [8]:
# Split dataset into training and validation sets
trainset, valset = train_test_split(train_dataset, test_size=0.2, shuffle=True, random_state=42)

## Regression

### 1. RoBERTa Model

In [9]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import logging

In [10]:
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# RoBERTa Regression Model
model_args_roberta = ClassificationArgs(num_train_epochs=5, regression=True, overwrite_output_dir=True)

model_roberta = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args_roberta,
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Create a ClassificationModel
model_roberta = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args_roberta
)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# Train the model
model_roberta.train_model(trainset)



  0%|          | 0/1360 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/170 [00:00<?, ?it/s]



Running Epoch 1 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

(850, 0.023643463119891857)

In [13]:
# Evaluate the model
result, model_outputs, wrong_predictions = model_roberta.eval_model(valset)

print(result)



  0%|          | 0/341 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/43 [00:00<?, ?it/s]

{'eval_loss': 0.011800654773968597}


### 2. Electra Model

In [14]:
# Electra Regression Model
model_args_electra = ClassificationArgs(num_train_epochs=5, regression=True, overwrite_output_dir=True)

model_electra = ClassificationModel(
    "electra",
    "google/electra-base-discriminator",
    num_labels=1,
    args=model_args_electra,
)

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Train the model
model_electra.train_model(trainset)

  0%|          | 0/1360 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

(850, 0.015929476654924016)

In [16]:
# Evaluate the model
result, model_outputs, wrong_predictions = model_electra.eval_model(valset)

print(result)

  0%|          | 0/341 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/43 [00:00<?, ?it/s]

{'eval_loss': 0.012541108797196039}


### 3. DistilBERT Model

In [17]:
# DistilBERT Regression Model
model_args_distilbert = ClassificationArgs(num_train_epochs=5, regression=True, overwrite_output_dir=True)

model_distilbert = ClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    num_labels=1,
    args=model_args_distilbert,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Train the model
model_distilbert.train_model(trainset)

  0%|          | 0/1360 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

(850, 0.014093427623156457)

In [19]:
# Evaluate the model
result, model_outputs, wrong_predictions = model_distilbert.eval_model(valset)

print(result)

  0%|          | 0/341 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/43 [00:00<?, ?it/s]

{'eval_loss': 0.011318235777231843}


### 4. XLNet Model


In [20]:
# XLNet Regression Model
model_args_xlnet = ClassificationArgs(num_train_epochs=5, regression=True, overwrite_output_dir=True)

model_xlnet = ClassificationModel(
    "xlnet",
    "xlnet-base-cased",
    num_labels=1,
    args=model_args_xlnet,
)

Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary.summary.bias', 'sequence_summary.summary.weight', 'logits_proj.bias', 'logits_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# Train the model
model_xlnet.train_model(trainset)

  0%|          | 0/1360 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

(850, 0.034493109916391616)

In [22]:
# Evaluate the model
result, model_outputs, wrong_predictions = model_xlnet.eval_model(valset)

print(result)

  0%|          | 0/341 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/43 [00:00<?, ?it/s]

{'eval_loss': 0.01332802236415879}


### 5. ALBERT Model

In [23]:
# ALBERT Regression Model
model_args_albert = ClassificationArgs(num_train_epochs=5, regression=True, overwrite_output_dir=True)

model_albert = ClassificationModel(
    "albert",
    "albert-base-v2",
    num_labels=1,
    args=model_args_albert,
)

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Train the model
model_albert.train_model(trainset)

  0%|          | 0/1360 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/170 [00:00<?, ?it/s]

(850, 0.028298963331206538)

In [25]:
# Evaluate the model
result, model_outputs, wrong_predictions = model_albert.eval_model(valset)

print(result)

  0%|          | 0/341 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/43 [00:00<?, ?it/s]

{'eval_loss': 0.011653462554826292}


## Testing with the Best Model

By comparing the Validation Losses of the above models, we find the best model for this task is the DistilBERT Model.

In [35]:
best_model = model_distilbert

In [36]:
from scipy import stats

In [37]:
# Get the predictions and true labels
predictions, _ = best_model.predict(list(test_dataset['Tweet']))
true_labels = list(test_dataset['Intensity Score'])

# Calculate Pearson correlation
pearson_corr, _ = stats.pearsonr(predictions, true_labels)

print(f"Pearson Correlation: {pearson_corr}")

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

Pearson Correlation: 0.7811776624711652
