In [1]:
!pip install emoji
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting emoji
  Downloading emoji-2.2.0.tar.gz (240 kB)
[K     |████████████████████████████████| 240 kB 12.8 MB/s 
[?25hBuilding wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-2.2.0-py3-none-any.whl size=234926 sha256=ae5e51f1ba19795885b9c270e7d41334f712c85fdb7ef82c11a5f6c5be6fa08a
  Stored in directory: /root/.cache/pip/wheels/86/62/9e/a6b27a681abcde69970dbc0326ff51955f3beac72f15696984
Successfully built emoji
Installing collected packages: emoji
Successfully installed emoji-2.2.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[K     |████████████████████████████████| 250 kB 12.6 MB/s 
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)

In [2]:
import csv
import re
import warnings
import emoji
import logging
import string
import numpy as np
import pandas as pd
import scipy.stats as stats
warnings.filterwarnings("ignore") 
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Import the dataset

In [3]:
COLUMN_NAMES = ['ID', 'Tweet', 'Affect Dimension', 'Intensity Score']


def load_split(path):
    """Read one tab-separated data split into a DataFrame.

    The first row of the file is consumed as the header and its labels are
    replaced by COLUMN_NAMES. This discards the same row the previous
    "read everything, then drop row 0" approach discarded, but lets pandas
    infer column dtypes from the data rows only, and leaves a fresh
    0..n-1 index without a manual re-index.

    Parameters
    ----------
    path : str
        Path of the tab-separated file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with the columns listed in COLUMN_NAMES.
    """
    return pd.read_csv(path, sep="\t", header=0, names=COLUMN_NAMES)


data = load_split('EI-reg-En-anger-train.txt')
test = load_split('2018-EI-reg-En-anger-test-gold.txt')

# Preprocessing

In [4]:
# Built once instead of on every call. Raw strings keep the backslashes for
# the regex engine; the non-raw '\@' and '\S' literals are invalid string
# escapes and trigger a warning.
_AT_MENTION_RE = re.compile(r'@\w+')
_EMOJI_ALIAS_RE = re.compile(r':\S+?:')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocessing(text):
    """Normalise one tweet before it is fed to the model.

    Steps, in order:
      1. convert the input to ``str`` and lower-case it;
      2. delete ``@user`` mentions;
      3. delete every character in ``string.punctuation``;
      4. delete every digit character;
      5. turn emoji into ``:alias:`` text with ``emoji.demojize`` and replace
         each alias with a single space.

    Whitespace is not collapsed, so removed tokens can leave runs of spaces.

    Parameters
    ----------
    text : object
        The raw tweet; any value is accepted because it is passed through
        ``str`` first.

    Returns
    -------
    str
        The cleaned text.
    """
    text = str(text).lower()
    text = _AT_MENTION_RE.sub('', text)
    text = text.translate(_PUNCTUATION_TABLE)
    text = ''.join(ch for ch in text if not ch.isdigit())

    # ASCII ':' was stripped with the punctuation above, so the only
    # ':...:' spans left at this point are those demojize just inserted.
    text = emoji.demojize(text)
    return _EMOJI_ALIAS_RE.sub(' ', text)

In [5]:
# Pull the raw tweets and their intensity targets out of each split ...
data_text, data_i = data['Tweet'], data['Intensity Score']
test_text, test_i = test['Tweet'], test['Intensity Score']

# ... then run every tweet through the cleaning function.
data_t = data_text.apply(preprocessing)
test_t = test_text.apply(preprocessing)

In [6]:
# Training frame: cleaned tweet text in 'text', intensity as float64 in 'label'.
train = pd.DataFrame({
    'text': data_t,
    'label': np.asarray(data_i, dtype='float64'),
})
# train_set is a second name for the same DataFrame object, not a copy.
train_set = train

In [7]:
# Test frame: cleaned tweet text in 'text', intensity as float64 in 'label'.
test_set = pd.DataFrame({
    'text': test_t,
    'label': np.asarray(test_i, dtype='float64'),
})

# Train the model

In [8]:
# Show INFO-level progress in general, but only warnings from transformers.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

# Model configuration.
# NOTE(review): "[MASK]" / "[cls]" are BERT-style spellings, and the
# preprocessing step strips the bracket characters from every tweet, so
# these added tokens presumably never occur in the input - confirm whether
# they are needed at all.
model_args = ClassificationArgs(special_tokens_list=["[MASK]", "[cls]"])
model_args.num_train_epochs = 5
model_args.regression = True  # predict a continuous score, not a class
model_args.train_batch_size = 32
# Fix the random seed so repeated runs give comparable results.
model_args.manual_seed = 42
# Allow the cell to be re-run: without this, training refuses to start once
# the output directory already holds files from a previous run.
model_args.overwrite_output_dir = True

# Create a ClassificationModel; regression uses a single output (num_labels=1).
model = ClassificationModel(
    "roberta",
    "roberta-base",
    num_labels=1,
    args=model_args
)

# Train the model; the call's return value is displayed as the cell output.
model.train_model(train_set)


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

  0%|          | 0/1701 [00:00<?, ?it/s]

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/54 [00:00<?, ?it/s]

(270, 0.027200204911814244)

In [9]:
# Use a dedicated name for the list of texts: rebinding `test` here would
# replace the DataFrame loaded earlier and break any re-run of the cells
# that still read test['Tweet'] / test['Intensity Score'].
test_texts = test_set['text'].tolist()
predictions, raw_outputs = model.predict(test_texts)

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

In [10]:
# Round the raw regression outputs to three decimals.
pred = [round(value, 3) for value in raw_outputs]

# Build the true-vs-predicted table in one step instead of growing it
# row by row with df.loc[i].
df = pd.DataFrame({
    'true': test_set['label'].to_numpy(),
    'pred': np.asarray(pred, dtype='float64').ravel(),
})

# pearsonr returns the correlation coefficient r and its p-value;
# it does not return r squared.
r = stats.pearsonr(df['true'], df['pred'])
pearson_r, p_value = r[0], r[1]
print(f"Pearson r: {pearson_r:.4f} (p-value: {p_value:.3g})")

the Pearson-r2 score is:  (0.7703837071201431, 1.1217100473951953e-197)
