In [1]:
import torch 
from transformers import RobertaTokenizer, RobertaForSequenceClassification 
from torch.nn.functional import softmax

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

def preprocess(text):
    new_text = []
    #remove mentions and urls
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# download label mapping
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PT
model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.save_pretrained(MODEL)


In [11]:
def get_roberta_score(text):
    text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    for i in range(scores.shape[0]):
        l = labels[ranking[i]]
        s = scores[ranking[i]]
        return l, np.round(float(s), 4)

In [6]:
import os 
import glob 
import pandas as pd
data_path = 'twitter_data/data/'
files = os.path.join(data_path, '*.csv')
for file in glob.glob(files):
    df = pd.read_csv(file)
    df['Content'] = df['Content'].apply(lambda x: x if isinstance(x, str) else '')
    sentiment_results = [get_roberta_score(tweet) for tweet in df['Content']]
    df['roberta_irony_label'] = [result[0] for result in sentiment_results]
    df['roberta_irony_score'] = [result[1] for result in sentiment_results]
    df.to_csv(file, index=False)

In [5]:
import pandas as pd

df = pd.read_csv('sampled_train.csv')

df['Content'] = df['Content'].apply(preprocess)

label_dict = {label: i for i, label in enumerate(labels)}
df['sentiment_label'] = df['roberta_sentiment_label'].map(label_dict)

from torch.utils.data import Dataset, DataLoader
import torch

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Create the dataset
dataset = SentimentDataset(df['Content'].tolist(), df['sentiment_label'].tolist(), tokenizer)

In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [7]:
from transformers import Trainer, TrainingArguments

model.to(device)
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

# Train the model
trainer.train()

  6%|▌         | 10/165 [00:27<06:49,  2.64s/it]

{'loss': 0.3008, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.18}


 12%|█▏        | 20/165 [00:56<06:23,  2.65s/it]

{'loss': 0.2821, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.36}


 18%|█▊        | 30/165 [01:28<07:33,  3.36s/it]

{'loss': 0.2789, 'learning_rate': 3e-06, 'epoch': 0.55}


 24%|██▍       | 40/165 [01:50<04:30,  2.17s/it]

{'loss': 0.2307, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.73}


 30%|███       | 50/165 [02:10<03:54,  2.04s/it]

{'loss': 0.241, 'learning_rate': 5e-06, 'epoch': 0.91}


 36%|███▋      | 60/165 [02:30<03:27,  1.97s/it]

{'loss': 0.1623, 'learning_rate': 6e-06, 'epoch': 1.09}


 42%|████▏     | 70/165 [02:50<03:08,  1.99s/it]

{'loss': 0.1365, 'learning_rate': 7.000000000000001e-06, 'epoch': 1.27}


 48%|████▊     | 80/165 [03:11<03:10,  2.24s/it]

{'loss': 0.1564, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.45}


 55%|█████▍    | 90/165 [03:35<02:46,  2.22s/it]

{'loss': 0.1371, 'learning_rate': 9e-06, 'epoch': 1.64}


 61%|██████    | 100/165 [03:58<02:22,  2.19s/it]

{'loss': 0.1494, 'learning_rate': 1e-05, 'epoch': 1.82}


 67%|██████▋   | 110/165 [04:17<01:31,  1.66s/it]

{'loss': 0.1717, 'learning_rate': 1.1000000000000001e-05, 'epoch': 2.0}


 73%|███████▎  | 120/165 [04:35<01:24,  1.87s/it]

{'loss': 0.093, 'learning_rate': 1.2e-05, 'epoch': 2.18}


 79%|███████▉  | 130/165 [04:54<01:05,  1.88s/it]

{'loss': 0.0545, 'learning_rate': 1.3000000000000001e-05, 'epoch': 2.36}


 85%|████████▍ | 140/165 [05:13<00:46,  1.85s/it]

{'loss': 0.0539, 'learning_rate': 1.4000000000000001e-05, 'epoch': 2.55}


 91%|█████████ | 150/165 [05:31<00:27,  1.85s/it]

{'loss': 0.1071, 'learning_rate': 1.5e-05, 'epoch': 2.73}


 97%|█████████▋| 160/165 [05:50<00:09,  1.85s/it]

{'loss': 0.0393, 'learning_rate': 1.6000000000000003e-05, 'epoch': 2.91}


100%|██████████| 165/165 [05:58<00:00,  2.17s/it]

{'train_runtime': 358.8352, 'train_samples_per_second': 7.274, 'train_steps_per_second': 0.46, 'train_loss': 0.16186873840563226, 'epoch': 3.0}





TrainOutput(global_step=165, training_loss=0.16186873840563226, metrics={'train_runtime': 358.8352, 'train_samples_per_second': 7.274, 'train_steps_per_second': 0.46, 'train_loss': 0.16186873840563226, 'epoch': 3.0})

In [8]:
new_path = 'fine-tuned-roberta-model'
model.save_pretrained(new_path)
tokenizer.save_pretrained(new_path)


('fine-tuned-roberta-model\\tokenizer_config.json',
 'fine-tuned-roberta-model\\special_tokens_map.json',
 'fine-tuned-roberta-model\\vocab.json',
 'fine-tuned-roberta-model\\merges.txt',
 'fine-tuned-roberta-model\\added_tokens.json',
 'fine-tuned-roberta-model\\tokenizer.json')

In [9]:
model_path = "fine-tuned-roberta-model"

# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)

model.eval()


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
import os 
import glob 
import pandas as pd
data_path = 'twitter_data/data/'
files = os.path.join(data_path, '*.csv')
for file in glob.glob(files):
    df = pd.read_csv(file)
    df['Content'] = df['Content'].apply(lambda x: x if isinstance(x, str) else '')
    sentiment_results = [get_roberta_score(tweet) for tweet in df['Content']]
    df['label_fine_tuned'] = [result[0] for result in sentiment_results]
    df['score_fine_tuned'] = [result[1] for result in sentiment_results]
    df.to_csv('fine-tuned/'+file, index=False)

In [14]:
data_path = 'kaggle_tweets/'
files = os.path.join(data_path, '*.csv')
for file in glob.glob(files):
    df = pd.read_csv(file)
    df['text'] = df['text'].apply(lambda x: x if isinstance(x, str) else '')
    sentiment_results = [get_roberta_score(tweet) for tweet in df['text']]
    df['label_fine_tuned'] = [result[0] for result in sentiment_results]
    df['score_fine_tuned'] = [result[1] for result in sentiment_results]
    df.to_csv('fine-tuned/'+file, index=False)