In [1]:
from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd

In [33]:
data=load_dataset("json",data_files="news-headlines-dataset-for-sarcasm-detection.zip")
data=data.rename_column("is_sarcastic","label")

data=data.remove_columns(['article_link'])

data.set_format('pandas')
data=data['train'][:]
data

Using custom data configuration default-a9db47ecc5b6cc90
Found cached dataset json (C:/Users/User/.cache/huggingface/datasets/json/default-a9db47ecc5b6cc90/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,headline,label
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0
...,...,...
55323,jews to celebrate rosh hashasha or something,1
55324,internal affairs investigator disappointed con...,1
55325,the most beautiful acceptance speech this week...,0
55326,mars probe destroyed by orbiting spielberg-gat...,1


In [44]:
data=load_dataset("json",data_files="news-headlines-dataset-for-sarcasm-detection.zip")
data=data.rename_column("is_sarcastic","label")

data=data.remove_columns(['article_link'])

data.set_format('pandas')
data=data['train'][:]

data.drop_duplicates(subset=['headline'],inplace=True)
data = data.reset_index()[['headline','label']]
data = Dataset.from_pandas(data)

# 80% train, 20% test + validation
train_testvalid = data.train_test_split(test_size=0.2,seed=15)

# Split the 10% test + valid in half test, half valid
test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

# gather everyone if you want to have a single DatasetDict
data = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

data

Using custom data configuration default-a9db47ecc5b6cc90
Found cached dataset json (C:/Users/User/.cache/huggingface/datasets/json/default-a9db47ecc5b6cc90/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [3]:
checkpoint = "cardiffnlp/twitter-roberta-base-emotion"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

In [4]:
def tokenize(batch):
    return tokenizer(batch["headline"], truncation=True,max_length=512)

tokenized_dataset = data.map(tokenize, batched=True)
tokenized_dataset

tokenized_dataset.set_format("torch",columns=["input_ids", "attention_mask", "label"])
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

In [158]:
print(data['train'][0:4])

{'headline': ['the best reason to love our bodies', 'motivational poster inspires 264 layoffs', 'merle haggard haggard', 'women in business q&a: sarah davanzo, chief cultural strategy officer, sparks & honey'], 'label': [0, 1, 1, 0]}


In [159]:
print(tokenized_dataset['train'][0:4])

{'label': tensor([0, 1, 1, 0]), 'input_ids': [tensor([   0,  627,  275, 1219,    7,  657,   84, 3738,    2]), tensor([    0, 25331,  1879,  5033, 11566, 25362, 36223, 22788,     2]), tensor([   0, 2089,  459, 1368, 7165, 1120, 1368, 7165, 1120,    2]), tensor([    0, 22197,    11,   265,  2231,   947,   102,    35,   579, 36000,
          385, 10937,  6527,     6,   834,  4106,  1860,  1036,     6, 30555,
          359, 10658,     2])], 'attention_mask': [tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])]}


{'input_ids': [0, 627, 275, 1219, 7, 657, 84, 3738, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [5]:
class CustomModel(nn.Module):
    def __init__(self,checkpoint,num_labels): 
        super(CustomModel,self).__init__() 
        self.num_labels = num_labels 

        #Load Model with given checkpoint and extract its body
        self.model = model = AutoModel.from_pretrained(checkpoint,config=AutoConfig.from_pretrained(checkpoint, output_attentions=True,output_hidden_states=True))
        self.dropout = nn.Dropout(0.1) 
        self.classifier = nn.Linear(768,num_labels) # load and initialize weights

    def forward(self, input_ids=None, attention_mask=None,labels=None):
        #Extract outputs from the body
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        #Add custom layers
        sequence_output = self.dropout(outputs[0]) #outputs[0]=last hidden state

        logits = self.classifier(sequence_output[:,0,:].view(-1, 768)) # calculate losses

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states,attentions=outputs.attentions)

In [80]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model=CustomModel(checkpoint=checkpoint,num_labels=2).to(device)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

In [32]:
next(iter(data['train']))

{'headline': 'the best reason to love our bodies', 'label': 0}

In [27]:
# Display image and label.
train_input_ids, train_attention_mask, train_labels = [next(iter(train_dataloader))[i] for i in ['input_ids', 'attention_mask', 'labels']]
print(f"Feature batch shape: {train_input_ids.size()}")
print(f"Labels batch shape: {train_attention_mask.size()}")
print(f"a batch shape: {train_labels}")

Feature batch shape: torch.Size([8, 23])
Labels batch shape: torch.Size([8, 21])
a batch shape: tensor([1, 1, 1, 0, 1, 1, 0, 1])


In [20]:
next(iter(train_dataloader))['input_ids']

tensor([[    0,   560,     5, 24409,   693,  9701,    23,     5, 22297,  2103,
             2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,  7907,   313,   115,   304,     5,  6120,  6992,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1],
        [    0,   417, 28828,    34,  1334,    90, 10648, 28689,    81,  1363,
          2134,  1857,    16,  8653,   615,     2,     1,     1,     1,     1,
             1],
        [    0,   245,    12,   180,    12,   279,    34,  1144,    12,   605,
         18723,   154, 14094,    19,  3795,    71,  3062,  6848,     2,     1,
             1],
        [    0, 24139, 20125, 25362,    92,   295, 26649,   605,  3099,     9,
             5, 29703,   128,   571,  1517,   108,     2,     1,     1,     1,
             1],
        [    0,   298,  5865,  7049,  1430,    35,    99,    47,   240,     7,
           216,    15,   263, 47153,   883,   

In [18]:
train_input_ids

'input_ids'

In [206]:
data_collator

DataCollatorWithPadding(tokenizer=PreTrainedTokenizerFast(name_or_path='cardiffnlp/twitter-roberta-base-emotion', vocab_size=50265, model_max_len=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [7]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_dataset["train"], shuffle=True, batch_size=8, collate_fn=data_collator
)
eval_dataloader = DataLoader(
    tokenized_dataset["valid"], batch_size=8, collate_fn=data_collator
)

In [8]:
from transformers import AdamW,get_scheduler

optimizer = AdamW(model.parameters(), lr=5e-5)

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

8553




In [9]:
from datasets import load_metric
metric = load_metric("f1")

  metric = load_metric("f1")


In [10]:
from tqdm.auto import tqdm

progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epochs * len(eval_dataloader)))


for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
 
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    model.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])
        progress_bar_eval.update(1)
    
    print(metric.compute())

  0%|          | 0/8553 [00:00<?, ?it/s]

  0%|          | 0/1071 [00:00<?, ?it/s]

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'f1': 0.8902870625245773}
{'f1': 0.9330323551542512}
{'f1': 0.9374534623976172}


In [11]:
model.eval()

test_dataloader = DataLoader(
    tokenized_dataset["test"], batch_size=8, collate_fn=data_collator
)

for batch in test_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'f1': 0.915707389528407}

In [201]:
def text_pipeline(text):
    input_ids = torch.tensor([tokenizer(text, truncation=True,max_length=512)['input_ids']], device='cuda:0')
    return {'input_ids': input_ids}

def predict(text):
    with torch.no_grad():

        # Retrieve item
        text_ = text_pipeline(text)

        # Loading the saved model
        save_path = 'model_roberta_binary.pth'
        model = CustomModel(checkpoint=checkpoint,num_labels=2).to(device)
        model.load_state_dict(torch.load(save_path))
        model.eval()

        # Return prediction
        outputs = model(**text_)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)

        if predictions[0] == 0:
            print('Non-sarcastic')
        else:
            print('Sarcastic')

In [190]:
# torch.save(model.state_dict(), 'model_roberta_binary.pth')

In [202]:
predict('dad clarifies this not a food stop	')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Sarcastic


In [203]:
predict('someone thinks he is good in this game')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Sarcastic


In [205]:
predict('you are so short, i wonder what have you done in your childhood')

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-emotion were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-emotion and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Non-sarcastic
