In [1]:
!nvidia-smi

Tue Apr 19 21:18:16 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.57.02    Driver Version: 470.57.02    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 8000     On   | 00000000:86:00.0 Off |                    0 |
| N/A   27C    P8    14W / 250W |      0MiB / 45556MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------

In [2]:
# !export CUDA_HOME=/usr/local/cuda-10.1
# !git clone https://github.com/NVIDIA/apex
# !pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [3]:
!pip install transformers
!pip install pytorch_lightning
!pip install sentencepiece
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting setuptools==59.5.0
  Using cached setuptools-59.5.0-py3-none-any.whl (952 kB)
Installing collected packages: setuptools
Successfully installed setuptools-59.5.0
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m
Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/share/apps/python/3.8.6/intel/bin/python -m pip install --upgrade pip' command.[0m


## T5 fine-tuning





In [4]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
import transformers

from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /home/mss9240/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

## Model



In [6]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    print(hparams) ########
    self.save_hyperparameters(hparams)
    
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
  
  def is_logger(self):
    return self.trainer.global_rank <= 0
  
  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)

    tensorboard_logs = {"train_loss": loss}
    return {"loss": loss, "log": tensorboard_logs}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    #return {"avg_train_loss": avg_train_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
    self.log("avg_train_loss", avg_train_loss) 
    self.log("log", tensorboard_logs)
    self.log('progress_bar', tensorboard_logs)

  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    return {"val_loss": loss}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    tensorboard_logs = {"val_loss": avg_loss}
    #return {"avg_val_loss": avg_loss, "log": tensorboard_logs, 'progress_bar': tensorboard_logs}
    self.log("avg_val_loss", avg_loss) 
    self.log("log", tensorboard_logs)
    self.log('progress_bar', tensorboard_logs)

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  


  def optimizer_step(self,
                     epoch=None, 
                    batch_idx=None, 
                    optimizer=None, 
                    optimizer_idx=None, 
                    optimizer_closure=None, 
                    on_tpu=None, 
                    using_native_amp=None, 
                    using_lbfgs=None
                     ):

    optimizer.step(closure=optimizer_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    train_dataset = get_dataset(tokenizer=self.tokenizer, input_data=train_data, args=self.hparams)
    dataloader = DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    val_dataset = get_dataset(tokenizer=self.tokenizer, input_data=val_data, args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [7]:
logger = logging.getLogger(__name__)

class LoggingCallback(pl.Callback):
  def on_validation_end(self, trainer, pl_module):
    logger.info("***** Validation results *****")
    if pl_module.is_logger():
      metrics = trainer.callback_metrics
      # Log results
      for key in sorted(metrics):
        if key not in ["log", "progress_bar"]:
          logger.info("{} = {}\n".format(key, str(metrics[key])))

  def on_test_end(self, trainer, pl_module):
    logger.info("***** Test results *****")

    if pl_module.is_logger():
      metrics = trainer.callback_metrics

      # Log and save results to file
      output_test_results_file = os.path.join(pl_module.hparams.output_dir, "test_results.txt")
      with open(output_test_results_file, "w") as writer:
        for key in sorted(metrics):
          if key not in ["log", "progress_bar"]:
            logger.info("{} = {}\n".format(key, str(metrics[key])))
            writer.write("{} = {}\n".format(key, str(metrics[key])))

In [9]:
args_dict = dict(
    data_dir="News_dataset/Sarcasm_Headlines_Dataset_v2.json", # path for data files
    output_dir="out_dir", # path to save the checkpoints
    model_name_or_path='t5-base',
    tokenizer_name_or_path='t5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=5,
    gradient_accumulation_steps=16,
    n_gpu=1,
    #early_stop_callback=False,
    fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
    opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
    max_grad_norm=1.0, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
    seed=42,
)

## Sarcasm Detection

### Viewing data

In [10]:
path_to_json_file = 'News_dataset/Sarcasm_Headlines_Dataset_v2.json'

import json 

with open(path_to_json_file, 'r') as j:
     json_data = [json.loads(line) for line in j]


In [11]:
random.shuffle(json_data)

train_data = json_data[:17171]
test_data = json_data[17171:22895]
val_data = json_data[22895:]

In [12]:
print(len(train_data))

17171


In [13]:
for i in range (5):
  print(train_data[i])

{'is_sarcastic': 1, 'headline': "'the natural' not on tv often enough for area dad", 'article_link': 'https://local.theonion.com/the-natural-not-on-tv-often-enough-for-area-dad-1819573401'}
{'is_sarcastic': 0, 'headline': "kim davis's anti-gay views are going to cost her state big time", 'article_link': 'https://www.huffingtonpost.com/entry/kim-davis-kentucky-ruling_us_59725b64e4b09e5f6ccf43f0'}
{'is_sarcastic': 0, 'headline': 'love letters from wwii: in memory of my father', 'article_link': 'https://www.huffingtonpost.com/entry/love-letters-from-wwiiin-_b_5465645.html'}
{'is_sarcastic': 0, 'headline': 'muslims respond to hateful protests with voter registration drives', 'article_link': 'https://www.huffingtonpost.com/entry/muslims-respond-to-hateful-protests-with-voter-registration-drives_us_5617cc68e4b0082030a20b2c'}
{'is_sarcastic': 1, 'headline': 'hypothetical multi-ethnic customer base smiles down from hmo billboard', 'article_link': 'https://www.theonion.com/hypothetical-multi-et

In [14]:
for i in range(5):
  print(test_data[i])
  print(type(test_data[i]))
  print(type(test_data[i]['headline']))

{'is_sarcastic': 1, 'headline': 'biological life regrets waiting 2.3 billion years to try sex', 'article_link': 'https://www.theonion.com/biological-life-regrets-waiting-2-3-billion-years-to-tr-1819579828'}
<class 'dict'>
<class 'str'>
{'is_sarcastic': 1, 'headline': "trump's switzerland trip cancelled as president deemed flight risk", 'article_link': 'https://politics.theonion.com/trumps-switzerland-trip-cancelled-as-president-deemed-f-1822392985'}
<class 'dict'>
<class 'str'>
{'is_sarcastic': 1, 'headline': 'guy typing in all caps supports edward snowden', 'article_link': 'https://local.theonion.com/guy-typing-in-all-caps-supports-edward-snowden-1819575118'}
<class 'dict'>
<class 'str'>
{'is_sarcastic': 0, 'headline': "instead of arresting panhandlers, albuquerque's giving them jobs", 'article_link': 'https://www.huffingtonpost.com/entry/instead-of-arresting-panhandlers-albuquerques-giving-them-jobs_us_56686076e4b0f290e52174ab'}
<class 'dict'>
<class 'str'>
{'is_sarcastic': 1, 'headl

In [15]:
pand = pd.DataFrame(train_data)
input, target = pand.loc[10, 'is_sarcastic'], pand.loc[10, 'headline']
print(input, target)
print(type(input))

1 idiotic tree keeps trying to plant seeds on sidewalk
<class 'numpy.int64'>


In [16]:
print(len(test_data))

5724


In [17]:
tokenizer = T5Tokenizer.from_pretrained('t5-base')

### Dataset

Lets check how t5 encodes the following labels

In [18]:
tags = ['sarcasm', 'non_sarcasm']
for em in tags:
  print(len(tokenizer.encode(em)))
  print(tokenizer.encode(em))

7
[3, 7, 4667, 9, 7, 51, 1]
8
[529, 834, 7, 4667, 9, 7, 51, 1]


In [19]:
class SarcasmDataset(Dataset):
  def __init__(self, tokenizer, input_data, max_len=512):
    self.data = pd.DataFrame(input_data)
    self.max_len = max_len
    self.tokenizer = tokenizer
    self.inputs = []
    self.targets = []
    self._build()

  def __len__(self):
    return len(self.inputs)

  def __getitem__(self, index):
    source_ids = self.inputs[index]["input_ids"].squeeze()
    target_ids = self.targets[index]["input_ids"].squeeze()

    src_mask    = self.inputs[index]["attention_mask"].squeeze()  # might need to squeeze
    target_mask = self.targets[index]["attention_mask"].squeeze()  # might need to squeeze

    return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
  
  def _build(self):
    for idx in range(len(self.data)):
      input_, target = self.data.loc[idx, 'headline'], self.data.loc[idx, 'is_sarcastic']
      if target==1:
        target = 'sarcasm'
      else:
        target = 'non_sarcasm'
      input_ = input_ + ' </s>'
      target = target + " </s>"

      tokenized_inputs = self.tokenizer.batch_encode_plus(
          [input_], max_length=self.max_len, pad_to_max_length=True, return_tensors="pt"
      )
       # tokenize targets
      tokenized_targets = self.tokenizer.batch_encode_plus(
          [target], max_length=8, pad_to_max_length=True, return_tensors="pt"
      )

      self.inputs.append(tokenized_inputs)
      self.targets.append(tokenized_targets)

In [20]:
dataset = SarcasmDataset(tokenizer,train_data,512)
len(dataset)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


17171

In [21]:
data = dataset[42]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

hey, which one of you wise guys teleported me into the future?</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pa

### Train

In [22]:
args_dict.update({'input_data': train_data,'num_train_epochs':5})
args = argparse.Namespace(**args_dict)
print(args_dict)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath='/scratch/mss9240/', monitor="val_loss", mode="min", save_top_k=5
)

train_params = dict(
    #accumulate_grad_batches=args.gradient_accumulation_steps,
    gpus=args.n_gpu,
    max_epochs=args.num_train_epochs,
    #early_stop_callback=False,
    precision= 16 if args.fp_16 else 32,
    #amp_level=args.opt_level,
    gradient_clip_val=args.max_grad_norm,
    checkpoint_callback=checkpoint_callback,
    callbacks=[LoggingCallback()],
)

In [24]:
def get_dataset(tokenizer, input_data, args):
  return SarcasmDataset(tokenizer=tokenizer, input_data = train_data,  max_len=args.max_seq_length)

In [25]:
model = T5FineTuner(args)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### For training and saving the model

In [None]:
trainer = pl.Trainer(**train_params)

In [None]:
trainer.fit(model)

In [29]:
torch.save(model.state_dict(), 'T5_News/T5model_for_news')

### For loading the saved model

In [27]:
model.load_state_dict(torch.load('T5_News/T5model_for_news'))
model.eval()

T5FineTuner(
  (model): T5ForConditionalGeneration(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseReluDense(
                (wi): Linear(in_features=768, out_features=3072, bias=False)
                (wo): Linear(in_featur

### Eval

In [28]:
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics

In [29]:
dataset = SarcasmDataset(tokenizer, test_data , 512)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [30]:
it = iter(loader)

In [31]:
batch = next(it)
batch["source_ids"].shape

torch.Size([32, 512])

In [32]:
outs = model.model.generate(input_ids=batch['source_ids'], 
                              attention_mask=batch['source_mask'], 
                              max_length=8)

dec = [tokenizer.decode(ids) for ids in outs]

texts = [tokenizer.decode(ids) for ids in batch['source_ids']]
targets = [tokenizer.decode(ids) for ids in batch['target_ids']]

In [33]:
for i in range(32):
    c = texts[i]
    lines = textwrap.wrap("text:\n%s\n" % c, width=100)
    print("\n".join(lines))
    print("\nActual sentiment: %s" % targets[i])
    print("predicted sentiment: %s" % dec[i])
    print("=====================================================================\n")

text: scientists finally prove what area dad has been saying for years</s> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>
<pad> <pad> <pad> <pad> <pad> <pad> <

#### Test Metrics

In [34]:
dataset = SarcasmDataset(tokenizer, test_data , 512)
loader = DataLoader(dataset, batch_size=32, num_workers=4)
model.model.eval()
outputs = []
targets = []
for batch in tqdm(loader):
  outs = model.model.generate(input_ids=batch['source_ids'], 
                              attention_mask=batch['source_mask'], 
                              max_length=8)

  dec = [tokenizer.decode(ids) for ids in outs]
  target = [tokenizer.decode(ids) for ids in batch["target_ids"]]
  
  outputs.extend(dec)
  targets.extend(target)



  0%|          | 0/179 [00:02<?, ?it/s]

In [35]:
emotions = [ "<pad> sarcasm</s>", "<pad> non_sarcasm"]
for em in emotions:
  print(len(tokenizer.encode(em)))

8
9


In [36]:
i=0
for i, tar in enumerate(targets):
  print(i, tar)
  i+=1
  if i>300:
    break


0 sarcasm</s> <pad>
1 sarcasm</s> <pad>
2 sarcasm</s> <pad>
3 non_sarcasm</s>
4 sarcasm</s> <pad>
5 non_sarcasm</s>
6 non_sarcasm</s>
7 sarcasm</s> <pad>
8 non_sarcasm</s>
9 non_sarcasm</s>
10 non_sarcasm</s>
11 sarcasm</s> <pad>
12 sarcasm</s> <pad>
13 sarcasm</s> <pad>
14 sarcasm</s> <pad>
15 sarcasm</s> <pad>
16 sarcasm</s> <pad>
17 sarcasm</s> <pad>
18 non_sarcasm</s>
19 sarcasm</s> <pad>
20 non_sarcasm</s>
21 non_sarcasm</s>
22 sarcasm</s> <pad>
23 non_sarcasm</s>
24 sarcasm</s> <pad>
25 sarcasm</s> <pad>
26 non_sarcasm</s>
27 sarcasm</s> <pad>
28 sarcasm</s> <pad>
29 sarcasm</s> <pad>
30 sarcasm</s> <pad>
31 sarcasm</s> <pad>
32 sarcasm</s> <pad>
33 sarcasm</s> <pad>
34 sarcasm</s> <pad>
35 non_sarcasm</s>
36 non_sarcasm</s>
37 non_sarcasm</s>
38 non_sarcasm</s>
39 non_sarcasm</s>
40 sarcasm</s> <pad>
41 sarcasm</s> <pad>
42 non_sarcasm</s>
43 sarcasm</s> <pad>
44 sarcasm</s> <pad>
45 non_sarcasm</s>
46 non_sarcasm</s>
47 sarcasm</s> <pad>
48 non_sarcasm</s>
49 sarcasm</s> <pad>


In [37]:
i=0
for i, out in enumerate(outputs):
  print(i, out)
  if out not in emotions:
    print(i, 'detected invalid prediction')
  i+=1
  if i>300:
    break

0 <pad> sarcasm</s>
1 <pad> non_sarcasm
2 <pad> sarcasm</s>
3 <pad> non_sarcasm
4 <pad> sarcasm</s>
5 <pad> non_sarcasm
6 <pad> non_sarcasm
7 <pad> sarcasm</s>
8 <pad> non_sarcasm
9 <pad> non_sarcasm
10 <pad> non_sarcasm
11 <pad> sarcasm</s>
12 <pad> sarcasm</s>
13 <pad> sarcasm</s>
14 <pad> sarcasm</s>
15 <pad> sarcasm</s>
16 <pad> sarcasm</s>
17 <pad> non_sarcasm
18 <pad> sarcasm</s>
19 <pad> non_sarcasm
20 <pad> non_sarcasm
21 <pad> non_sarcasm
22 <pad> sarcasm</s>
23 <pad> non_sarcasm
24 <pad> sarcasm</s>
25 <pad> sarcasm</s>
26 <pad> non_sarcasm
27 <pad> sarcasm</s>
28 <pad> sarcasm</s>
29 <pad> sarcasm</s>
30 <pad> sarcasm</s>
31 <pad> sarcasm</s>
32 <pad> non_sarcasm
33 <pad> sarcasm</s>
34 <pad> sarcasm</s>
35 <pad> non_sarcasm
36 <pad> non_sarcasm
37 <pad> non_sarcasm
38 <pad> non_sarcasm
39 <pad> non_sarcasm
40 <pad> sarcasm</s>
41 <pad> sarcasm</s>
42 <pad> non_sarcasm
43 <pad> sarcasm</s>
44 <pad> sarcasm</s>
45 <pad> non_sarcasm
46 <pad> non_sarcasm
47 <pad> sarcasm</s>
48

In [38]:
target_list = []
output_list = []
for i, tar in enumerate(targets):
  if 'non_sarcasm' in tar:
    target_list.append(0)
  else:
    target_list.append(1)

for i, tar in enumerate(outputs):
  if 'non_sarcasm' in tar:
    output_list.append(0)
  else:
    output_list.append(1)

print(metrics.accuracy_score(target_list, output_list))

0.8909853249475891


In [39]:
print(metrics.classification_report(target_list, output_list))

              precision    recall  f1-score   support

           0       0.89      0.91      0.90      3012
           1       0.90      0.87      0.88      2712

    accuracy                           0.89      5724
   macro avg       0.89      0.89      0.89      5724
weighted avg       0.89      0.89      0.89      5724

