[Reference](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb#scrollTo=y43HcyWgEadG)


In [1]:
# Installations
!pip install transformers==3.0.2

Collecting transformers==3.0.2
  Downloading transformers-3.0.2-py3-none-any.whl (769 kB)
[?25l[K     |▍                               | 10 kB 33.0 MB/s eta 0:00:01[K     |▉                               | 20 kB 37.1 MB/s eta 0:00:01[K     |█▎                              | 30 kB 42.0 MB/s eta 0:00:01[K     |█▊                              | 40 kB 32.8 MB/s eta 0:00:01[K     |██▏                             | 51 kB 35.0 MB/s eta 0:00:01[K     |██▋                             | 61 kB 39.2 MB/s eta 0:00:01[K     |███                             | 71 kB 28.4 MB/s eta 0:00:01[K     |███▍                            | 81 kB 29.9 MB/s eta 0:00:01[K     |███▉                            | 92 kB 31.8 MB/s eta 0:00:01[K     |████▎                           | 102 kB 33.4 MB/s eta 0:00:01[K     |████▊                           | 112 kB 33.4 MB/s eta 0:00:01[K     |█████▏                          | 122 kB 33.4 MB/s eta 0:00:01[K     |█████▌                          | 133 k

In [2]:
# imports
import os
import random

import torch
from torch.utils.data import Dataset, DataLoader
from torch import cuda

import transformers
from transformers import RobertaTokenizer, RobertaModel
from transformers import pipeline

from torch import cuda
from tqdm import tqdm
device = 'cuda' if cuda.is_available() else 'cpu'

In [3]:
# Mounting Google Drive to this .ipynb
from google.colab import drive
drive.mount('/content/drive')

'''
About: SST-2 Dataset (Stanford Sentiment Treebank - 2)

Source - Rotten Tomatoes.
Task- Sentiment Analysis 
Classes - Positive and Negative. 
Data Split - train/dev/test (pre-split)

The Test set for SST-2 is not labelled as this is a benchmark dataset. 

'''


# assume 'data/SST-2/' directory in google drive account ie. SST-2 dataset downloaded in the data folder in the main Google Drive root directory

train_data_loc = '/content/drive/My Drive/data/SST-2/train.tsv'
dev_data_loc = '/content/drive/My Drive/data/SST-2/dev.tsv'
test_data_loc = '/content/drive/My Drive/data/SST-2/test.tsv'

Mounted at /content/drive


In [None]:
#! mkdir /content/drive/MyDrive/data/SST-2/Few_Shot


In [4]:
# File Data Reader

def data_reader(filename):
  '''
  Reads the NLP datset from a .tsv file (each line is a separate example)

  Returns:
    data - List of (text,label) tuples
    y_sizes - Number of classes in the dataset
    class_sizes - List of integers representing number of training examples in the class

  '''  

  with open(filename) as f:
    f.readline()

    data = [line.split('\t') for line in f]
  data = [(x, int(y)) for (x,y) in data]
  y_size = len(set([int(y) for (x,y) in data]))

  data_by_class = {}

  for i in range(y_size):
    data_by_class[i] = []
    
  for x,y in data:
    data_by_class[int(y)].append(x)

  class_size = [len(data_by_class[i]) for i in range(y_size)]

  # Can also return data by class if needed
  return data, y_size, class_size



In [5]:
# Create a few shot learning set

def create_fewshot(K, data, y_size):
  '''
  Input:
    K - Number of examples per class
    data - dataset as a List of (text,label) tuples
    y_size - Number of classes in the datset
  '''
  
  dataset = []
  class_counter = [0 for i in range(y_size)]
  class_fixed = [0 for i in range(y_size)]

  while(sum(class_fixed) < y_size):
    sample = random.choice(data)
    
    if class_counter[sample[1]] < K:
      dataset.append(sample)
      class_counter[sample[1]] += 1

    elif class_counter[sample[1]] == K:
      class_fixed[sample[1]] = 1


  return dataset



In [6]:
def dataset_to_file(dataset, filepath):
  '''
    Saves the few shot learning Dataset (from dataset) in the filename(filepath)
  '''
  with open(filepath, 'w') as f:
    f.write('\n')
    for x,y in dataset:
      line = x + '\t' + str(y)+'\n'
      f.write(line)


In [None]:

# Creating Few Shot Datasets

# Number of few-shot datasets you want to create
n_datasets = 5 
K = 16 # Number of examples per class

# location where datsets are to be created
few_shot_location = '/content/drive/MyDrive/data/SST-2/Few_Shot'


data, nclasses, class_sizes = data_reader(train_data_loc)

for i in range(n_datasets):
  
  filepath = few_shot_location+"/train_"+str(i)+'.tsv'

  fewshot_dataset = create_fewshot(K, data, nclasses)

  dataset_to_file(fewshot_dataset, filepath)




In [None]:
# 5 Development sets

dev_data, _ , dev_class_sizes = data_reader(dev_data_loc)

for i in range(n_datasets):
  
  filepath = few_shot_location+"/dev_"+str(i)+'.tsv'

  fewshot_dataset = create_fewshot(K, dev_data, nclasses)

  #dataset_to_file(fewshot_dataset, filepath)


In [7]:
data, nclasses, class_sizes = data_reader(train_data_loc)
dev_data, _ , dev_class_sizes = data_reader(dev_data_loc)

### **SST-2 Baseline 1**
Outputting the majority class (based on the training set) on original development set.

In [None]:

print("Dataset Classes: 0, 1")
majority_class = class_sizes.index(max(class_sizes))

print(f"Majority Class:{majority_class}")

accuracy = dev_class_sizes[1]/sum(dev_class_sizes) * 100
print("Majority Class by full training set: ", accuracy, " (acc)")


Dataset Classes: 0, 1
Majority Class:1
Majority Class by full training set:  50.917431192660544  (acc)


### **SST-2 Baseline 2** 
Prompt Based Zero Shot on pretrained. A.k.a formulating the sentance classification task as a Masked Language Modelling (MLM) Task using a their best Prompt. We are testing the prompt on the following base models -  

*   pretrained roberta-large
*   pretrained roberta-base

For SST-2, the paper suggests the following Prompt:

<u>Template</u>: \<S1> It was [MASK] .   
<u>Labels</u>: positive: great, negative: terrible

(where \<S1> refers to the example sentance)

In [None]:
# SST-2 Baseline 2 : Zero Shot Performance on Prompt Based Zero Shot on pretrained roberta-large and roberta-base

tokenizer = RobertaTokenizer.from_pretrained("roberta-large")
unmasker = pipeline('fill-mask', model='roberta-large')



Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-large and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-large")

In [None]:
neg = tokenizer.encode(' terrible')[1]
pos = tokenizer.encode(' great')[1]

positives = [pos]
negatives = [neg]

pred = []
for x,y in dev_data:
  
  example = x+" It was <mask> . "
  if x[0] != ' ':
     example = " " + example
  pred.append(unmasker(example)[0]['token'])



Zero Shot on Prompt Accuracy: 0.0 (acc)


In [None]:
result = []
for i in range(len(dev_data)):
  if pred[i] in positives and dev_data[i][1] == 1:
    result.append(True)
  if pred[i] in negatives and dev_data[i][1] == 0:
    result.append(True)
  result.append(False)
count = 0
for elem in result:
  if elem:
    count+=1

print(f"Zero Shot on Prompt Accuracy: {count/len(dev_data) * 100} (acc)")

Zero Shot on Prompt Accuracy: 11.582568807339449 (acc)


In [None]:
neg = tokenizer.encode(' terrible')[1]
pos = tokenizer.encode(' great')[1]

neg1 = tokenizer.encode(' bad')[1]
pos1 = tokenizer.encode(' good')[1]

neg2 = tokenizer.encode(' pathetic')[1]
pos2 = tokenizer.encode(' irresistible')[1]

neg3 = tokenizer.encode(' bad')[1]
pos3 = tokenizer.encode(' wonderful')[1]

neg4 = tokenizer.encode(' bad')[1]
pos4 = tokenizer.encode(' delicious')[1]

positives = set([pos, pos1,pos2, pos3, pos4])
negatives = set([neg, neg1,neg2, neg3, neg4])

In [None]:
pred1 = []
for x,y in dev_data:
  
  example = x+" A <mask> one . "
  if x[0] != ' ':
     example = " " + example
  pred1.append(unmasker(example)[0]['token'])


In [None]:
pred2 = []
for x,y in dev_data:
  
  example = x+" A <mask> piece . "
  if x[0] != ' ':
     example = " " + example
  pred2.append(unmasker(example)[0]['token'])

In [None]:
pred3 = []
for x,y in dev_data:
  
  example = x+" All in all <mask> . "
  if x[0] != ' ':
     example = " " + example
  pred3.append(unmasker(example)[0]['token'])

In [None]:
for i in range(len(dev_data)):
  if (pred1[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred1[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [None]:
for i in range(len(dev_data)):
  if (pred3[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred3[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [None]:
for i in range(len(dev_data)):
  if (pred2[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred2[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [None]:
count = 0
for elem in result:
  if elem:
    count+=1

print(f"Zero Shot on Prompt Accuracy: {count/len(dev_data) * 100} (acc)")

Zero Shot on Prompt Accuracy: 49.65596330275229 (acc)


In [None]:
unique_outputs = list(set(pred))

In [None]:
decoded_unique_outputs = tokenizer.batch_decode(unique_outputs)

### Now trying on `roberta-base`

In [8]:

tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
unmasker = pipeline('fill-mask', model='roberta-base')


Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/230 [00:00<?, ?B/s]



Downloading:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
neg = tokenizer.encode(' terrible')[1]
pos = tokenizer.encode(' great')[1]

positives = [pos]
negatives = [neg]

pred = []
for x,y in dev_data:
  
  example = x+" It was <mask> . "
  if x[0] != ' ':
     example = " " + example
  pred.append(unmasker(example)[0]['token'])



In [10]:
result = []
for i in range(len(dev_data)):
  if pred[i] in positives and dev_data[i][1] == 1:
    result.append(True)
  if pred[i] in negatives and dev_data[i][1] == 0:
    result.append(True)
  result.append(False)
count = 0
for elem in result:
  if elem:
    count+=1

print(f"Zero Shot on Prompt Accuracy: {count/len(dev_data) * 100} (acc)")

Zero Shot on Prompt Accuracy: 5.160550458715597 (acc)


In [11]:
neg = tokenizer.encode(' terrible')[1]
pos = tokenizer.encode(' great')[1]

neg1 = tokenizer.encode(' bad')[1]
pos1 = tokenizer.encode(' good')[1]

neg2 = tokenizer.encode(' pathetic')[1]
pos2 = tokenizer.encode(' irresistible')[1]

neg3 = tokenizer.encode(' bad')[1]
pos3 = tokenizer.encode(' wonderful')[1]

neg4 = tokenizer.encode(' bad')[1]
pos4 = tokenizer.encode(' delicious')[1]

positives = set([pos, pos1,pos2, pos3, pos4])
negatives = set([neg, neg1,neg2, neg3, neg4])

In [12]:
pred1 = []
for x,y in dev_data:
  
  example = x+" A <mask> one . "
  if x[0] != ' ':
     example = " " + example
  pred1.append(unmasker(example)[0]['token'])


In [13]:
pred2 = []
for x,y in dev_data:
  
  example = x+" A <mask> piece . "
  if x[0] != ' ':
     example = " " + example
  pred2.append(unmasker(example)[0]['token'])

In [14]:
pred3 = []
for x,y in dev_data:
  
  example = x+" All in all <mask> . "
  if x[0] != ' ':
     example = " " + example
  pred3.append(unmasker(example)[0]['token'])

In [15]:
for i in range(len(dev_data)):
  if (pred1[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred1[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [16]:
for i in range(len(dev_data)):
  if (pred3[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred3[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [17]:
for i in range(len(dev_data)):
  if (pred2[i] in positives) and dev_data[i][1] == 1:
    result[i] = result[i] or True
  if (pred2[i] in negatives) and dev_data[i][1] == 0:
    result[i] = result[i] or True
 

In [18]:
count = 0
for elem in result:
  if elem:
    count+=1

print(f"Zero Shot on Prompt Accuracy: {count/len(dev_data) * 100} (acc)")

Zero Shot on Prompt Accuracy: 46.788990825688074 (acc)
