<a href="https://colab.research.google.com/github/zecakpm/NLP/blob/main/roBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using RoBERTa: A Robustly Optimized BERT Pretraining Approach\
to predict relation between headers and news.

https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md

In [2]:
import pandas as pd
import re
from itertools import cycle 

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
#open file
text = open('/content/drive/My Drive/Colab Notebooks/NLP/somefile.txt','r')


In [5]:
#convert to a list and to a dataframe
df = pd.DataFrame(list(text))


In [6]:
#make a copy
data = df.copy()
len(data)

149916

In [7]:
data.head()

Unnamed: 0,0
0,A small meteorite crashed into a wooded area i...
1,"Soldier shot, Parliament locked down after gun..."
2,unrelated\n
3,A small meteorite crashed into a wooded area i...
4,Tourist dubbed ‘Spider Man’ after spider burro...


In [8]:
#creating a temp. column
x = cycle(range(3))
data['tmp'] = [next(x) for row in range(len(data))]

In [9]:
#populating lists based on temp. column
news = []
headers = []
labels = []
for i in range(len(data)):
    a = data[0][i]
    if data['tmp'][i] == 0:
      news.append(a)
    elif data['tmp'][i] == 1:
      headers.append(a)
    else:
      labels.append(a)

In [10]:
#checking length is each list
max_length_news = max(len(x) for x in news )
min_length_news = min(len(x) for x in news )
max_length_headers = max(len(x) for x in headers )
min_length_headers = min(len(x) for x in headers )

print(len(news), len(headers) , len(labels))
print(min_length_news)
print(max_length_news)
print(min_length_headers)
print(max_length_headers)

49972 49972 49972
39
27580
10
226


In [26]:
#creating a new df
new_df = pd.DataFrame(
    {'news':news,
     'headers':headers,
     'labels':labels}
)
new_df.head()

Unnamed: 0,news,headers,labels
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated\n
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated\n
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated\n
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated\n
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated\n


In [27]:
new_df['labels'].value_counts()

unrelated\n    36545
discuss\n       8909
agree\n         3678
disagree\n       840
Name: labels, dtype: int64

In [28]:
#removing the trailing new line
new_df['labels'] = [new_df['labels'][row].rstrip("\n") for row in range(len(new_df))]
new_df.head()

Unnamed: 0,news,headers,labels
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated


In [None]:
!pip install hydra-core omegaconf

In [None]:
import torch
from torch import cuda
# Download RoBERTa already finetuned for MNLI
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval() 

In [17]:
#batched predictions
from fairseq.data.data_utils import collate_tokens


In [18]:
#running model in a small batch
batch = collate_tokens(
    [roberta.encode(row[0], row[1]) for row in new_df], pad_idx=1
)

logprobs = roberta.predict('mnli', batch)
print(logprobs.argmax(dim=1))

tensor([2, 0, 2])


In [29]:
#creating a new copy
df_2 = new_df.copy()

In [30]:
#checking label counts
df_2['labels'].value_counts()

unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: labels, dtype: int64

In [44]:
#updating labels for model comparison
#lets add disagree and unrelated together
df_2['labels_mod'] = ''
df_2['labels_int'] = ''

for label in range(len(df_2)):
  if df_2['labels'][label] == 'unrelated':
    df_2['labels_mod'][label] = 'contradiction'
    df_2['labels_int'][label] = 0
  elif df_2['labels'][label] == 'disagree':
    df_2['labels_mod'][label] = 'contradiction'
    df_2['labels_int'][label] = 1
  elif df_2['labels'][label] == 'discuss':
    df_2['labels_mod'][label] = 'neutral'
    df_2['labels_int'][label] = 1
  else:
    df_2['labels_mod'][label] = 'entailment'
    df_2['labels_int'][label] = 2




In [45]:
#checking label counts
df_2['labels_mod'].value_counts()

contradiction    37385
neutral           8909
entailment        3678
Name: labels_mod, dtype: int64

In [46]:
#checking label counts
df_2['labels_int'].value_counts()

0    36545
1     9749
2     3678
Name: labels_int, dtype: int64

In [84]:
#Evaluating the model
%%time
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0 , 0 
#nsamples = len(df_2)
roberta.cuda()
roberta.eval()

for row in range(len(df_2)):
  sent1, sent2, target = df_2.iloc[:,0], df_2.iloc[:,1], df_2.iloc[:,3]
  tokens = roberta.encode(sent1[row][0:511], sent2[row])
  prediction = roberta.predict('mnli', tokens).argmax().item()
  prediction_label = label_map[prediction]
  if prediction_label == target[row]: ncorrect +=1
  nsamples +=1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.21111822620667575
CPU times: user 40min 25s, sys: 5.92 s, total: 40min 31s
Wall time: 40min 35s
