<a href="https://colab.research.google.com/github/zecakpm/NLP/blob/main/roBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Using RoBERTa: A Robustly Optimized BERT Pretraining Approach\
to predict relation between headers and news.

https://github.com/pytorch/fairseq/blob/master/examples/roberta/README.md

In [1]:
import pandas as pd
import numpy as np
from itertools import cycle 
from sklearn.metrics import confusion_matrix

In [2]:
#connecting with personal frive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
#open file
text = open('/content/drive/My Drive/Colab Notebooks/NLP/somefile.txt','r')


In [4]:
#convert to a list and to a dataframe
df = pd.DataFrame(list(text))


In [5]:
#make a copy
data = df.copy()
len(data)

149916

In [6]:
#data check
data.head()

Unnamed: 0,0
0,A small meteorite crashed into a wooded area i...
1,"Soldier shot, Parliament locked down after gun..."
2,unrelated\n
3,A small meteorite crashed into a wooded area i...
4,Tourist dubbed ‘Spider Man’ after spider burro...


In [7]:
#creating a temp. column
x = cycle(range(3))
data['tmp'] = [next(x) for row in range(len(data))]

In [8]:
#populating lists based on temp. column
news = []
headers = []
labels = []
for i in range(len(data)):
    a = data[0][i]
    if data['tmp'][i] == 0:
      news.append(a)
    elif data['tmp'][i] == 1:
      headers.append(a)
    else:
      labels.append(a)

In [9]:
#checking length is each list
max_length_news = max(len(x) for x in news )
min_length_news = min(len(x) for x in news )
max_length_headers = max(len(x) for x in headers )
min_length_headers = min(len(x) for x in headers )

print(len(news), len(headers) , len(labels))
print(min_length_news)
print(max_length_news)
print(min_length_headers)
print(max_length_headers)

49972 49972 49972
39
27580
10
226


In [10]:
#creating a new df
new_df = pd.DataFrame(
    {'news':news,
     'headers':headers,
     'labels':labels}
)
new_df.head()

Unnamed: 0,news,headers,labels
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated\n
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated\n
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated\n
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated\n
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated\n


In [11]:
#checking unique values for the column label
new_df['labels'].value_counts()

unrelated\n    36545
discuss\n       8909
agree\n         3678
disagree\n       840
Name: labels, dtype: int64

In [12]:
#removing the trailing new line
new_df['labels'] = [new_df['labels'][row].rstrip("\n") for row in range(len(new_df))]
new_df.head()

Unnamed: 0,news,headers,labels
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated


In [13]:
!pip install hydra-core omegaconf

Collecting hydra-core
[?25l  Downloading https://files.pythonhosted.org/packages/52/e3/fbd70dd0d3ce4d1d75c22d56c0c9f895cfa7ed6587a9ffb821d6812d6a60/hydra_core-1.0.6-py3-none-any.whl (123kB)
[K     |████████████████████████████████| 133kB 30.0MB/s 
[?25hCollecting omegaconf
  Downloading https://files.pythonhosted.org/packages/d0/eb/9d63ce09dd8aa85767c65668d5414958ea29648a0eec80a4a7d311ec2684/omegaconf-2.0.6-py3-none-any.whl
Collecting antlr4-python3-runtime==4.8
[?25l  Downloading https://files.pythonhosted.org/packages/56/02/789a0bddf9c9b31b14c3e79ec22b9656185a803dc31c15f006f9855ece0d/antlr4-python3-runtime-4.8.tar.gz (112kB)
[K     |████████████████████████████████| 112kB 44.4MB/s 
Collecting PyYAML>=5.1.*
[?25l  Downloading https://files.pythonhosted.org/packages/7a/a5/393c087efdc78091afa2af9f1378762f9821c9c1d7a22c5753fb5ac5f97a/PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636kB)
[K     |████████████████████████████████| 645kB 49.8MB/s 
Building wheels for collected package

In [None]:
import torch
from torch import cuda
# Download RoBERTa already finetuned for MNLI
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta.eval() 

In [16]:
#batched predictions
from fairseq.data.data_utils import collate_tokens


In [17]:
#running model in a small batch
batch = collate_tokens(
    [roberta.encode(row[0], row[1]) for row in new_df], pad_idx=1
)

logprobs = roberta.predict('mnli', batch)
print(logprobs.argmax(dim=1))

tensor([2, 0, 2])


In [18]:
#creating a new copy
df_2 = new_df.copy()

In [19]:
#checking label counts
df_2['labels'].value_counts()

unrelated    36545
discuss       8909
agree         3678
disagree       840
Name: labels, dtype: int64

In [20]:
#updating labels for model comparison
df_2['labels_int'] = ''

for label in range(len(df_2)):
  if df_2['labels'][label] == 'unrelated':
    df_2['labels_int'][label] = 0
  elif df_2['labels'][label] == 'disagree':
    df_2['labels_int'][label] = 1
  elif df_2['labels'][label] == 'discuss':
    df_2['labels_int'][label] = 2
  else:
    df_2['labels_int'][label] = 3




In [21]:
#checking label counts
df_2['labels_int'].value_counts()

0    36545
2     8909
3     3678
1      840
Name: labels_int, dtype: int64

In [23]:
#data check
df_2.head()

Unnamed: 0,news,headers,labels,labels_int
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated,0
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated,0
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated,0
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated,0
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated,0


In [24]:
#evaluating the model
%%time
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0 , 0 
roberta.cuda()
roberta.eval()
prediction_list = []

for row in range(len(df_2)):
  sent1, sent2, target = df_2.iloc[:,0], df_2.iloc[:,1], df_2.iloc[:,3]
  tokens = roberta.encode(sent1[row][0:511], sent2[row])
  prediction = roberta.predict('mnli', tokens).argmax().item()
  prediction_label = label_map[prediction]
  prediction_list.append(prediction_label)
  if prediction_label == target[row]: ncorrect +=1
  nsamples +=1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.0
CPU times: user 30min 44s, sys: 4.72 s, total: 30min 49s
Wall time: 30min 58s


In [26]:
#checking prediction list
prediction_list[0:10]

['neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral']

In [27]:
#adding predictions to a data frame as column
df_2['model_pred'] = prediction_list

In [81]:
#save 
df_2.to_csv('/content/drive/My Drive/Colab Notebooks/NLP/sometext_pred.csv',index=False)

In [82]:
#open file
text_pred = open('/content/drive/My Drive/Colab Notebooks/NLP/sometext_pred.csv')

In [83]:
#convert to a list and to a dataframe
df_pred = pd.read_csv(text_pred)

In [109]:
#data check
df_pred.head()

Unnamed: 0,news,headers,labels,labels_int,model_pred
0,A small meteorite crashed into a wooded area i...,"Soldier shot, Parliament locked down after gun...",unrelated,0,neutral
1,A small meteorite crashed into a wooded area i...,Tourist dubbed ‘Spider Man’ after spider burro...,unrelated,0,neutral
2,A small meteorite crashed into a wooded area i...,Luke Somers 'killed in failed rescue attempt i...,unrelated,0,neutral
3,A small meteorite crashed into a wooded area i...,BREAKING: Soldier shot at War Memorial in Otta...,unrelated,0,neutral
4,A small meteorite crashed into a wooded area i...,Giant 8ft 9in catfish weighing 19 stone caught...,unrelated,0,neutral


In [103]:
# adding labels to lists and merging lists
rows = list(df_pred['labels'].unique())
columns = list(df_pred['model_pred'].unique())
labels = rows + columns
print(labels)

['unrelated', 'agree', 'discuss', 'disagree']
['neutral', 'entailment', 'contradiction']
['unrelated', 'agree', 'discuss', 'disagree', 'neutral', 'entailment', 'contradiction']


In [105]:
# generating the confusion matrix (cm)
y_true  = df_pred['labels']
y_pred = df_pred['model_pred']
cm = confusion_matrix(y_true, y_pred, labels=labels)

In [108]:
# adding and printing the cm in a dataFrame format
cmxt = pd.DataFrame(cm, index=labels, columns=labels)
print(cmxt)

               unrelated  agree  discuss  ...  neutral  entailment  contradiction
unrelated              0      0        0  ...    31384         560           4601
agree                  0      0        0  ...     1558        1819            301
discuss                0      0        0  ...     3821        3968           1120
disagree               0      0        0  ...      389         142            309
neutral                0      0        0  ...        0           0              0
entailment             0      0        0  ...        0           0              0
contradiction          0      0        0  ...        0           0              0

[7 rows x 7 columns]
