In [3]:
from transformers import pipeline
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification, TrainingArguments
from transformers import PreTrainedModel
from transformers.pipelines.pt_utils import KeyDataset

from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score

import pandas as pd
import numpy as np
import logging
from glob import glob
from os import path

from IPython.display import HTML, display

import torch

In [4]:
# Class index -> label name, in the order of the classifier's output logits.
category_codes = dict(enumerate((
    'Claim',
    'Concluding Statement',
    'Counterclaim',
    'Evidence',
    'Lead',
    'Position',
    'Rebuttal',
)))
# Label names as a tuple, positionally aligned with the class indices above.
labels = tuple(category_codes.values())
labels

('Claim',
 'Concluding Statement',
 'Counterclaim',
 'Evidence',
 'Lead',
 'Position',
 'Rebuttal')

In [5]:
# Name of the pretrained checkpoint whose tokenizer is loaded.
model_checkpoint = "roberta-base"
# use_fast=True requests the fast tokenizer implementation for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [6]:
# Local checkpoint directory, given relative to the notebook's working directory.
# NOTE(review): presumably a fine-tuned roberta-base checkpoint — confirm against the training run.
model_path = r"models_gitignored/roberta_base/checkpoint-75756"
# Load the sequence classifier and pass the class index -> label name mapping as id2label.
model = AutoModelForSequenceClassification.from_pretrained(model_path, id2label=category_codes)

In [7]:
# Sample sentence used for the single-input walkthrough in the following cells.
input_text = "What the dog doing?"

In [8]:
# Tokenize the sample sentence into PyTorch tensors (return_tensors="pt").
encodings = tokenizer(input_text, return_tensors="pt")
# Forward pass; the call is not wrapped in torch.no_grad(), so the logits in the
# displayed output carry a grad_fn.
outputs = model(**encodings)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 5.6986, -2.5383, -1.3422,  0.7742,  0.5792, -0.9855, -1.9841]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [9]:
# Raw per-class scores: one row per input, one column per label (1 x 7 in the output).
outputs.logits

tensor([[ 5.6986, -2.5383, -1.3422,  0.7742,  0.5792, -0.9855, -1.9841]],
       grad_fn=<AddmmBackward0>)

The following syntax may look confusing. The logits tensor has a gradient function (`grad_fn`) attached to it, so to read just the values we first `.detach()` it from the gradient graph, then index `[0]` because the batch size is 1, and finally pick the entry at the index of the highest score. Note that the value printed as "confidence" is a raw logit, not a probability.

In [10]:
# Index of the largest logit. With no dim argument, argmax works on the flattened
# tensor, which equals the class index here because the batch holds one row.
highest_conf = outputs.logits.argmax()
print(
    f"predicted {labels[highest_conf]} "
    f"with confidence of {outputs.logits.detach()[0][highest_conf]}"
)

predicted Claim with confidence of 5.698571681976318


In [11]:
# First (and only) row of the logits as a plain Python list of floats.
outputs.logits[0].tolist()

[5.698571681976318,
 -2.5382676124572754,
 -1.3422482013702393,
 0.7742164134979248,
 0.5792469382286072,
 -0.985458493232727,
 -1.984054446220398]

In [12]:
# NOTE(review): this import sits mid-notebook, while the other imports are in the first cell.
import nltk

# Load the pickled English Punkt sentence tokenizer from NLTK's data directory.
# NOTE(review): presumably requires the "punkt" resource to be downloaded beforehand — confirm.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

def predict(text):
    """Score every sentence of ``text`` with the sequence classifier.

    Uses the notebook-level ``sentence_tokenizer``, ``tokenizer``, ``model``
    and ``labels`` objects.

    Args:
        text: String that is split into sentences with ``sentence_tokenizer``.

    Returns:
        dict mapping each sentence to a ``{label: logit}`` dict. The values are
        raw logits, not probabilities. Sentences are the dict keys, so a
        sentence that occurs several times in ``text`` appears only once.
    """
    scores = {}
    # Inference only: do not build an autograd graph for each forward pass.
    with torch.no_grad():
        for sentence in sentence_tokenizer.tokenize(text):
            # truncation=True caps over-long sentences at the tokenizer's
            # maximum length instead of passing them to the model uncut.
            encoded = tokenizer(sentence, return_tensors="pt", truncation=True)
            scores[sentence] = dict(zip(labels, model(**encoded).logits[0].tolist()))
    return scores


In [13]:
# Per-sentence logits for a two-sentence input, returned as a nested dict.
predict("I am Prannaya Gupta. My teammate is Liew Wei Pyn.")

{'I am Prannaya Gupta.': {'Claim': -0.05495935678482056,
  'Concluding Statement': -1.2194091081619263,
  'Counterclaim': -1.0489801168441772,
  'Evidence': -0.5730050206184387,
  'Lead': 7.462467193603516,
  'Position': -0.15923750400543213,
  'Rebuttal': -3.358013868331909},
 'My teammate is Liew Wei Pyn.': {'Claim': 4.576728343963623,
  'Concluding Statement': -3.076887845993042,
  'Counterclaim': -0.07713758945465088,
  'Evidence': 1.287278413772583,
  'Lead': 0.04913000017404556,
  'Position': -0.7670019865036011,
  'Rebuttal': -1.6367422342300415}}

In [14]:
def predictLogits(text, tokenizer, model):
    """Score every sentence of ``text`` with the given tokenizer and model.

    Sentence splitting and label names come from the notebook-level
    ``sentence_tokenizer`` and ``labels`` objects.

    Args:
        text: String that is split into sentences with ``sentence_tokenizer``.
        tokenizer: Callable that turns one sentence into the keyword inputs of
            ``model``; it is called with ``return_tensors="pt"`` and
            ``truncation=True``.
        model: Callable whose result exposes a ``logits`` tensor with one row
            per input.

    Returns:
        dict mapping each sentence to a ``{label: logit}`` dict. The values are
        raw logits, not probabilities. Sentences are the dict keys, so a
        sentence that occurs several times in ``text`` appears only once.
    """
    scores = {}
    # Inference only: do not build an autograd graph for each forward pass.
    with torch.no_grad():
        for sentence in sentence_tokenizer.tokenize(text):
            # truncation=True caps over-long sentences at the tokenizer's
            # maximum length instead of passing them to the model uncut.
            encoded = tokenizer(sentence, return_tensors="pt", truncation=True)
            scores[sentence] = dict(zip(labels, model(**encoded).logits[0].tolist()))
    return scores

def predictLabel(logits):
    """Return the key of ``logits`` that has the largest value.

    Args:
        logits: Mapping from label to score, e.g. one inner dict produced by
            ``predictLogits``.

    Returns:
        The label with the highest score. When several labels share the highest
        score, the one that comes last in the mapping's order is returned.

    Raises:
        IndexError: If ``logits`` is empty.
    """
    ranked = list(logits)
    # list.sort is stable, so equal scores keep their original relative order.
    ranked.sort(key=logits.__getitem__)
    return ranked[-1]



In [15]:
# Label of the first sentence. Uses predictLogits (always a dict) rather than
# predict, whose later redefinition in this notebook returns a DataFrame and
# would make this cell fail if re-run afterwards.
predictLabel(
    next(
        iter(
            predictLogits(
                "I agree with Porin as he highlighted in the passage above.",
                tokenizer,
                model,
            ).values()
        )
    )
)

'Position'

In [19]:
# One row per sentence, one column per label.
pd.DataFrame.from_dict(
    predictLogits("I am Prannaya Gupta. My teammate is Liew Wei Pyn.", tokenizer, model),
    orient="index",
)

Unnamed: 0,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal
I am Prannaya Gupta.,-0.054959,-1.219409,-1.04898,-0.573005,7.462467,-0.159238,-3.358014
My teammate is Liew Wei Pyn.,4.576728,-3.076888,-0.077138,1.287278,0.04913,-0.767002,-1.636742


In [42]:
def predict(text):
    """Score every sentence of ``text`` and return the result as a DataFrame.

    Uses the notebook-level ``sentence_tokenizer``, ``tokenizer``, ``model``
    and ``labels`` objects.

    Args:
        text: String that is split into sentences with ``sentence_tokenizer``.

    Returns:
        pandas.DataFrame indexed by sentence (unnamed index), with one float
        column of raw logits per entry of ``labels`` and a final ``"text"``
        column repeating the full input. A sentence that occurs several times
        in ``text`` yields a single row. Input without sentences yields an
        empty frame that still has all columns.
    """
    rows = {}
    # Inference only: do not build an autograd graph for each forward pass.
    with torch.no_grad():
        for sentence in sentence_tokenizer.tokenize(text):
            # truncation=True caps over-long sentences at the tokenizer's
            # maximum length instead of passing them to the model uncut.
            encoded = tokenizer(sentence, return_tensors="pt", truncation=True)
            logits = model(**encoded).logits[0].tolist()
            rows[sentence] = {**dict(zip(labels, logits)), "text": text}
    # Build the frame row-wise. Transposing a frame that mixes floats and
    # strings would turn every logit column into object dtype.
    return pd.DataFrame(list(rows.values()), index=list(rows), columns=[*labels, "text"])


In [43]:
# Same input as the earlier dict-based call; predict now returns a DataFrame
# with a trailing "text" column holding the full input.
predict("I am Prannaya Gupta. My teammate is Liew Wei Pyn.")

Unnamed: 0,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal,text
I am Prannaya Gupta.,-0.054959,-1.219409,-1.04898,-0.573005,7.462467,-0.159238,-3.358014,I am Prannaya Gupta. My teammate is Liew Wei Pyn.
My teammate is Liew Wei Pyn.,4.576728,-3.076888,-0.077138,1.287278,0.04913,-0.767002,-1.636742,I am Prannaya Gupta. My teammate is Liew Wei Pyn.


In [44]:
# Two multi-sentence sample documents to run through predict.
text = pd.Series(
    [
        "I am Prannaya Gupta. My teammate is Liew Wei Pyn.",
        "I have an inflamed larynx. We are out of herbal tea. Do you want any?",
    ]
)

In [45]:
# Element-wise predict: the result is an object Series whose values are
# DataFrames, one per input document.
text.apply(predict)

0                                     Claim Conclud...
1                                   Claim Concludin...
dtype: object

In [48]:
# Stack the per-document frames into a single frame indexed by sentence.
# NOTE(review): the recorded output of this cell shows a "sentence" column and an
# integer index, which this expression does not produce — re-run to refresh it.
pd.concat([predict(document) for document in text])

Unnamed: 0,sentence,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal,text
0,I am Prannaya Gupta.,-0.054959,-1.219409,-1.04898,-0.573005,7.462467,-0.159238,-3.358014,I am Prannaya Gupta. My teammate is Liew Wei Pyn.
1,My teammate is Liew Wei Pyn.,4.576728,-3.076888,-0.077138,1.287278,0.04913,-0.767002,-1.636742,I am Prannaya Gupta. My teammate is Liew Wei Pyn.
0,I have an inflamed larynx.,6.714224,-2.542686,-1.284292,1.195773,-0.788462,-1.276066,-1.976083,I have an inflamed larynx. We are out of herba...
1,We are out of herbal tea.,4.061336,-2.019389,-2.027314,2.581639,-0.58224,-1.491735,-0.986001,I have an inflamed larynx. We are out of herba...
2,Do you want any?,-0.009475,-0.878833,-1.584011,-0.448498,7.578365,-0.599353,-3.289468,I have an inflamed larynx. We are out of herba...


In [52]:
# Source table: one row per document, with the URL each document is attributed to.
ogdf = pd.DataFrame(
    [
        ("I am Prannaya Gupta. My teammate is Liew Wei Pyn.", "https://nytimes.com"),
        (
            "I have an inflamed larynx. We are out of herbal tea. Do you want any?",
            "https://straitstimes.com",
        ),
    ],
    columns=["text", "url"],
)
ogdf

Unnamed: 0,text,url
0,I am Prannaya Gupta. My teammate is Liew Wei Pyn.,https://nytimes.com
1,I have an inflamed larynx. We are out of herba...,https://straitstimes.com


In [53]:
# Per-sentence logits for every document, joined back to the source table.
# rename_axis names the sentence index before reset_index moves it into a column,
# so the result does not depend on the default "index" column name.
# The join key is explicit: without on="text", merge would join on every column
# the two frames share.
pd.concat(
    [predict(document).rename_axis("sentence").reset_index() for document in text]
).merge(ogdf, on="text")

Unnamed: 0,sentence,Claim,Concluding Statement,Counterclaim,Evidence,Lead,Position,Rebuttal,text,url
0,I am Prannaya Gupta.,-0.054959,-1.219409,-1.04898,-0.573005,7.462467,-0.159238,-3.358014,I am Prannaya Gupta. My teammate is Liew Wei Pyn.,https://nytimes.com
1,My teammate is Liew Wei Pyn.,4.576728,-3.076888,-0.077138,1.287278,0.04913,-0.767002,-1.636742,I am Prannaya Gupta. My teammate is Liew Wei Pyn.,https://nytimes.com
2,I have an inflamed larynx.,6.714224,-2.542686,-1.284292,1.195773,-0.788462,-1.276066,-1.976083,I have an inflamed larynx. We are out of herba...,https://straitstimes.com
3,We are out of herbal tea.,4.061336,-2.019389,-2.027314,2.581639,-0.58224,-1.491735,-0.986001,I have an inflamed larynx. We are out of herba...,https://straitstimes.com
4,Do you want any?,-0.009475,-0.878833,-1.584011,-0.448498,7.578365,-0.599353,-3.289468,I have an inflamed larynx. We are out of herba...,https://straitstimes.com
