In [73]:
import os
import xml.etree.ElementTree as ET

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

# Remove the column-width limit so long cell values are displayed in full.
pd.set_option('display.max_colwidth', None)

In [82]:
def tokenize_and_align_labels(
    examples,
    sentence1_key,
    sentence2_key,
    paraphrase_type_id2cls_id,
    tokenizer,
):
    """Tokenize a batch of pre-split sentences or sentence pairs.

    Args:
        examples: Batch mapping column names to lists of examples. It must
            contain the ``<sentence1_key>_tokenized`` column and, when
            ``sentence2_key`` is given, ``<sentence2_key>_tokenized``.
        sentence1_key: Base column name of the first sentence; the
            ``"_tokenized"`` suffix is appended inside this function.
        sentence2_key: Base column name of the second sentence, or ``None``
            to tokenize single sentences only.
        paraphrase_type_id2cls_id: Not used by this function; kept so that
            existing callers passing it keep working.
        tokenizer: Callable invoked with the word lists plus
            ``truncation=True``, ``is_split_into_words=True`` and
            ``return_offsets_mapping=True``.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    columns = [sentence1_key + "_tokenized"]
    # Append the suffix only when a second sentence is requested. Previously the
    # suffix was concatenated first, so ``sentence2_key=None`` raised a TypeError
    # and the single-sentence branch could never be reached.
    if sentence2_key is not None:
        columns.append(sentence2_key + "_tokenized")

    args = tuple(examples[column] for column in columns)
    return tokenizer(*args, truncation=True, is_split_into_words=True, return_offsets_mapping=True)



def create_label_maps(etpc):
    """Build the label/id lookup tables for ETPC paraphrase types.

    The official type names, ids and categories are downloaded from the ETPC
    repository (``paraphrase_types.xml``); contiguous classifier ids are
    assigned to the type names that actually occur in ``etpc``.

    Args:
        etpc: Object whose ``etpc["paraphrase_types"]`` yields one iterable of
            type names per example.

    Returns:
        A 7-tuple ``(label2cls_id, cls_id2label, paraphrase_type2cls_id,
        paraphrase_id2cls_type, paraphrase_type_to_category,
        cls_id2paraphrase_type_id, paraphrase_type_id2cls_id)``. Every map
        contains an extra ``"no_paraphrase"`` / ``0`` entry.

    Raises:
        requests.HTTPError: If the XML download returns an error status.
        KeyError: If a type name in ``etpc`` is missing from the XML.
    """
    # Sort the distinct type names: enumerating a bare set of strings depends
    # on hash randomization, so class ids would differ from one kernel to the next.
    all_types = sorted({el for sublist in etpc["paraphrase_types"] for el in sublist})

    # Download xml with paraphrase types to ids from url https://github.com/venelink/ETPC/blob/master/Corpus/paraphrase_types.xml
    url = "https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml"
    # Bound the wait and fail loudly on HTTP errors instead of parsing an error page.
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    root = ET.fromstring(r.text)

    # Get paraphrase types, ids and categories
    paraphrase_types = [child.find("type_name").text for child in root]
    paraphrase_type_ids = [int(child.find("type_id").text) for child in root]
    paraphrase_type_categories = [child.find("type_category").text for child in root]

    # Official type name <-> official type id
    paraphrase_type2cls_id = dict(zip(paraphrase_types, paraphrase_type_ids))
    paraphrase_id2cls_type = dict(zip(paraphrase_type_ids, paraphrase_types))

    # Official type name -> category
    paraphrase_type_to_category = dict(
        zip(paraphrase_types, paraphrase_type_categories)
    )

    # Add 0 for no paraphrase to all dictionaries
    paraphrase_type2cls_id["no_paraphrase"] = 0
    paraphrase_id2cls_type[0] = "no_paraphrase"
    paraphrase_type_to_category["no_paraphrase"] = "no_paraphrase"

    # Surface dataset labels that the XML does not define with a readable message.
    unknown = [label for label in all_types if label not in paraphrase_type2cls_id]
    if unknown:
        raise KeyError(f"Paraphrase types missing from the ETPC XML: {unknown}")

    # Contiguous classifier ids (1..N) for the types present in the data
    label2cls_id = {label: i + 1 for i, label in enumerate(all_types)}
    cls_id2label = {i: label for label, i in label2cls_id.items()}

    # Add 0 for no paraphrase to all dictionaries
    label2cls_id["no_paraphrase"] = 0
    cls_id2label[0] = "no_paraphrase"

    # Translate between classifier ids and official type ids via the type name.
    # (These two maps used to be built twice with identical code.)
    cls_id2paraphrase_type_id = {
        cls_id: paraphrase_type2cls_id[label] for cls_id, label in cls_id2label.items()
    }
    paraphrase_type_id2cls_id = {
        type_id: cls_id for cls_id, type_id in cls_id2paraphrase_type_id.items()
    }

    return (
        label2cls_id,
        cls_id2label,
        paraphrase_type2cls_id,
        paraphrase_id2cls_type,
        paraphrase_type_to_category,
        cls_id2paraphrase_type_id,
        paraphrase_type_id2cls_id,
    )

In [75]:
# Load the "jpwahle/etpc" dataset through the `datasets` library.
dataset = load_dataset("jpwahle/etpc")

In [63]:
# Convert the "train" split to a pandas DataFrame for inspection.
df = dataset['train'].to_pandas()

In [64]:
# Preview the first rows of the training DataFrame.
df.head()

Unnamed: 0,idx,sentence1,sentence2,sentence1_tokenized,sentence2_tokenized,etpc_label,mrpc_label,negation,paraphrase_types,paraphrase_type_ids,sentence1_segment_location,sentence2_segment_location,sentence1_segment_location_indices,sentence2_segment_location_indices,sentence1_segment_text,sentence2_segment_text
0,1_0,"Amrozi accused his brother, whom he called ""the witness"", of deliberately distorting his evidence.","Referring to him as only ""the witness"", Amrozi accused his brother of deliberately distorting his evidence.","[Amrozi, accused, his, brother, ,, whom, he, called, ``, the, witness, '', ,, of, deliberately, distorting, his, evidence, .]","[Referring, to, him, as, only, ``, the, witness, '', ,, Amrozi, accused, his, brother, of, deliberately, distorting, his, evidence, .\n]",1,1,0,"[Same Polarity Substitution (habitual), Same Polarity Substitution (contextual), Change of order, Addition/Deletion, Identity]","[5, 6, 26, 25, 29]","[26, 26, 26, 26, 0, 5, 0, 6, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25]","[6, 5, 5, 0, 25, 0, 0, 0, 0, 0, 26, 26, 26, 26, 0, 0, 0, 0, 0, 0]","[[5], [7], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]]","[[1, 2], [0], [10, 11, 12, 13], [4]]","[whom, called, Amrozi accused his brother, `` the witness '' , of deliberately distorting his evidence .]","[to him, Referring, Amrozi accused his brother, only, `` the witness '' , of deliberately distorting his evidence .\n]"
1,2_1,Yucaipa owned Dominick's before selling the chain to Safeway in 1998 for $2.5 billion.,Yucaipa bought Dominick's in 1995 for $693 million and sold it to Safeway for $1.8 billion in 1998.,"[Yucaipa, owned, Dominick, 's, before, selling, the, chain, to, Safeway, in, 1998, for, $, 2.5, billion, .]","[Yucaipa, bought, Dominick, 's, in, 1995, for, $, 693, million, and, sold, it, to, Safeway, for, $, 1.8, billion, in, 1998, .\n]",0,0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
2,3_2,"They had published an advertisement on the Internet on June 10, offering the cargo for sale, he added.","On June 10, the ship's owners had published an advertisement on the Internet, offering the explosives for sale.","[They, had, published, an, advertisement, on, the, Internet, on, June, 10, ,, offering, the, cargo, for, sale, ,, he, added, .]","[On, June, 10, ,, the, ship, 's, owners, had, published, an, advertisement, on, the, Internet, ,, offering, the, explosives, for, sale, .\n]",1,1,0,"[Same Polarity Substitution (contextual), Same Polarity Substitution (contextual), Change of order, Addition/Deletion, Identity]","[6, 6, 26, 25, 29]","[6, 0, 0, 0, 0, 0, 0, 0, 26, 26, 26, 0, 0, 0, 6, 0, 0, 25, 25, 25, 0]","[26, 26, 26, 26, 6, 6, 6, 6, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 6, 25, 25, 25]","[[0], [14], [8, 9, 10], [17, 18, 19]]","[[4, 5, 6, 7], [18], [0, 1, 2, 3], [8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21]]","[They, cargo, on June 10, , he added, had published an advertisement on the Internet , offering the for sale .]","[the ship 's owners, explosives, On June 10 ,, had published an advertisement on the Internet , offering the for sale .\n]"
3,4_3,"Around 0335 GMT, Tab shares were up 19 cents, or 4.4%, at A$4.56, having earlier set a record high of A$4.57.","Tab shares jumped 20 cents, or 4.6%, to set a record closing high at A$4.57.","[Around, 0335, GMT, ,, Tab, shares, were, up, 19, cents, ,, or, 4.4, %, ,, at, A, $, 4.56, ,, having, earlier, set, a, record, high, of, A, $, 4.57, .]","[Tab, shares, jumped, 20, cents, ,, or, 4.6, %, ,, to, set, a, record, closing, high, at, A, $, 4.57, .\n]",0,0,0,[],[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",[],[],[],[]
4,5_4,"The stock rose $2.11, or about 11 percent, to close Friday at $21.51 on the New York Stock Exchange.",PG&E Corp. shares jumped $1.63 or 8 percent to $21.03 on the New York Stock Exchange on Friday.,"[The, stock, rose, $, 2.11, ,, or, about, 11, percent, ,, to, close, Friday, at, $, 21.51, on, the, New, York, Stock, Exchange, .]","[PG, &, E, Corp., shares, jumped, $, 1.63, or, 8, percent, to, $, 21.03, on, the, New, York, Stock, Exchange, on, Friday, .\n]",0,1,0,"[Same Polarity Substitution (contextual), Same Polarity Substitution (habitual), Same Polarity Substitution (contextual), Synthetic/analytic substitution, Change of order, Addition/Deletion, Identity, Non-paraphrase, Non-paraphrase, Non-paraphrase, Punctuation changes]","[6, 5, 6, 11, 26, 25, 29, 30, 30, 30, 21]","[6, 6, 5, 29, 30, 0, 29, 25, 30, 29, 0, 6, 6, 26, 6, 29, 30, 29, 29, 29, 29, 29, 29, 29]","[30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30]","[[0, 1], [2], [11, 12, 14], [13], [13], [7], [3, 6, 9, 15, 17, 18, 19, 20, 21, 22, 23], [4], [8], [16]]","[[0, 1, 2, 3, 4], [5], [11], [20, 21], [20, 21], [6, 8, 10, 12, 14, 15, 16, 17, 18, 19, 22], [7], [9], [13], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22]]","[The stock, rose, to close at, Friday, Friday, about, $ or percent $ on the New York Stock Exchange ., 2.11, 11, 21.51, The stock rose $ 2.11 , or about 11 percent , to close Friday at $ 21.51 on the New York Stock Exchange .]","[PG & E Corp. shares, jumped, to, on Friday, on Friday, $ or percent $ on the New York Stock Exchange .\n, 1.63, 8, 21.03, PG & E Corp. shares jumped $ 1.63 or 8 percent to $ 21.03 on the New York Stock Exchange on Friday .\n]"


In [59]:
# Inspect the first training example: dump the whole row, then compare the
# token count of sentence 1 with the number of annotated segment indices.
row = df.iloc[[0]]
print(row.to_dict())

first_sentence_tokens = row['sentence1_tokenized'].values[0]
sentence1_segment_location = row['sentence1_segment_location'].values[0].flatten()
segment_index_groups = row['sentence1_segment_location_indices'].values[0].flatten()

print('sentence1_tokenized')
print(first_sentence_tokens)
print(len(first_sentence_tokens))
print()
print('sentence1_segment_location_indices')
# Merge the per-segment index arrays into one flat array before counting.
sentence1_segment_location_indices = np.concatenate(segment_index_groups).flatten()

print(len(sentence1_segment_location_indices))

{'idx': {0: '1_0'}, 'sentence1': {0: 'Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.'}, 'sentence2': {0: 'Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.'}, 'sentence1_tokenized': {0: array(['Amrozi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called',
       '``', 'the', 'witness', "''", ',', 'of', 'deliberately',
       'distorting', 'his', 'evidence', '.'], dtype=object)}, 'sentence2_tokenized': {0: array(['Referring', 'to', 'him', 'as', 'only', '``', 'the', 'witness',
       "''", ',', 'Amrozi', 'accused', 'his', 'brother', 'of',
       'deliberately', 'distorting', 'his', 'evidence', '.\n'],
      dtype=object)}, 'etpc_label': {0: 1}, 'mrpc_label': {0: 1}, 'negation': {0: 0}, 'paraphrase_types': {0: array(['Same Polarity Substitution (habitual)',
       'Same Polarity Substitution (contextual)', 'Change of order',
       'Addition/Deletion', 'Identity'], dtype=obje

In [83]:
# Constants

sentence1_key = "sentence1"
sentence2_key = "sentence2"
# `load_dataset` returned a dict of splits; narrow it to the training split.
# The guard keeps this cell idempotent: once `dataset` is already the split,
# re-running `dataset["train"]` would look for a "train" column and fail.
if isinstance(dataset, dict):
    dataset = dataset["train"]

In [84]:
def load_model(model_path):
    """Load the tokenizer and the model stored at ``model_path``.

    Args:
        model_path: Path or identifier passed to both ``from_pretrained`` calls.

    Returns:
        ``(model, tokenizer)``, in that order.
    """
    # Tokenizer first, then the model, so a bad path fails on the same call as before.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return AutoModel.from_pretrained(model_path), tokenizer

In [85]:
# Build every label/id lookup table from the dataset, then unpack the result
# into the individual names used by the following cells.
label_maps = create_label_maps(dataset)
(
    label2cls_id,
    cls_id2label,
    paraphrase_type2cls_id,
    paraphrase_id2cls_type,
    paraphrase_type_to_category,
    cls_id2paraphrase_type_id,
    paraphrase_type_id2cls_id,
) = label_maps

In [86]:
# Checkpoint location. Override it with the ETPC_MODEL_PATH environment
# variable; the default is the original author's local path and will not
# exist on other machines.
MODEL_PATH = os.environ.get(
    "ETPC_MODEL_PATH",
    '/Users/yasir/github/paraphrase-types/out/cls-models/bert-large-uncased-jpwahle/etpc-paraphrase-detection/checkpoint-3045',
)
model, tokenizer = load_model(MODEL_PATH)

# Tokenize the whole split in batches; the extra arguments of
# `tokenize_and_align_labels` are supplied through `fn_kwargs`.
dataset_tokenized = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    fn_kwargs={
        "sentence1_key": sentence1_key,
        "sentence2_key": sentence2_key,
        "tokenizer": tokenizer,
        "paraphrase_type_id2cls_id": paraphrase_type_id2cls_id,
    },
)

Map: 100%|██████████| 5801/5801 [00:01<00:00, 4595.50 examples/s]


In [87]:
def encode(input_ids, attention_mask, token_type_ids, model):
    """Run ``model`` on one encoded batch without tracking gradients.

    The model is switched to evaluation mode as a side effect.

    Args:
        input_ids: Passed to the model as ``input_ids``.
        attention_mask: Passed to the model as ``attention_mask``.
        token_type_ids: Passed to the model as ``token_type_ids``.
        model: Callable module exposing ``eval()``.

    Returns:
        The value returned by ``model``.
    """
    model.eval()
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }
    with torch.no_grad():
        return model(**model_inputs)

# print(dataset_tokenized['offset_mapping'][0])
# attention_mask = torch.tensor([dataset_tokenized['attention_mask'][0]])
# input_ids = torch.tensor([dataset_tokenized['input_ids'][0]])
# token_type_ids = torch.tensor([dataset_tokenized['token_type_ids'][0]])

# print(attention_mask.shape, input_ids.shape, token_type_ids.shape)
# outputs = encode(input_ids, attention_mask, token_type_ids, model)


[[0, 0], [0, 2], [2, 4], [4, 6], [0, 7], [0, 3], [0, 7], [0, 1], [0, 4], [0, 2], [0, 6], [0, 1], [1, 2], [0, 3], [0, 7], [0, 1], [1, 2], [0, 1], [0, 2], [0, 12], [0, 2], [2, 6], [6, 10], [0, 3], [0, 8], [0, 1], [0, 0], [0, 9], [0, 2], [0, 3], [0, 2], [0, 4], [0, 1], [1, 2], [0, 3], [0, 7], [0, 1], [1, 2], [0, 1], [0, 2], [2, 4], [4, 6], [0, 7], [0, 3], [0, 7], [0, 2], [0, 12], [0, 2], [2, 6], [6, 10], [0, 3], [0, 8], [0, 1], [0, 0]]
torch.Size([1, 54]) torch.Size([1, 54]) torch.Size([1, 54])


In [88]:
# Print the tokenized dataset to check its summary.
print(dataset_tokenized)

Dataset({
    features: ['idx', 'sentence1', 'sentence2', 'sentence1_tokenized', 'sentence2_tokenized', 'etpc_label', 'mrpc_label', 'negation', 'paraphrase_types', 'paraphrase_type_ids', 'sentence1_segment_location', 'sentence2_segment_location', 'sentence1_segment_location_indices', 'sentence2_segment_location_indices', 'sentence1_segment_text', 'sentence2_segment_text', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping'],
    num_rows: 5801
})


In [89]:
# Fetch row 0 once. Indexing a column first (`dataset_tokenized[col][0]`) may
# materialize that entire column just to read a single element.
first_example = dataset_tokenized[0]
sentence1_tokenized = first_example['sentence1_tokenized']
sentence2_tokenized = first_example['sentence2_tokenized']
offsets = first_example['offset_mapping']
input_ids = first_example['input_ids']
attention_mask = first_example['attention_mask']
texts = [first_example['sentence1'], first_example['sentence2']]

In [12]:
# Locate the sub-token span of a phrase inside the encoded sentence pair.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
target_word = "Amrozi accused his brother"
target_word_tokens = tokenizer.tokenize(target_word)

# Reset before searching so a miss cannot silently reuse indices left in the
# kernel by an earlier run (or raise a NameError on a fresh kernel).
word_start_index = None
word_end_index = None

for i in range(len(tokens)):
    if tokens[i:i+len(target_word_tokens)] == target_word_tokens:
        word_start_index = i
        word_end_index = i + len(target_word_tokens) - 1
        break

if word_start_index is None:
    raise ValueError(f"Word '{target_word}' not found in the tokenized sentence.")

print(word_start_index, word_end_index)
print(f"Tokens: {tokens}")



1 6
Tokens: ['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '`', '`', 'the', 'witness', "'", "'", ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '`', '`', 'the', 'witness', "'", "'", ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']


## `sentence2_segment_location`: the paraphrase type assigned to each token of sentence 2
## `sentence1_segment_location_indices`, `sentence1_segment_text`, `sentence2_segment_location_indices` and `sentence2_segment_text` are not needed for this analysis

In [117]:
import torch
from transformers import BatchEncoding

# Read the first example with a single row lookup rather than depending on
# `row` left behind by an earlier exploratory cell.
example = dataset_tokenized[0]
sentence1 = example["sentence1"]
print(sentence1)

offsets = example['offset_mapping']
# Wrap each field in a list to add the leading batch dimension.
input_ids = torch.tensor([example['input_ids']])
attention_mask = torch.tensor([example['attention_mask']])
token_type_ids = torch.tensor([example['token_type_ids']])

encoded_input_manual = BatchEncoding({
    'input_ids': input_ids,
    'token_type_ids': token_type_ids,
    'attention_mask': attention_mask,
})

# Encoding of sentence 1 alone, printed next to the manual pair encoding to
# confirm both are the same container type.
encoded_input = tokenizer(sentence1, return_tensors='pt')
print(type(encoded_input))
print(type(encoded_input_manual))

# Reuse the shared helper (eval mode + no_grad) instead of repeating it inline.
outputs = encode(input_ids, attention_mask, token_type_ids, model)

# The last hidden state of the model
last_hidden_states = outputs.last_hidden_state

# Reverse tokenization to get the tokens
tokens = tokenizer.convert_ids_to_tokens(encoded_input_manual['input_ids'][0])

# Sub-tokens of the word whose embedding we want
target_word = "Amrozi"
target_word_tokens = tokenizer.tokenize(target_word)

# Start and end index (inclusive) of the first occurrence of the target sub-tokens
word_start_index = None
word_end_index = None

for i in range(len(tokens)):
    if tokens[i:i+len(target_word_tokens)] == target_word_tokens:
        word_start_index = i
        word_end_index = i + len(target_word_tokens) - 1
        break

if word_start_index is None:
    raise ValueError(f"Word '{target_word}' not found in the tokenized sentence.")

print(word_start_index, word_end_index)
# Hidden states of the target word's sub-tokens
target_embeddings = last_hidden_states[0, word_start_index:word_end_index+1, :]

# Average the sub-token embeddings to get one vector for the target word
target_embedding = target_embeddings.mean(dim=0)
# Alias kept because a later cell still reads `love_embedding`.
love_embedding = target_embedding

print(f"Tokens: {tokens}")
print(f"Embedding for '{target_word}': {target_embedding}")


Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.
<class 'transformers.tokenization_utils_base.BatchEncoding'>
<class 'transformers.tokenization_utils_base.BatchEncoding'>
1 3
Tokens: ['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '`', '`', 'the', 'witness', "'", "'", ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '`', '`', 'the', 'witness', "'", "'", ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']
Embedding for 'Amrozi': tensor([ 0.0899, -0.8350, -0.2770,  ...,  0.2921,  0.8338,  0.4639])


In [101]:
from transformers import BatchEncoding
# Hand-built BatchEncoding from literal tensors (one sequence of 25 ids);
# as the cell's last expression it is displayed as the cell output.
BatchEncoding({
    'input_ids': torch.tensor([[101, 2572, 3217, 5831, 5496, 2010, 2567, 1010, 3183, 2002, 2170, 1000, 1996, 7409, 1000, 1010, 1997, 9969, 4487, 23809, 3436, 2010, 3350, 1012, 102]]),
    'token_type_ids': torch.tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
    'attention_mask': torch.tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
})


{'input_ids': tensor([[  101,  2572,  3217,  5831,  5496,  2010,  2567,  1010,  3183,  2002,
          2170,  1000,  1996,  7409,  1000,  1010,  1997,  9969,  4487, 23809,
          3436,  2010,  3350,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}

In [9]:
# Shape of the averaged word embedding computed earlier.
love_embedding.shape

torch.Size([1024])

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
target_word = "Yasir"
target_word_tokens = tokenizer.tokenize(target_word)