In [3]:
import numpy as np
from datasets import load_dataset
from scipy.spatial import distance
import seaborn as sns
import pandas as pd
from transformers import AutoModel, AutoTokenizer
import torch
import requests
import xml.etree.ElementTree as ET

# Remove the column-width limit so long sentence strings are displayed in full.
pd.set_option('display.max_colwidth', None)

In [11]:
def tokenize_and_align_labels(
    examples,
    sentence1_key,
    sentence2_key,
    paraphrase_type_id2cls_id,
    tokenizer,
):
    """Tokenize pre-split sentences (or sentence pairs) from a batch of examples.

    The word lists are read from the ``<key>_tokenized`` columns of ``examples``
    and passed to ``tokenizer`` with ``is_split_into_words=True``.

    Args:
        examples: Mapping from column name to a batch of values.
        sentence1_key: Base name of the first sentence column (without the
            ``_tokenized`` suffix).
        sentence2_key: Base name of the second sentence column, or ``None`` to
            tokenize single sentences only.
        paraphrase_type_id2cls_id: Currently unused; kept so existing callers
            that pass it keep working.
        tokenizer: Callable accepting one or two batches of word lists plus the
            ``truncation``, ``is_split_into_words`` and
            ``return_offsets_mapping`` keyword arguments.

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    # Decide between single-sentence and pair mode *before* building column
    # names: appending the suffix first would turn None into a TypeError and
    # make the single-sentence branch unreachable.
    first_column = f"{sentence1_key}_tokenized"
    if sentence2_key is None:
        word_batches = (examples[first_column],)
    else:
        second_column = f"{sentence2_key}_tokenized"
        word_batches = (examples[first_column], examples[second_column])

    return tokenizer(
        *word_batches,
        truncation=True,
        is_split_into_words=True,
        return_offsets_mapping=True,
    )



def create_label_maps(etpc):
    """Build the lookup tables between paraphrase-type names, ETPC ids and class ids.

    Two id spaces are involved:

    * *paraphrase type ids* come from the ``paraphrase_types.xml`` file of the
      ETPC repository, which is downloaded on every call;
    * *class ids* are assigned here: ``0`` is reserved for ``"no_paraphrase"``
      and the types found in ``etpc["paraphrase_types"]`` are numbered from 1
      in sorted order.

    Args:
        etpc: Object whose ``"paraphrase_types"`` entry is an iterable of
            iterables of type names.

    Returns:
        A 7-tuple ``(label2cls_id, cls_id2label, paraphrase_type2cls_id,
        paraphrase_id2cls_type, paraphrase_type_to_category,
        cls_id2paraphrase_type_id, paraphrase_type_id2cls_id)``.

    Raises:
        requests.HTTPError: If the XML download returns an error status.
        KeyError: If a type present in ``etpc`` is missing from the XML file.
    """
    # Sort the distinct types: iterating a set of strings directly yields an
    # order that can differ between interpreter runs, which would make the
    # class ids irreproducible.
    all_types = sorted({el for sublist in etpc["paraphrase_types"] for el in sublist})

    # Download xml with paraphrase types to ids from url https://github.com/venelink/ETPC/blob/master/Corpus/paraphrase_types.xml
    url = "https://raw.githubusercontent.com/venelink/ETPC/master/Corpus/paraphrase_types.xml"
    r = requests.get(url, timeout=30)  # never hang the notebook on a stalled connection
    r.raise_for_status()  # fail here rather than while parsing an error page as XML
    root = ET.fromstring(r.text)

    # One pass over the XML entries fills all three name/id/category tables.
    paraphrase_type2cls_id = {}
    paraphrase_id2cls_type = {}
    paraphrase_type_to_category = {}
    for child in root:
        type_name = child.find("type_name").text
        type_id = int(child.find("type_id").text)
        paraphrase_type2cls_id[type_name] = type_id
        paraphrase_id2cls_type[type_id] = type_name
        paraphrase_type_to_category[type_name] = child.find("type_category").text

    # Add 0 for no paraphrase to all dictionaries
    paraphrase_type2cls_id["no_paraphrase"] = 0
    paraphrase_id2cls_type[0] = "no_paraphrase"
    paraphrase_type_to_category["no_paraphrase"] = "no_paraphrase"

    # Class ids for the types that actually occur in the dataset (0 is reserved).
    label2cls_id = {label: i for i, label in enumerate(all_types, start=1)}
    cls_id2label = {i: label for label, i in label2cls_id.items()}
    label2cls_id["no_paraphrase"] = 0
    cls_id2label[0] = "no_paraphrase"

    # Translate between the two id spaces through the shared type name.
    cls_id2paraphrase_type_id = {
        cls_id: paraphrase_type2cls_id[label] for cls_id, label in cls_id2label.items()
    }
    paraphrase_type_id2cls_id = {
        type_id: cls_id for cls_id, type_id in cls_id2paraphrase_type_id.items()
    }

    return (
        label2cls_id,
        cls_id2label,
        paraphrase_type2cls_id,
        paraphrase_id2cls_type,
        paraphrase_type_to_category,
        cls_id2paraphrase_type_id,
        paraphrase_type_id2cls_id,
    )

In [12]:
# Load the ETPC corpus; the returned object is indexed by split name further down.
dataset = load_dataset("jpwahle/etpc")

In [13]:
# Constants

sentence1_key = "sentence1"
sentence2_key = "sentence2"

# Select the train split only while `dataset` is still the split mapping.
# The name is rebound to its own result, so without this guard a second run of
# the cell would try to index the already-selected split with "train" and fail.
if isinstance(dataset, dict):
    dataset = dataset["train"]

In [14]:
def load_model(model_path):
    """Load a pretrained model together with its tokenizer.

    Args:
        model_path: Location handed unchanged to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple; note that the model comes first.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    return AutoModel.from_pretrained(model_path), tokenizer

In [15]:
# Build all label/id lookup tables from the train split. The unpacking order
# must mirror the tuple returned by create_label_maps.
(
    label2cls_id,
    cls_id2label,
    paraphrase_type2cls_id,
    paraphrase_id2cls_type,
    paraphrase_type_to_category,
    cls_id2paraphrase_type_id,
    paraphrase_type_id2cls_id,) = create_label_maps(dataset)

In [16]:
# NOTE(review): absolute, machine-specific checkpoint path; this cell only runs
# where that directory exists.
model, tokenizer = load_model('/Users/yasir/github/paraphrase-types/out/cls-models/bert-large-uncased-jpwahle/etpc-paraphrase-detection/checkpoint-3045')

# Tokenize the whole split in batches; the extra arguments of
# tokenize_and_align_labels are supplied through fn_kwargs.
dataset_tokenized = dataset.map(
    tokenize_and_align_labels,
    fn_kwargs=dict(
        sentence1_key=sentence1_key,
        sentence2_key=sentence2_key,
        tokenizer=tokenizer,
        paraphrase_type_id2cls_id=paraphrase_type_id2cls_id,
    ),
    batched=True,
)

Map: 100%|██████████| 5801/5801 [00:01<00:00, 4266.21 examples/s]


In [18]:
def encode(input_ids, attention_mask, token_type_ids, model):
    """Run a single gradient-free forward pass of ``model``.

    The model is switched to evaluation mode before it is called.

    Args:
        input_ids: Forwarded to the model as ``input_ids``.
        attention_mask: Forwarded to the model as ``attention_mask``.
        token_type_ids: Forwarded to the model as ``token_type_ids``.
        model: Callable model exposing ``eval()``.

    Returns:
        The model's output object, unchanged.
    """
    model.eval()
    model_inputs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "token_type_ids": token_type_ids,
    }
    with torch.no_grad():
        return model(**model_inputs)
# Read the first row once. Row-first indexing fetches a single example, whereas
# dataset_tokenized['column'][0] materialises the whole column before taking
# element 0 (and did so three separate times here).
first_example = dataset_tokenized[0]
print(first_example['offset_mapping'])

# Wrap each feature in a list so the tensors get a leading batch dimension of 1.
attention_mask = torch.tensor([first_example['attention_mask']])
input_ids = torch.tensor([first_example['input_ids']])
token_type_ids = torch.tensor([first_example['token_type_ids']])

print(attention_mask.shape, input_ids.shape, token_type_ids.shape)
outputs = encode(input_ids, attention_mask, token_type_ids, model)


[[0, 0], [0, 2], [2, 4], [4, 6], [0, 7], [0, 3], [0, 7], [0, 1], [0, 4], [0, 2], [0, 6], [0, 1], [1, 2], [0, 3], [0, 7], [0, 1], [1, 2], [0, 1], [0, 2], [0, 12], [0, 2], [2, 6], [6, 10], [0, 3], [0, 8], [0, 1], [0, 0], [0, 9], [0, 2], [0, 3], [0, 2], [0, 4], [0, 1], [1, 2], [0, 3], [0, 7], [0, 1], [1, 2], [0, 1], [0, 2], [2, 4], [4, 6], [0, 7], [0, 3], [0, 7], [0, 2], [0, 12], [0, 2], [2, 6], [6, 10], [0, 3], [0, 8], [0, 1], [0, 0]]
torch.Size([1, 54]) torch.Size([1, 54]) torch.Size([1, 54])


In [19]:
# Print the summary of the tokenized dataset (column names and row count).
print(dataset_tokenized)

Dataset({
    features: ['idx', 'sentence1', 'sentence2', 'sentence1_tokenized', 'sentence2_tokenized', 'etpc_label', 'mrpc_label', 'negation', 'paraphrase_types', 'paraphrase_type_ids', 'sentence1_segment_location', 'sentence2_segment_location', 'sentence1_segment_location_indices', 'sentence2_segment_location_indices', 'sentence1_segment_text', 'sentence2_segment_text', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels'],
    num_rows: 5801
})


In [17]:
# Fetch the first row once instead of materialising six whole columns just to
# take element 0 of each.
first_row = dataset_tokenized[0]

sentence1_tokenized = first_row['sentence1_tokenized']
sentence2_tokenized = first_row['sentence2_tokenized']
offsets = first_row['offset_mapping']
# NOTE: `input_ids` and `attention_mask` are bound to the raw per-example values
# here (no torch.tensor wrapping, no batch dimension). The cell that calls
# `encode` binds the same names to batched tensors, so which one is live
# depends on cell execution order.
input_ids = first_row['input_ids']
attention_mask = first_row['attention_mask']
texts = [first_row['sentence1'], first_row['sentence2']]

In [25]:
# Map the ids of the current example back to their token strings and tokenize
# the phrase we want to locate with the same tokenizer.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
target_word = "Amrozi accused his brother"
target_word_tokens = tokenizer.tokenize(target_word)

# Reset the span first: without this a failed search would either raise a
# NameError at the print below or silently reuse indices left over from a
# previous run of another cell.
word_start_index = None
word_end_index = None

# Slide a window of the phrase's length over the tokens; keep the first match.
for i in range(len(tokens)):
    if tokens[i:i+len(target_word_tokens)] == target_word_tokens:
        word_start_index = i
        word_end_index = i + len(target_word_tokens) - 1  # inclusive end index
        break

if word_start_index is None:
    raise ValueError(f"Word '{target_word}' not found in the tokenized sentence.")

print(word_start_index, word_end_index)
print(f"Tokens: {tokens}")



1 6
Tokens: ['[CLS]', 'am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '`', '`', 'the', 'witness', "'", "'", ',', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]', 'referring', 'to', 'him', 'as', 'only', '`', '`', 'the', 'witness', "'", "'", ',', 'am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##stor', '##ting', 'his', 'evidence', '.', '[SEP]']


AttributeError: 'list' object has no attribute 'size'

In [22]:
from transformers import BertTokenizer, BertModel
import torch

# Load the model and its tokenizer from the checkpoint directory.
# NOTE(review): absolute, machine-specific path; this cell only runs where that
# directory exists.
model, tokenizer = load_model('/Users/yasir/github/paraphrase-types/out/cls-models/bert-large-uncased-jpwahle/etpc-paraphrase-detection/checkpoint-3045')

# Encode the sentence using the tokenizer
sentence = "My name is Yasir"
encoded_input = tokenizer(sentence, return_tensors='pt')

# Put the model in evaluation mode
model.eval()

# Run the forward pass without tracking gradients
with torch.no_grad():
    outputs = model(**encoded_input)

# The last hidden state of the model
last_hidden_states = outputs.last_hidden_state

# Convert token IDs back to token strings so the target word can be located
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])

# Sub-word tokens of the word whose embedding we want
target_word = "Yasir"
target_word_tokens = tokenizer.tokenize(target_word)

# Start and (inclusive) end index of the target word's tokens within `tokens`
word_start_index = None
word_end_index = None

# Slide a window of the word's token length over the sentence; keep the first match.
for i in range(len(tokens)):
    if tokens[i:i+len(target_word_tokens)] == target_word_tokens:
        word_start_index = i
        word_end_index = i + len(target_word_tokens) - 1
        break

if word_start_index is None:
    raise ValueError(f"Word '{target_word}' not found in the tokenized sentence.")

print(word_start_index, word_end_index)
# Hidden states of the target word's sub-word tokens (first sequence in the batch)
target_embeddings = last_hidden_states[0, word_start_index:word_end_index+1, :]

# Average the sub-word embeddings to get a single vector for the target word
target_embedding = target_embeddings.mean(dim=0)

# Backward-compatible aliases: these were the original (copy-pasted) names and
# a later cell still reads `love_embedding`.
love_embeddings = target_embeddings
love_embedding = target_embedding

print(f"Tokens: {tokens}")
print(f"Embedding for '{target_word}': {target_embedding}")


4 5
Tokens: ['[CLS]', 'my', 'name', 'is', 'ya', '##sir', '[SEP]']
Embedding for 'Yasir': tensor([-0.3394,  0.0585, -0.5342,  ..., -0.0869, -0.2932,  0.1511])


In [9]:
# Shape of the averaged word embedding computed in the cell above.
love_embedding.shape

torch.Size([1024])

In [None]:
tokens = tokenizer.convert_ids_to_tokens(encoded_input['input_ids'][0])
target_word = "Yasir"
target_word_tokens = tokenizer.tokenize(target_word)