In [5]:
from nltk.tokenize import sent_tokenize, TweetTokenizer

# Sample review text; words and punctuation are already separated by single spaces.
text = "This is a great value for the money . We purchased this as a back up computer after our more expensive HP needed to be repaired . This is a great computer . We have n't had any problems with it at all . The body is a bit cheaply made so it will be interesting to see how long it holds up . Overall though , for the money spent it 's a great deal ."

# Tokenize with NLTK's TweetTokenizer, then show the tokens and how many there are.
tweet_tokenizer = TweetTokenizer()
tokens = tweet_tokenizer.tokenize(text)
print(tokens)
print(len(tokens))

['This', 'is', 'a', 'great', 'value', 'for', 'the', 'money', '.', 'We', 'purchased', 'this', 'as', 'a', 'back', 'up', 'computer', 'after', 'our', 'more', 'expensive', 'HP', 'needed', 'to', 'be', 'repaired', '.', 'This', 'is', 'a', 'great', 'computer', '.', 'We', 'have', "n't", 'had', 'any', 'problems', 'with', 'it', 'at', 'all', '.', 'The', 'body', 'is', 'a', 'bit', 'cheaply', 'made', 'so', 'it', 'will', 'be', 'interesting', 'to', 'see', 'how', 'long', 'it', 'holds', 'up', '.', 'Overall', 'though', ',', 'for', 'the', 'money', 'spent', 'it', "'", 's', 'a', 'great', 'deal', '.']
78


In [4]:
def is_whitespace(c):
    """Tell whether the single character ``c`` counts as whitespace.

    Whitespace here means a space, tab, carriage return, newline, or the
    code point U+202F.

    Args:
        c: A one-character string.

    Returns:
        True if ``c`` is one of the characters above, otherwise False.
    """
    return c in (" ", "\t", "\r", "\n") or ord(c) == 0x202F

# Split the paragraph on whitespace while recording, for every character,
# the index of the word it belongs to (-1 before the first word starts).
paragraph_text = text
doc_tokens = []
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
    # Remember whether the previous character ended a word before updating the flag.
    starts_word = prev_is_whitespace
    prev_is_whitespace = is_whitespace(c)
    if not prev_is_whitespace:
        if starts_word:
            # First character after whitespace opens a new word.
            doc_tokens.append(c)
        else:
            # Otherwise keep extending the word in progress.
            doc_tokens[-1] += c
    char_to_word_offset.append(len(doc_tokens) - 1)

print(doc_tokens)
print(len(doc_tokens))

['This', 'is', 'a', 'great', 'value', 'for', 'the', 'money', '.', 'We', 'purchased', 'this', 'as', 'a', 'back', 'up', 'computer', 'after', 'our', 'more', 'expensive', 'HP', 'needed', 'to', 'be', 'repaired', '.', 'This', 'is', 'a', 'great', 'computer', '.', 'We', 'have', "n't", 'had', 'any', 'problems', 'with', 'it', 'at', 'all', '.', 'The', 'body', 'is', 'a', 'bit', 'cheaply', 'made', 'so', 'it', 'will', 'be', 'interesting', 'to', 'see', 'how', 'long', 'it', 'holds', 'up', '.', 'Overall', 'though', ',', 'for', 'the', 'money', 'spent', 'it', "'s", 'a', 'great', 'deal', '.']
77


In [9]:
from transformers import BertModel, BertTokenizer

# Load the pretrained encoder together with its matching tokenizer.
bert = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

sentence = "I like this weather. It is sunny!"

# Encode the sentence as PyTorch tensors, with the special tokens added.
inputs = tokenizer(
    sentence,
    add_special_tokens=True,
    return_tensors="pt",
    padding=True,
    truncation=True,
)
print(inputs)

# Forward pass: show the shape of the per-token and pooled outputs,
# followed by the complete output object.
output = bert(**inputs)
for hidden in (output.last_hidden_state, output.pooler_output):
    print(hidden.shape)
print(output)


{'input_ids': tensor([[  101,  1045,  2066,  2023,  4633,  1012,  2009,  2003, 11559,   999,
           102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
torch.Size([1, 11, 768])
torch.Size([1, 768])
BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.0110,  0.0159,  0.0500,  ..., -0.3678,  0.2542,  0.4237],
         [ 0.3790,  0.0734, -0.1440,  ..., -0.1316,  0.6123,  0.2099],
         [-0.1150,  0.4740,  0.9893,  ...,  0.1482,  0.3360,  0.3301],
         ...,
         [-0.4032,  0.1886,  0.0884,  ...,  0.2425, -0.1337, -0.1080],
         [ 0.0968, -0.4204,  0.0740,  ...,  0.6097,  0.2299, -0.2964],
         [ 0.4490,  0.1954,  0.3692,  ...,  0.3948, -0.3804, -0.4034]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-9.3437e-01, -5.4492e-01, -9.7169e-01,  8.4760e-01,  8.5664e-01,
         -1.9093e-01,  9.0413e-01,  3.5569e-01, -9.3135e-01, -9.9999e-01,
     

In [None]:
import torch
from torch.nn.modules import MultiheadAttention

# Multi-head attention layer: 768-dimensional embeddings split over 8 heads.
a = MultiheadAttention(embed_dim=768, num_heads=8)

# The layer's forward pass needs query, key and value tensors; calling it with
# no arguments raises a TypeError. With the default batch_first=False the
# expected layout is (seq_len, batch, embed_dim). Self-attention is obtained
# by passing the same tensor three times.
hidden_states = torch.rand(11, 1, 768)
attn_output, attn_weights = a(hidden_states, hidden_states, hidden_states)

print(attn_output.shape)   # (seq_len, batch, embed_dim)
print(attn_weights.shape)  # (batch, seq_len, seq_len), averaged over the heads

In [27]:
import torch
import torch.nn.functional as F
from transformers import BertModel, BertTokenizer


class DualEncoder(torch.nn.Module):
    """Score each context token against a pooled question embedding.

    One BERT encoder is shared by the question and the context. The question's
    pooled output and the context's per-token hidden states each go through
    their own linear projection and are L2-normalised along the last
    dimension; the per-token dot product with the question vector is then
    squashed with a sigmoid.

    Args:
        model_name: Name or path handed to ``BertModel.from_pretrained``.
        model_hidden_dim: Input and output width of both linear projections.
        attn_heads: Not used by the current implementation (kept in the
            signature; an attention-based variant was sketched but is not
            active).
    """

    def __init__(self, model_name, model_hidden_dim, attn_heads=8):
        super().__init__()
        self.bert = BertModel.from_pretrained(model_name)
        # Separate square projections for the question and the context side.
        self.q_linear = torch.nn.Linear(model_hidden_dim, model_hidden_dim)
        self.k_linear = torch.nn.Linear(model_hidden_dim, model_hidden_dim)

    @staticmethod
    def _project(layer, features):
        """Apply ``layer`` and L2-normalise the result along the last dimension."""
        return F.normalize(layer(features), dim=-1)

    def forward(self, question, context):
        """Return a ``(batch, context_len)`` tensor of sigmoid scores.

        Args:
            question: Keyword arguments for the BERT forward pass (question side).
            context: Keyword arguments for the BERT forward pass (context side).
        """
        question_vec = self._project(self.q_linear, self.bert(**question).pooler_output)
        token_vecs = self._project(self.k_linear, self.bert(**context).last_hidden_state)

        # Batched dot product of every context token vector with its question vector.
        similarity = torch.einsum('bij,bj->bi', token_vecs, question_vec)
        return F.sigmoid(similarity)
    

# Two review texts used as contexts; each is scored against the question
# at the same index below.
sentence = [
    "This is a great value for the money . We purchased this as a back up computer after our more expensive HP needed to be repaired . This is a great computer . We have n't had any problems with it at all . The body is a bit cheaply made so it will be interesting to see how long it holds up . Overall though , for the money spent it 's a great deal .",
    "Right out of the box , this little netbook did everything I asked of it , including streaming the everyday video you 're bound to encounter checking mail and websites ( my biggest complaint previously ) . It even has a great webcam , and Skype works very well . The fact that you can spend over $ 100 on just a webcam underscores the value of this machine . The Windows 7 Starter is , in my opinion , a great way to think about using your netbook : basics , basics , basics . I wiped nearly everything off of it , installed OpenOffice and Firefox , and I am operating an incredibly efficient and useful machine for a great price . This netbook is a perfect supplementary computer to another laptop or desktop ( my wife and I have another laptop ) , or if you are a user who uses the computer for simple tasks . I use this for my tutoring business , and since I 'm always bouncing from student to student , it is ideal for portability and battery life ( yes , it gets the 8 hours as advertised ! ) . Finally , I should note that I took the 2GB RAM stick from my old EeePC and installed it before I even powered on for the first time . ASUS has done an outstanding job of evolving their netbooks , and I would recommend this to anyone who both understands their needs and how netbooks can fit them .",
]

question = ["how is the value ?", "do you like its webcam ?"]

model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

# Questions and contexts are tokenized with identical settings.
tokenize_kwargs = dict(return_tensors="pt", add_special_tokens=True, padding=True, truncation=True)
q_t = tokenizer(question, **tokenize_kwargs)
c_t = tokenizer(sentence, **tokenize_kwargs)

print(q_t)

# One sigmoid score per context token, for each (question, context) pair.
dual_encoder = DualEncoder(model_name, 768)
answer = dual_encoder(q_t, c_t)
print(answer)
print(answer.shape)



{'input_ids': tensor([[  101,  2129,  2003,  1996,  3643,  1029,   102,     0,     0],
        [  101,  2079,  2017,  2066,  2049,  4773, 28727,  1029,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[0.4950, 0.5052, 0.5062, 0.5072, 0.5011, 0.4976, 0.5045, 0.5012, 0.5056,
         0.4956, 0.4978, 0.5083, 0.5086, 0.5074, 0.5092, 0.5047, 0.5090, 0.5165,
         0.4962, 0.4971, 0.4996, 0.5025, 0.5165, 0.5003, 0.5047, 0.4988, 0.5083,
         0.4947, 0.5025, 0.5039, 0.5055, 0.4976, 0.5125, 0.5240, 0.4954, 0.4941,
         0.5004, 0.4888, 0.4940, 0.4903, 0.4841, 0.5006, 0.5014, 0.5039, 0.4866,
         0.4895, 0.4901, 0.5056, 0.5078, 0.5067, 0.5084, 0.5139, 0.5137, 0.5177,
         0.5220, 0.5068, 0.4982, 0.5006, 0.4935, 0.4936, 0.4995, 0.5022, 0.4995,
         0.4905, 0.5009, 0.4826, 0.4961, 0.5281, 0.5068, 0.5129, 0.5122, 0.4959,
  

In [40]:
import os, json
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer

def read_json_examples(input_file):
    """Flatten a SQuAD-style JSON file into five parallel lists.

    The file must hold a top-level ``"data"`` list whose entries contain
    ``"paragraphs"``, each with a ``"context"`` string and a ``"qas"`` list.
    One row is produced per question; only the first answer of each question
    is used.

    Args:
        input_file: Path of the UTF-8 encoded JSON file.

    Returns:
        A tuple ``(contexts, questions, question_ids, answer_texts,
        start_positions)`` of equally long lists.
    """
    with open(input_file, "r", encoding='utf-8') as reader:
        dataset = json.load(reader)["data"]

    # One tuple per question, in file order.
    rows = [
        (
            paragraph["context"],
            qa["question"],
            qa["id"],
            qa["answers"][0]["text"],
            qa["answers"][0]["answer_start"],
        )
        for entry in dataset
        for paragraph in entry["paragraphs"]
        for qa in paragraph["qas"]
    ]

    # Split the rows into their five columns.
    contexts, questions, question_ids, answer_texts, start_positions = (
        [row[column] for row in rows] for column in range(5)
    )
    return contexts, questions, question_ids, answer_texts, start_positions



class QADataset(Dataset):
    """Question/context pairs with a per-token answer mask over the context.

    Each item tokenizes the question and the context separately (both padded
    or truncated to ``max_length`` by the tokenizer call) and builds a 0/1
    mask that marks the context token positions covered by the answer.

    Args:
        questions: Question strings.
        contexts: Context strings, parallel to ``questions``.
        answers: Answer strings, parallel to ``questions``.
        answer_starts: Character offset of each answer inside its context.
        tokenizer: Callable tokenizer that also offers ``tokenize(text)``.
            The span alignment assumes a BERT-style layout in which the
            encoded context starts with exactly one special token and ends
            with one.
        question_ids: Identifiers, parallel to ``questions``.
        max_length: Fixed sequence length used for both encodings.
    """

    def __init__(self, questions, contexts, answers, answer_starts, tokenizer, question_ids, max_length=512):
        self.questions = questions
        self.contexts = contexts
        self.answers = answers
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.answer_starts = answer_starts
        self.question_ids = question_ids

    def __len__(self):
        return len(self.questions)

    def _encode(self, text):
        """Tokenize ``text`` to fixed-length tensors and drop the batch dimension."""
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,   # Add [CLS] and [SEP] tokens
            max_length=self.max_length,
            padding='max_length',      # Pad to max_length
            truncation=True,           # Truncate if too long
            return_tensors='pt'        # Return PyTorch tensors
        )
        # squeeze(0) removes only the batch axis, even when max_length is 1.
        return {k: v.squeeze(0) for k, v in encoding.items()}

    def _answer_token_range(self, context, answer, start_position, seq_len):
        """Return the half-open ``[first, stop)`` range of encoded context
        positions covered by the answer (empty when it cannot be located)."""
        if start_position < 0 or start_position > len(context):
            return 0, 0

        # Count the tokens strictly before the answer and up to its last character.
        end_position = start_position + len(answer)
        tokens_before = len(self.tokenizer.tokenize(context[:start_position]))
        tokens_through = len(self.tokenizer.tokenize(context[:end_position]))

        # Shift by one for the leading special token. The final slot always holds
        # the closing special token or padding, so the span never reaches it.
        stop = min(tokens_through + 1, seq_len - 1)
        first = min(tokens_before + 1, stop)
        return first, max(stop, first)

    def __getitem__(self, idx):
        question = self.questions[idx]
        context = self.contexts[idx]
        answer = self.answers[idx]
        question_id = self.question_ids[idx]

        q_encode = self._encode(question)
        c_encode = self._encode(context)

        # The mask labels context tokens, so it is shaped after the context encoding.
        answer_span = torch.zeros_like(c_encode["input_ids"])
        first, stop = self._answer_token_range(
            context, answer, self.answer_starts[idx], answer_span.shape[0]
        )
        answer_span[first:stop] = 1

        return {
            "question_encoding": q_encode,
            "context_encoding": c_encode,
            "answer_span": answer_span,
            "question_ids": question_id
        }


# Data folder, batch size and pretrained model used for the smoke test below.
data_folder = "../data/rrc/laptop"
mini_batch = 4
model_name = "bert-base-uncased"

tokenizer = BertTokenizer.from_pretrained(model_name)

# Read the training examples and wrap them in a dataset / dataloader.
train_path = os.path.join(data_folder, "train.json")
contexts, questions, question_ids, answer_texts, start_positions = read_json_examples(train_path)
train_dataset = QADataset(
    questions, contexts, answer_texts, start_positions, tokenizer, question_ids
)
train_dataloader = DataLoader(train_dataset, batch_size=mini_batch)

# Smoke test: push only the first mini-batch through a freshly built model.
for batch in train_dataloader:
    question_batch = batch["question_encoding"]
    print(question_batch)
    print(batch["answer_span"])
    scores = DualEncoder("bert-base-uncased", 768)(question_batch, batch["context_encoding"])
    print(scores)
    break

{'input_ids': tensor([[ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2129, 2003,  ...,    0,    0,    0],
        [ 101, 2079, 2017,  ...,    0,    0,    0],
        [ 101, 2515, 1996,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
tensor([[0.4964, 0.4879, 0.4915,  ..., 0.4938, 0.4939, 0.4963],
        [0.4971, 0.4884, 0.4900,  ..., 0.4927, 0.4939, 0.4976],
        [0.4963, 0.4952, 0.5104,  ..., 0.4911, 0.4964, 0.4837],
        [0.4966, 0.4960, 0.5094,  ..., 0.4904, 0.4963, 0.4844]],
       grad_fn=<SigmoidBackward0>)


In [42]:
import torch
# Sanity check: the upper bound of torch.arange is exclusive, so (3, 4) yields only [3].
print(torch.arange(start=3, end=4))

tensor([3])
