In [1]:
# import os
# import json
# import pickle
import random

import spacy
# from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy
# Load spaCy's small English pipeline; it is used below for sentence
# splitting and for dependency parsing.
nlp = spacy.load('en_core_web_sm')

from utils import Data

# Project helper through which all data files are accessed. root_path="../" is
# relative to this notebook's directory (presumably the project root --
# confirm in utils.Data).
data_processor = Data(root_path="../")

# Print the keys of data_dict, i.e. the data file names known to the processor.
print("Data files: ")
for file in data_processor.data_dict.keys():
    print(file)

Data files: 
collection.sampled.tsv
train_sample_queries.tsv
train_sample_passv2_qrels.tsv
val_2021_53_queries.tsv
val_2021_passage_top100.txt
val_2021.qrels.pass.final.txt
test_2022_76_queries.tsv
test_2022_passage_top100.txt
test_2022.qrels.pass.withDupes.txt


In [2]:
# Read the data files into memory. Each log line printed below names a file and
# the column layout it was read with. Presumably this is what populates
# data_processor.dataset -- confirm in utils.Data.
data_processor.read_in_memory()

正在处理文件collection.sampled.tsv 读取文件的格式为('pid', 'passage')
正在处理文件train_sample_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件train_sample_passv2_qrels.tsv 读取文件的格式为('qid', 'mark', 'pid', 'rating')
正在处理文件val_2021_53_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件val_2021_passage_top100.txt 读取文件的格式为('qid', 'mark', 'pid', 'rank', 'score', 'sys_id')
正在处理文件val_2021.qrels.pass.final.txt 读取文件的格式为('qid', 'mark', 'pid', 'rating')
正在处理文件test_2022_76_queries.tsv 读取文件的格式为('qid', 'query')
正在处理文件test_2022_passage_top100.txt 读取文件的格式为('qid', 'mark', 'pid', 'rank', 'score', 'sys_id')
正在处理文件test_2022.qrels.pass.withDupes.txt 读取文件的格式为('qid', 'mark', 'pid', 'rating')


In [3]:
# The loaded data is accessed through data_processor.dataset. List its keys;
# per the output below they are the file names without their extension.
for f in data_processor.dataset.keys():
    print(f)

collection.sampled
train_sample_queries
train_sample_passv2_qrels
val_2021_53_queries
val_2021_passage_top100
val_2021.qrels.pass.final
test_2022_76_queries
test_2022_passage_top100
test_2022.qrels.pass.withDupes


In [4]:
# Gather the id of every passage in the sampled collection and report the count.
sampled_collection = data_processor.dataset['collection.sampled']
set_passage_id = set(sampled_collection.keys())
print("Total number of passages: ", len(set_passage_id))

Total number of passages:  126799


In [5]:
# Pick one example passage reproducibly.
# A bare random.choice(list(set_passage_id)) is not repeatable across kernel
# restarts: the RNG is unseeded, and the iteration order of a set of strings
# can change between Python processes (hash randomization). Sorting the ids
# (assumes they are mutually comparable, e.g. all strings -- confirm) and
# drawing from a dedicated seeded generator fixes both without touching the
# global `random` state.
EXAMPLE_SEED = 42  # change this to inspect a different example passage
example_passage_id = random.Random(EXAMPLE_SEED).choice(sorted(set_passage_id))
example_passage = data_processor.dataset['collection.sampled'][example_passage_id]['passage']

# Parse the passage and split it into sentences with spaCy.
doc = nlp(example_passage)
sentences = [sent.text for sent in doc.sents]
sentences

["'I should know I am addicted to lettuce.",
 'I eat it every single day.',
 'If you put three largish leaves, in your sandwhich the calorie content is small.',
 'A 1lb head of lettuce will have around 60 calories.',
 'So each leaf is around 5 calories.',
 "That is a normal sliced loaf size of bread.'"]

In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
from transformers import AutoTokenizer
# Local model directory, relative to this notebook (presumably holds the
# ColBERT checkpoint and its tokenizer files -- confirm).
model_dir = "../model/ColBERT/"
# Load the tokenizer stored in that directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Tokenize all sentences as one batch. Each row is padded or truncated to
# exactly 64 token ids, and the result is returned as PyTorch tensors.
tokenize_options = dict(
    padding='max_length',
    truncation=True,
    max_length=64,
    return_tensors='pt',
)
encoded_input = tokenizer(sentences, **tokenize_options)
encoded_input

{'input_ids': tensor([[  101,  1005,  1045,  2323,  2113,  1045,  2572, 23042,  2000,  2292,
          8525,  3401,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0],
        [  101,  1045,  4521,  2009,  2296,  2309,  2154,  1012,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,


In [8]:
# Turn the token ids of the first sentence back into text, to inspect the
# special tokens ([CLS], [SEP], [PAD]) the tokenizer added.
first_sentence_ids = encoded_input['input_ids'][0]
tokenizer.decode(first_sentence_ids)

"[CLS]'i should know i am addicted to lettuce. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]"

In [9]:
# Re-parse only the first sentence. From here on `doc` refers to this
# single-sentence parse rather than the whole passage.
doc = nlp(sentences[0])
# Draw the dependency parse inline in the notebook.
displacy.render(doc, style='dep', jupyter=True)

In [10]:
# Disabled experiment: merge the tokens doc[1:3] into a single token and re-render the parse.
# with doc.retokenize() as retokenizer:
#     retokenizer.merge(doc[1:3], attrs={"LEMMA": "quick_brown_fox"})
# displacy.render(doc, style='dep', jupyter=True)

In [11]:
# Build the dependency tree

class Node:
    """One token of a sentence, as a vertex of its dependency tree.

    Plain mutable fields, all set from the constructor arguments:
      text     -- the token's surface text
      parent   -- the Node of the token's syntactic head (may be None until linked)
      children -- set of Nodes attached below this one
      dep      -- the token's dependency label
    """

    def __init__(self, text: str, parent: "Node | None", children: "set[Node]", dep: str) -> None:
        # What the token is ...
        self.text, self.dep = text, dep
        # ... and where it sits in the tree.
        self.parent, self.children = parent, children

# Create one Node per token, keyed by the token's position in `doc`.
# Keying by token.text (as this cell used to) merges repeated words -- e.g. the
# two "I" tokens of the example sentence -- into a single node, so their
# parent/children links overwrite each other and the tree comes out wrong.
nodes_by_index = {
    token.i: Node(text=token.text, parent=None, children=set(), dep=token.dep_)
    for token in doc
}

# Link every token to its syntactic head.
# spaCy makes the root token its own head, so -- exactly as before -- the root
# ends up with root.parent is root and is a member of its own children set.
# Skip that self-link when walking the tree.
for token in doc:
    child_node = nodes_by_index[token.i]
    head_node = nodes_by_index[token.head.i]
    child_node.parent = head_node
    head_node.children.add(child_node)

# The root is the (first) token that is its own head. Comparing positions
# instead of texts avoids mistaking a word whose head merely has the same
# text for the root.
root = next(
    (nodes_by_index[token.i] for token in doc if token.head.i == token.i),
    None,
)
if root is None:
    raise ValueError("doc contains no tokens, so there is no dependency root")

# Text-keyed view kept for any code that looks nodes up by word. As before,
# when a word occurs more than once the key maps to its last occurrence.
nodes = {node.text: node for node in nodes_by_index.values()}

root.text

'know'