In [2]:
from datasets import load_dataset
import pandas as pd
from ir_eval.metrics import recall, precision, hole, ndcg
from sentence_transformers import SentenceTransformer, util, CrossEncoder
from ir_eval.utils_prompt import load_prompt_text, eval_prompt, preprocess_prompt
import collections
import os
import json
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load trec covid dataset
https://paperswithcode.com/dataset/trec-covid

In [3]:
corpus = load_dataset("BeIR/trec-covid", 'corpus')['corpus']
queries = load_dataset("BeIR/trec-covid", 'queries')['queries']
qrels = load_dataset("BeIR/trec-covid-qrels")


In [4]:
queries

Dataset({
    features: ['_id', 'title', 'text'],
    num_rows: 50
})

In [5]:
def combine_text(example):
    example['full_text'] = '[Title] ' + example['title'] + ' [TEXT] ' + example['text']
    return example
corpus = corpus.map(combine_text)

In [6]:
qid_2_query = dict(zip(queries['_id'], queries['text']))
docid_2_title = dict(zip(corpus['_id'], corpus['title']))
docid_2_text = dict(zip(corpus['_id'], corpus['text']))
docid_2_combined_text = dict(zip(corpus['_id'], [title + " " + text for title, text in zip(corpus['title'], corpus['text'])]))

## Milvus BM25

In [23]:
# https://zilliz.com/blog/getting-started-with-a-milvus-connection
from milvus import default_server
default_server.start()

FileNotFoundError: [Errno 2] No such file or directory: '/home/yangyutu/miniconda3/envs/huggingface_lastest/lib/python3.11/site-packages/milvus/data/glog.conf'

In [18]:
from pymilvus import MilvusClient
client = MilvusClient("milvus_demo.db")

In [22]:
import milvus
milvus.__version__

'2.3.5'

In [None]:
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    Collection,
)

# # Connect to Milvus server
# connections.connect(
#    host='127.0.0.1',
#    port=default_server.listen_port)

In [12]:
from pymilvus.model.sparse.bm25.tokenizers import build_default_analyzer
from pymilvus.model.sparse import BM25EmbeddingFunction

In [None]:
# analyzer  removes common stopwords in a particular language, tokenizes each remaining word, and then gathers the statistics of each token’s relevancy.
analyzer = build_default_analyzer(language="en")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/yangyutu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Create corpus based on product title
corpus = list(docid_2_combined_text.values())

# Use the analyzer to instantiate the BM25EmbeddingFunction
bm25_ef = BM25EmbeddingFunction(analyzer)
# Fit the model on the corpus to get the statistics of the corpus
bm25_ef.fit(corpus)

In [15]:
corpus_bm25_embeddings = bm25_ef.encode_documents(corpus)

In [24]:
client = MilvusClient(uri="http://localhost:19530")

In [29]:

# Define collection name
collection_name = "trec_covid_bm25"

# Drop collection if it exists
if utility.has_collection(collection_name):
    utility.drop_collection(collection_name)

# Define fields for the collection

fields = [
    FieldSchema(name="pk", dtype=DataType.VARCHAR,
                is_primary=True, auto_id=True, max_length=100),
    FieldSchema(name="full_text", dtype=DataType.VARCHAR, max_length=2048),
    FieldSchema(name="full_text_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
]

# Create collection schema
schema = CollectionSchema(fields=fields, description="TREC-COVID BM25 collection")


# Create collection
#collection = client.create_collection(collection_name=collection_name, schema=schema)

collection = Collection(name=collection_name, schema=schema)

sparse_index = {"index_type": "SPARSE_INVERTED_INDEX", "metric_type": "IP"}
collection.create_index(field_name="full_text_vector", index_params=sparse_index)


# Insert data
entities = [
    corpus,  # text field
    corpus_bm25_embeddings,  # embedding field
]

collection.insert(entities)
collection.flush()






2025-02-22 22:24:39,723 [ERROR][handler]: RPC error: [create_index], <MilvusException: (code=1100, message=create index on 104 field is not supported: invalid parameter[expected=supported field][actual=create index on 104 field])>, <Time:{'RPC start': '2025-02-22 22:24:39.719441', 'RPC error': '2025-02-22 22:24:39.722983'}> (decorators.py:140)


MilvusException: <MilvusException: (code=1100, message=create index on 104 field is not supported: invalid parameter[expected=supported field][actual=create index on 104 field])>

In [None]:

# Load collection into memory
collection.load()

In [None]:
# Example search
search_param = {
    "metric_type": "L2",
    "params": {"nprobe": 10}
}

# Example query using the first query from the dataset
results = collection.search(
    data=[queries[0]['text']],  # Using first query as example
    anns_field="text",
    param=search_param,
    limit=10
)

# Print results
for hits in results:
    for hit in hits:
        print(f"ID: {hit.id}, Distance: {hit.distance}")

In [None]:
import json

def corpus_to_jsonl(corpus, output_file):
    """
    Converts a corpus (list of dictionaries) to a JSONL file suitable for Pyserini indexing.

    Args:
        corpus (list): A list of dictionaries, where each dictionary represents a document.
                       Each document dictionary should have '_id' and 'full_text' keys.
        output_file (str): The path to the output JSONL file.
    """
    with open(output_file, 'w') as f_out:
        for doc in corpus:
            doc_id = doc['_id']
            text = doc['full_text']
            
            # Create a dictionary with 'id' and 'contents' keys
            output_dict = {'id': doc_id, 'contents': text}
            
            # Write the dictionary to the JSONL file as a JSON string
            json.dump(output_dict, f_out)
            f_out.write('\n')

# Example usage:
output_file = './data/trec_covid_corpus.jsonl'
corpus_to_jsonl(corpus, output_file)
print(f"Corpus converted to JSONL format and saved to {output_file}")

Corpus converted to JSONL format and saved to ./data/trec_covid_corpus.jsonl


### build index

In [None]:
index_path = "./data/trec_covid_corpus_index"

In [None]:
%%bash
python -m pyserini.index.lucene --collection JsonCollection \
    --generator DefaultLuceneDocumentGenerator \
    --threads 4 \
    --input "./data/trec_covid_corpus_input" \
    --index "./data/trec_covid_corpus_index" \
    --storePositions --storeDocvectors --storeRaw


### Search

In [None]:
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher('./data/trec_covid_corpus_index')

query_text = queries[0]['text']
hits = searcher.search(query_text, k=10)

for i in range(len(hits)):
    print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')

 1 dv9m19yk 7.27880
 2 0paafp5j 6.22610
 3 96zsd27n 6.22610
 4 hmvo5b0q 6.11880
 5 1ij25a7u 5.85120
 6 5d7zien3 5.70360
 7 fqs40ivc 5.66660
 8 xqqn1t4e 5.64080
 9 dckuhrlf 5.64070
10 h4vigeuy 5.63960


### Search with RM3 query expansion

In [None]:
searcher.set_rm3()
hits = searcher.search(query_text, k=10)

for i in range(len(hits)):
    print(f'{i+1:2} {hits[i].docid:4} {hits[i].score:.5f}')

 1 dv9m19yk 1.51230
 2 0paafp5j 1.47740
 3 96zsd27n 1.47740
 4 h4vigeuy 1.40550
 5 fqs40ivc 1.35570
 6 1ij25a7u 1.29020
 7 37v59fs8 1.27120
 8 5fg87lvu 1.26260
 9 950x4b9a 1.26260
10 hmvo5b0q 1.25150


In [None]:
print(query_text)

print(searcher.is_using_rm3())

print(searcher.get_feedback_terms(query_text))

what is the origin of COVID-19
True
{'countri': 0.03179526701569557, 'covid': 0.125, 'what': 0.22162935137748718, 'origin': 0.19928449392318726, '19': 0.125, 'about': 0.03716013953089714, 'should': 0.04642285034060478, 'outbreak': 0.053088217973709106, 'global': 0.04541816934943199, 'event': 0.03194766864180565, 'research': 0.03683099523186684, 'anesthesia': 0.04642285034060478}


### Gather search results

In [None]:
hits[0]

<io.anserini.search.ScoredDoc at 0x7f1ca2f22630 jclass=io/anserini/search/ScoredDoc jself=<LocalRef obj=0x384e401a at 0x7f1cfd526b30>>

In [None]:
BM25_retrieval_results = collections.defaultdict(dict)
top_k = 500
searcher.unset_rm3()
for query in queries:
    qid = query['_id']
    query_text = query['text']
    hits = searcher.search(query_text, k=top_k)
    
    docids = [hits[i].docid for i in range(top_k)]
    scores = [hits[i].score for i in range(top_k)]
    BM25_retrieval_results[qid] = dict(zip(docids, scores))

In [None]:
BM25_retrieval_results_RM3 = collections.defaultdict(dict)
top_k = 500
searcher.set_rm3()
for query in queries:
    qid = query['_id']
    query_text = query['text']
    hits = searcher.search(query_text, k=top_k)
    
    docids = [hits[i].docid for i in range(top_k)]
    scores = [hits[i].score for i in range(top_k)]
    BM25_retrieval_results_RM3[qid] = dict(zip(docids, scores))

### EVal

In [None]:
qrels_for_eval = collections.defaultdict(dict)
for example in qrels['test']:
    qrels_for_eval[str(example['query-id'])][str(example['corpus-id'])] = example['score']

In [None]:
def eval_model(retrieval_results, qrels_for_eval):
    print(recall(qrels=qrels_for_eval, results=retrieval_results, k_values=[1, 3, 5, 10, 20, 30, 100, 500, 2000]))
    print(precision(qrels=qrels_for_eval, results=retrieval_results, k_values=[1, 3, 5, 10, 20, 30, 100, 500, 2000]))
    print(ndcg(qrels=qrels_for_eval, results=retrieval_results, k_values=[1, 3, 5, 10]))

In [None]:
eval_model(BM25_retrieval_results, qrels_for_eval)

# {'Recall@1': 0.00188, 'Recall@3': 0.00534, 'Recall@5': 0.00855, 'Recall@10': 0.01585, 'Recall@20': 0.02968, 'Recall@30': 0.043, 'Recall@100': 0.10896, 'Recall@500': 0.29329, 'Recall@2000': 0.29329}
# {'Precision@1': 0.76, 'Precision@3': 0.71333, 'Precision@5': 0.684, 'Precision@10': 0.646, 'Precision@20': 0.597, 'Precision@30': 0.576, 'Precision@100': 0.4712, 'Precision@500': 0.2766, 'Precision@2000': 0.06915}
# {'NDCG@1': 0.67, 'NDCG@3': 0.64801, 'NDCG@5': 0.63063, 'NDCG@10': 0.5995}

{'Recall@1': 0.00188, 'Recall@3': 0.00534, 'Recall@5': 0.00855, 'Recall@10': 0.01585, 'Recall@20': 0.02968, 'Recall@30': 0.043, 'Recall@100': 0.10896, 'Recall@500': 0.29329, 'Recall@2000': 0.29329}
{'Precision@1': 0.76, 'Precision@3': 0.71333, 'Precision@5': 0.684, 'Precision@10': 0.646, 'Precision@20': 0.597, 'Precision@30': 0.576, 'Precision@100': 0.4712, 'Precision@500': 0.2766, 'Precision@2000': 0.06915}
{'NDCG@1': 0.67, 'NDCG@3': 0.64801, 'NDCG@5': 0.63063, 'NDCG@10': 0.5995}


In [None]:
eval_model(BM25_retrieval_results_RM3, qrels_for_eval)

# using RM3 query expansion can improve recall, but will hurt precision and ranking
# {'Recall@1': 0.0018, 'Recall@3': 0.0053, 'Recall@5': 0.00873, 'Recall@10': 0.01646, 'Recall@20': 0.03072, 'Recall@30': 0.04298, 'Recall@100': 0.1181, 'Recall@500': 0.33463, 'Recall@2000': 0.33463}
# {'Precision@1': 0.74, 'Precision@3': 0.71333, 'Precision@5': 0.704, 'Precision@10': 0.662, 'Precision@20': 0.62, 'Precision@30': 0.59, 'Precision@100': 0.5, 'Precision@500': 0.30824, 'Precision@2000': 0.07706}
# {'NDCG@1': 0.64, 'NDCG@3': 0.62561, 'NDCG@5': 0.61974, 'NDCG@10': 0.59406}

{'Recall@1': 0.0018, 'Recall@3': 0.0053, 'Recall@5': 0.00873, 'Recall@10': 0.01646, 'Recall@20': 0.03072, 'Recall@30': 0.04298, 'Recall@100': 0.1181, 'Recall@500': 0.33463, 'Recall@2000': 0.33463}
{'Precision@1': 0.74, 'Precision@3': 0.71333, 'Precision@5': 0.704, 'Precision@10': 0.662, 'Precision@20': 0.62, 'Precision@30': 0.59, 'Precision@100': 0.5, 'Precision@500': 0.30824, 'Precision@2000': 0.07706}
{'NDCG@1': 0.64, 'NDCG@3': 0.62561, 'NDCG@5': 0.61974, 'NDCG@10': 0.59406}


### Inspect BM25 index
https://github.com/castorini/pyserini/blob/master/docs/usage-indexreader.md#how-do-i-iterate-over-index-terms-and-access-term-statistics

In [None]:
from pyserini.index.lucene import LuceneIndexReader


In [None]:
# Initialize from an index path:
index_reader = LuceneIndexReader(index_path)

In [None]:
# get access to basic stats
index_reader.stats()

{'total_terms': 21165485,
 'documents': 171332,
 'non_empty_documents': 171332,
 'unique_terms': 202648}

In [None]:
import itertools
for term in itertools.islice(index_reader.terms(), 100):
    print(f'{term.term} (df={term.df}, cf={term.cf})')

0 (df=4571, cf=8586)
0,00 (df=1, cf=1)
0,000 (df=1, cf=1)
0,0001 (df=6, cf=7)
0,0002 (df=1, cf=1)
0,0004 (df=2, cf=2)
0,0005 (df=1, cf=1)
0,0006 (df=1, cf=1)
0,0007 (df=1, cf=1)
0,001 (df=16, cf=28)
0,0010 (df=1, cf=1)
0,0013 (df=1, cf=1)
0,002 (df=5, cf=5)
0,003 (df=2, cf=2)
0,0033 (df=1, cf=1)
0,0037 (df=1, cf=1)
0,004 (df=1, cf=1)
0,0041 (df=1, cf=1)
0,0043 (df=2, cf=2)
0,005 (df=6, cf=6)
0,006 (df=3, cf=4)
0,007 (df=2, cf=2)
0,009 (df=1, cf=1)
0,01 (df=9, cf=9)
0,013 (df=1, cf=1)
0,014 (df=1, cf=1)
0,015 (df=2, cf=2)
0,018 (df=2, cf=2)
0,02 (df=3, cf=5)
0,0202 (df=1, cf=1)
0,024 (df=1, cf=1)
0,026 (df=1, cf=1)
0,027 (df=1, cf=1)
0,029 (df=1, cf=1)
0,03 (df=5, cf=5)
0,031 (df=1, cf=1)
0,033 (df=1, cf=1)
0,036 (df=2, cf=2)
0,038 (df=1, cf=1)
0,04 (df=2, cf=5)
0,043 (df=1, cf=1)
0,044 (df=1, cf=1)
0,045 (df=2, cf=2)
0,048 (df=2, cf=2)
0,049 (df=1, cf=2)
0,05 (df=17, cf=22)
0,052 (df=1, cf=1)
0,054 (df=1, cf=1)
0,060 (df=1, cf=1)
0,0651 (df=1, cf=1)
0,069 (df=1, cf=1)
0,07 (df=1, cf=1)

In [None]:
term = 'cities'

# Look up its document frequency (df) and collection frequency (cf).
# Note, we use the unanalyzed form:
df, cf = index_reader.get_term_counts(term)
print(f'term "{term}": df={df}, cf={cf}')

term "cities": df=3277, cf=5556


In [None]:
# Here's how to fetch and traverse postings:
# Fetch and traverse postings for an unanalyzed term:
postings_list = index_reader.get_postings_list(term)
print(len(postings_list))
for posting in postings_list:
    # position is where is the token at the document
    print(f'docid={posting.docid}, tf={posting.tf}, pos={posting.positions}')


3277
docid=68, tf=2, pos=[220, 236]
docid=140, tf=1, pos=[151]
docid=189, tf=1, pos=[81]
docid=349, tf=1, pos=[219]
docid=364, tf=1, pos=[39]
docid=394, tf=1, pos=[181]
docid=438, tf=2, pos=[171, 185]
docid=496, tf=2, pos=[81, 87]
docid=497, tf=8, pos=[60, 62, 79, 88, 90, 115, 163, 200]
docid=587, tf=1, pos=[68]
docid=655, tf=1, pos=[260]
docid=691, tf=1, pos=[95]
docid=743, tf=3, pos=[17, 63, 137]
docid=806, tf=2, pos=[13, 89]
docid=936, tf=2, pos=[41, 74]
docid=976, tf=1, pos=[70]
docid=1005, tf=1, pos=[118]
docid=1030, tf=3, pos=[19, 31, 72]
docid=1091, tf=1, pos=[48]
docid=1105, tf=1, pos=[175]
docid=1114, tf=2, pos=[95, 106]
docid=1355, tf=1, pos=[133]
docid=1368, tf=1, pos=[54]
docid=1425, tf=1, pos=[91]
docid=1435, tf=2, pos=[93, 281]
docid=1620, tf=5, pos=[11, 83, 139, 181, 227]
docid=1623, tf=1, pos=[17]
docid=1698, tf=1, pos=[128]
docid=1736, tf=6, pos=[12, 90, 127, 133, 167, 219]
docid=1792, tf=2, pos=[150, 163]
docid=1817, tf=1, pos=[67]
docid=1893, tf=1, pos=[74]
docid=197

In [None]:
# get the sparse doc vector (word, term_freq)
# note that stemming heavily modify the morphology of the word
doc_vector = index_reader.get_document_vector('dv9m19yk')
print(doc_vector)

{'viru': 2, 'been': 1, 'covid': 2, 'virtual': 1, 'fact': 1, 'zoonot': 1, 'origin': 5, 'about': 1, 'cov': 1, 'type': 1, 'dozen': 1, 'everi': 1, 'theori': 1, 'caus': 1, 'hit': 1, 'determin': 1, 'known': 1, 'far': 1, 'have': 1, 'text': 1, 'titl': 1, 'event': 1, 'so': 1, 'all': 1, 'which': 1, 'new': 1, 'like': 1, 'sar': 1, 'occur': 1, 'pandem': 2, 'attribut': 1, '19': 2, 'diseas': 1, 'differ': 1, 'emerg': 1, 'most': 1, '2': 1, 'globe': 1, 'what': 2, 'ha': 2, 'investig': 1, 'except': 1, 'time': 1, 'first': 1}


In [None]:
print(docid_2_combined_text['dv9m19yk'])

[What is the origin of SARS-CoV-2?] Every time a pandemic occurs, dozens of theories emerge to attribute the origin of the event to different facts. The COVID-19 pandemic that has hit virtually all the globe has been no exception. What is known so far about the origin of the virus that causes COVID 19? The first investigations on the origin of this disease have determined that it is a new type of virus, the origin of which is most likely zoonotic.


In [None]:
# Note that the keys of get_document_vector() are already analyzed, we set analyzer to be None.
bm25_score = index_reader.compute_bm25_term_weight('dv9m19yk', 'viru', analyzer=None)
print(bm25_score)

1.191981315612793


### Inspect tokenization process
https://github.com/castorini/pyserini/blob/master/docs/usage-analyzer.md

In [None]:
from pyserini.analysis import Analyzer, get_lucene_analyzer

# Default analyzer for English uses the Porter stemmer:
# stop words will be removed
analyzer = Analyzer(get_lucene_analyzer())
tokens = analyzer.analyze('City buses are running on time.')
print(tokens)

['citi', 'buse', 'run', 'time']


## Enrich document via generated queries

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from huggingface_hub import notebook_login

# Optional: Log in to Hugging Face (if required)
#notebook_login()  #Uncomment if you are in a notebook and need to log in

model_name = 'csdc-atl/doc2query'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

text = "2014年12月9日，于洋转会至中超联赛球队广州富力。2015赛季初，于洋并没有出场机会。韩国中后卫张贤秀受伤后，主教练选择用金洋洋代替。足协杯4比0战胜贵州人和的比赛中，金洋洋打入两球。赛后，中国足协认定金洋洋在庆祝进球时使用侮辱性手势，将他禁赛四场。之后对阵山东鲁能的联赛，于洋迎来出场机会，首次代表广州富力出战正式比赛。从此开始，于洋得到了较为充足的出场时间。2015赛季于洋中超联赛出场17次、亚冠联赛1次，这18次出场中有17次为首发。2016赛季，于洋成为广州富力三后卫体系的主力，还曾担任队长。这个赛季，他在中超联赛出场25次、足协杯出场5次，联赛的25次出场中含22次首发。效力广州富力期间，他于2015年重返中国国家足球队。\n2016年12月30日，广州富力宣布于洋离队，加盟北京国安。有媒体透露，转会费在5000万至6000万元人民币之间。回归北京国安之后，于洋成为中后卫位置上的主力。2018年3月31日北京国安与北京人和的北京德比上，于洋第100次代表北京国安出场。他在比赛中打入一球，助球队4比0获胜。"


def create_queries(text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    with torch.no_grad():
        # Here we use top_k / top_k random sampling. It generates more diverse queries, but of lower quality
        sampling_outputs = model.generate(
            input_ids=input_ids,
            max_length=64,
            do_sample=True,
            top_p=0.95,
            top_k=10, 
            num_return_sequences=10
            )
    return [tokenizer.decode(sampling_output, skip_special_tokens=True) for sampling_output in sampling_outputs]


sampling_outputs = create_queries(text)
print("Paragraph:")
print(text)

print("\nSampling Outputs:")
for query in sampling_outputs:
    print(query)
# 1: 于洋在2016年重返中国国家足球队是在哪个球队效力?
# 2: 于洋在2018年3月31日的北京德比上打入了几个球?
# 3: 于洋在哪些比赛中有出场机会?
# 4: 于洋在哪个比赛中打入了两球?
# 5: 于洋在2015赛季中超联赛中出场次数和亚冠联赛中的首发次数分别是多少?
# 6: 于洋在哪个比赛中打入了两球,帮助球队赢了这场比赛?
# 7: 于洋在2018年3月31日北京国安与北京人和的北京德比上打进了几个进球?
# 8: 于洋在2015赛季中超联赛和亚冠联赛中出场次数分别是多少?
# 9: 于洋在广州富力期间曾担任什么职位?
# 10: 于洋在哪些比赛中有出场机会?


Paragraph:
2014年12月9日，于洋转会至中超联赛球队广州富力。2015赛季初，于洋并没有出场机会。韩国中后卫张贤秀受伤后，主教练选择用金洋洋代替。足协杯4比0战胜贵州人和的比赛中，金洋洋打入两球。赛后，中国足协认定金洋洋在庆祝进球时使用侮辱性手势，将他禁赛四场。之后对阵山东鲁能的联赛，于洋迎来出场机会，首次代表广州富力出战正式比赛。从此开始，于洋得到了较为充足的出场时间。2015赛季于洋中超联赛出场17次、亚冠联赛1次，这18次出场中有17次为首发。2016赛季，于洋成为广州富力三后卫体系的主力，还曾担任队长。这个赛季，他在中超联赛出场25次、足协杯出场5次，联赛的25次出场中含22次首发。效力广州富力期间，他于2015年重返中国国家足球队。
2016年12月30日，广州富力宣布于洋离队，加盟北京国安。有媒体透露，转会费在5000万至6000万元人民币之间。回归北京国安之后，于洋成为中后卫位置上的主力。2018年3月31日北京国安与北京人和的北京德比上，于洋第100次代表北京国安出场。他在比赛中打入一球，助球队4比0获胜。

Sampling Outputs:
于洋在哪场比赛中打入了一球,帮助北京国安取得了胜利?
于洋为什么在足协杯4比0战胜贵州人和的比赛中被禁赛四场?
于洋在哪些比赛中打入了进球?
于洋在加盟北京国安之前效力于哪支球队?
于洋在2016年的转会费用是多少?
于洋在2017年3月31日北京国安的比赛中打进了几个球?
于洋在哪些联赛中出场过?
于洋在哪些比赛中打入了进球?
于洋在2015年重返中国国家足球队时获得了多少的出场时间?
于洋在2016赛季中担任过哪些职务?


In [None]:
from tqdm import tqdm
for i in tqdm(range(len(corpus))):
    text = corpus[i]['full_text']
    generated_queries = create_queries(text)
    concat_query_text = " ".join (generated_queries)
    corpus[i]['queries'] = concat_query_text

  0%|          | 15/171332 [00:41<133:09:31,  2.80s/it]


KeyboardInterrupt: 