In [8]:
import json

def load_jsonl(file_path):
    """Load a JSONL file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        list: The decoded JSON value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Skip blank / whitespace-only lines (e.g. a trailing empty line);
        # json.loads would otherwise reject them with JSONDecodeError.
        return [json.loads(line) for line in file if line.strip()]

def preprocess_documents(documents):
    """Build a lookup table from document ID to document text.

    Args:
        documents: Iterable of mappings, each providing the keys
            'document_id' and 'document_text'.

    Returns:
        dict: Maps every 'document_id' to its 'document_text'. When an ID
        occurs more than once, the text of the last occurrence is kept.

    Raises:
        KeyError: If a record lacks 'document_id' or 'document_text'.
    """
    return {entry['document_id']: entry['document_text'] for entry in documents}

def preprocess_questions(questions):
    """Index question records by their question text.

    Args:
        questions: Iterable of mappings, each providing the keys 'question'
            and 'answer', and optionally 'document_id'.

    Returns:
        dict: Maps each question text to a dict with two entries:
        'answer' (the record's answer) and 'document_id' (the record's
        reference document ID, or an empty list when the record has none).
        A repeated question text keeps the data of its last occurrence.

    Raises:
        KeyError: If a record lacks 'question' or 'answer'.
    """
    return {
        item['question']: {
            'answer': item['answer'],
            # The reference document ID is optional; fall back to [].
            'document_id': item.get('document_id', []),
        }
        for item in questions
    }

# Read the raw document and training-question records from disk.
documents, questions = (
    load_jsonl('./data/documents.jsonl'),
    load_jsonl('./data/train.jsonl'),
)

# Turn the raw records into the lookup dictionaries used by later cells.
doc_dict, question_dict = (
    preprocess_documents(documents),
    preprocess_questions(questions),
)

# Uncomment to inspect the resulting dictionaries.
# print("Document Dictionary:", doc_dict)
# print("Question Dictionary:", question_dict)

### Step 2: Tokenization and TF-IDF

In [10]:
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

def segment_text(text):
    """Split a string into tokens with NLTK.

    Args:
        text: The string to tokenize.

    Returns:
        The token sequence returned by ``nltk.word_tokenize(text)``.
    """
    tokens = nltk.word_tokenize(text)
    return tokens
def compute_tfidf(text, vectorizer):
    """Fit ``vectorizer`` on ``text`` and return the matrix with its feature names.

    Args:
        text: Collection of raw texts handed to ``vectorizer.fit_transform``.
        vectorizer: Object exposing ``fit_transform`` and
            ``get_feature_names_out`` (e.g. a ``TfidfVectorizer``). It is
            fitted by this call.

    Returns:
        tuple: ``(matrix, feature_names)`` where ``matrix`` is the result of
        ``vectorizer.fit_transform(text)`` and ``feature_names`` is the
        result of ``vectorizer.get_feature_names_out()`` after fitting.
    """
    matrix = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names_out()
    return matrix, feature_names

# Fit one TF-IDF model on the documents and the questions together, so both
# share a single vocabulary / feature space.
vectorizer = TfidfVectorizer(tokenizer=segment_text)
all_text = [*doc_dict.values(), *question_dict]
tfidf_matrix = vectorizer.fit_transform(all_text)

# Rows follow the order of all_text: documents first, then questions.
num_doc = len(doc_dict)
doc_tfidf_matrix, question_tfidf_matrix = tfidf_matrix[:num_doc], tfidf_matrix[num_doc:]

# Uncomment to dump the dense matrices.
# print("TF-IDF Matrix:\n", doc_tfidf_matrix.toarray())
# print("Problem TF-IDF Matrix:\n", question_tfidf_matrix.toarray())
# Report only the dimensions of the two matrices.
print("TF-IDF Matrix Shape:", doc_tfidf_matrix.shape)
print("Problem TF-IDF Matrix Shape:", question_tfidf_matrix.shape)

# TF-IDF row of every document, keyed by document ID.
doc_tfidf_dict = {
    doc_id: doc_tfidf_matrix[row]
    for row, doc_id in enumerate(doc_dict)
}

# TF-IDF row of every question, bundled with its answer and reference
# document ID, keyed by question text.
question_tfidf_dict = {
    question_text: {
        'tfidf': question_tfidf_matrix[row],
        'answer': info['answer'],
        'document_id': info['document_id'],
    }
    for row, (question_text, info) in enumerate(question_dict.items())
}

TF-IDF Matrix Shape: (12138, 787015)
Problem TF-IDF Matrix Shape: (8000, 787015)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity
# Evaluate retrieval: for every question, rank all documents by cosine
# similarity of their TF-IDF vectors and check whether the question's
# reference document is among the five best matches.
tot_num = 0
acc_num = 0

# Build the row-index -> document-ID table once. The original rebuilt
# list(doc_dict.keys()) for every single hit inside the loop, which made each
# lookup cost O(number of documents).
# Assumes doc_tfidf_matrix rows follow doc_dict's iteration order.
doc_ids = list(doc_dict)

for question_text, question_data in question_tfidf_dict.items():
    question_tfidf = question_data['tfidf']  # shape (1, n_features)

    # Similarity of this question against every document: shape (1, n_docs).
    similarities = cosine_similarity(question_tfidf.reshape(1, -1), doc_tfidf_matrix)

    # Indices of the five highest-scoring documents, best match first.
    top_5_indices = similarities.argsort()[0][-5:][::-1]
    # Translate matrix row indices back to document IDs.
    top_5_doc_ids = [doc_ids[i] for i in top_5_indices]

    # Report the result for this question.
    print(f"Question: {question_text}")
    print(f"Answer: {question_data['answer']}")
    print(f"Reference Document ID: {question_data['document_id']}")
    print("Top 5 Similar Documents:")
    print(top_5_doc_ids)

    tot_num += 1
    if question_data['document_id'] in top_5_doc_ids:
        acc_num += 1

print("--------------------------------------------------")
# "Accuracy" here is the fraction of questions whose reference document is in
# the top five. Guard against an empty question set, which would otherwise
# raise ZeroDivisionError.
print("Accuracy:", acc_num / tot_num if tot_num else float('nan'))
    

Question: when do sorry to bother you come out
Answer: July 6, 2018
Reference Document ID: 732
Top 5 Similar Documents:
[732, 1599, 1360, 250, 244]
Question: when was when you say nothing at all written
Answer: 1988
Reference Document ID: 1034
Top 5 Similar Documents:
[1034, 59, 1370, 1360, 224]
Question: which olsen twin was in full house more
Answer: Mary-Kate
Reference Document ID: 1066
Top 5 Similar Documents:
[1066, 1686, 1637, 639, 769]
Question: who is the kid who played the banjo in deliverance
Answer: Billy Redden
Reference Document ID: 502
Top 5 Similar Documents:
[502, 769, 417, 570, 1287]
Question: how many justices currently serve on the us supreme court
Answer: nine
Reference Document ID: 1680
Top 5 Similar Documents:
[1286, 523, 558, 278, 1464]
Question: how many super bowl wins do giants have
Answer: four
Reference Document ID: 1296
Top 5 Similar Documents:
[345, 1498, 162, 1296, 1318]
Question: when was the last time england made it to a world cup semi final
Answer: 19