In [11]:
import json
import nltk
import numpy as np
import re
from nltk.tokenize import sent_tokenize
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# Download the NLTK packages this script relies on ('punkt' and 'stopwords').
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
from nltk.corpus import stopwords

# English stop words, held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

def load_jsonl(file_path):
    """Load a JSON Lines file and return its records as a list.

    Args:
        file_path: Path to a UTF-8 encoded file holding one JSON value per line.

    Returns:
        A list with the decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Blank / whitespace-only lines (e.g. a stray empty line at the end of
        # the file) carry no record; skip them instead of letting json.loads
        # fail on them.
        return [json.loads(line) for line in file if line.strip()]

def preprocess_documents(documents):
    """Build a lookup table from document id to document text.

    Each record must provide the keys 'document_id' and 'document_text'.
    When an id occurs more than once, the text of the last record wins.
    """
    return {
        record['document_id']: record['document_text']
        for record in documents
    }

def preprocess_questions(questions):
    """Index question records by their question text.

    Each record must provide 'question' and 'answer'; 'document_id' is
    optional and defaults to an empty list. The result maps every question
    text to a dict with the keys 'answer' and 'document_id'.
    """
    return {
        record['question']: {
            'answer': record['answer'],
            'document_id': record.get('document_id', []),
        }
        for record in questions
    }

def clean_text(text):
    """Normalize raw text before tokenization.

    Removes apostrophes, replaces every single-line ``<...>`` tag with a
    space, collapses each run of whitespace into one space and lowercases
    the result.
    """
    cleaned = text.replace("'", "")
    # Tags are replaced first so the gaps they leave behind are collapsed by
    # the whitespace pass that follows.
    for pattern in (r'<.*?>', r'\s+'):
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.lower()

# Load the raw document and question records from their JSONL files.
documents = load_jsonl('./data/documents1800.jsonl')
questions = load_jsonl('./data/train1800.jsonl')

# Index the records: doc_dict maps document_id -> document_text, and
# question_dict maps question text -> {'answer', 'document_id'}.
doc_dict = preprocess_documents(documents)
question_dict = preprocess_questions(questions)

### Step 2: tokenize and remove stop words
def segment_text(text):
    """Split text into sentences, tokenize them and drop stop words.

    Returns one flat list of tokens for the whole text. Only purely
    alphabetic tokens are checked against the stop-word set; every other
    token is kept.

    NOTE: a later definition with the same name in this file replaces this
    one, so this version is not the one that ends up being called.
    """
    tokens = []
    for sentence in sent_tokenize(clean_text(text)):
        tokens += [
            token
            for token in nltk.word_tokenize(sentence)
            if not (token.lower() in stop_words and token.isalpha())
        ]
    return tokens

def segment_text(text):
    """Tokenize the cleaned text into words and drop English stop words.

    This definition overrides the earlier ``segment_text`` in this file.
    """
    kept = []
    for token in nltk.word_tokenize(clean_text(text)):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return kept

# Build the training corpus: every document text followed by every question
# text, each one tokenized into a list of words.
all_texts = [*doc_dict.values(), *question_dict.keys()]
segmented_texts = list(map(segment_text, all_texts))

# Report the mean and the median token count per text.
for _statistic in (np.mean, np.median):
    print(_statistic([len(text) for text in segmented_texts]))



[nltk_data] Downloading package punkt to
[nltk_data]     g:\Anaconda\Miniconda\envs\MyNLP\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     g:\Anaconda\Miniconda\envs\MyNLP\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


4251.187902187902
2544.0


In [13]:
# '''
# Train the Word2Vec model on the tokenized documents and questions.
# Alternative configuration kept for reference: vector_size=100, window=50.
word2vec_model = Word2Vec(
    sentences=segmented_texts,
    vector_size=800,
    window=200,
    min_count=10,
    workers=32,
)

# Vector representation of a piece of text
def get_text_vector(text, model):
    """Return the mean embedding of the in-vocabulary tokens of ``text``.

    The text is tokenized with ``segment_text``; tokens missing from
    ``model.wv`` are ignored. When no token is in the vocabulary, a zero
    vector of length ``model.vector_size`` is returned instead.
    """
    keyed_vectors = model.wv
    embeddings = [
        keyed_vectors[token]
        for token in segment_text(text)
        if token in keyed_vectors
    ]
    if not embeddings:
        return np.zeros(model.vector_size)
    return np.mean(embeddings, axis=0)

# Embed every document, keyed by its document id.
doc_vectors = {
    key: get_text_vector(body, word2vec_model)
    for key, body in doc_dict.items()
}

# Embed every question and keep its answer and reference document id with it.
question_vectors = {
    question: {
        'vector': get_text_vector(question, word2vec_model),
        'answer': info['answer'],
        'document_id': info['document_id'],
    }
    for question, info in question_dict.items()
}

# Example: rank all documents for each question by cosine similarity and
# report how often the reference document is among the five best matches.
tot_num = 0
acc_num = 0

# Loop-invariant data, built once instead of once per question. The ids are
# taken from doc_vectors itself so that row i of the matrix always belongs
# to doc_ids[i].
doc_ids = list(doc_vectors.keys())
doc_vectors_array = np.array(list(doc_vectors.values()))

for question_text, question_data in question_vectors.items():
    # cosine_similarity expects 2-D inputs, so make the query a 1 x dim row.
    question_vector = question_data['vector'].reshape(1, -1)

    # Similarity of this question to every document (shape: 1 x n_docs).
    similarities = cosine_similarity(question_vector, doc_vectors_array)

    # argsort is ascending: take the last five entries and reverse them to
    # get the best match first.
    top_5_indices = similarities.argsort()[0][-5:][::-1]

    # Translate matrix row positions back into document ids.
    top_5_doc_ids = [doc_ids[i] for i in top_5_indices]

    # Report the result for this question.
    print(f"Question: {question_text}")
    print(f"Answer: {question_data['answer']}")
    print(f"Reference Document ID: {question_data['document_id']}")
    print("Top 5 Similar Documents:")
    print(top_5_doc_ids)

    tot_num += 1
    if question_data['document_id'] in top_5_doc_ids:
        acc_num += 1

print("--------------------------------------------------")
# Guard against an empty question set, which would otherwise divide by zero.
print("Accuracy:", acc_num / tot_num if tot_num else 0.0)
# '''

Question: when do sorry to bother you come out
Answer: July 6, 2018
Reference Document ID: 732
Top 5 Similar Documents:
[13, 913, 1534, 279, 1423]
Question: when was when you say nothing at all written
Answer: 1988
Reference Document ID: 1034
Top 5 Similar Documents:
[999, 1281, 1568, 1034, 504]
Question: which olsen twin was in full house more
Answer: Mary-Kate
Reference Document ID: 1066
Top 5 Similar Documents:
[639, 1119, 1683, 1686, 229]
Question: who is the kid who played the banjo in deliverance
Answer: Billy Redden
Reference Document ID: 502
Top 5 Similar Documents:
[502, 1722, 189, 158, 1306]
Question: how many justices currently serve on the us supreme court
Answer: nine
Reference Document ID: 1680
Top 5 Similar Documents:
[1286, 25, 662, 558, 563]
Question: how many super bowl wins do giants have
Answer: four
Reference Document ID: 1296
Top 5 Similar Documents:
[1498, 345, 601, 1543, 1170]
Question: when was the last time england made it to a world cup semi final
Answer: 199