In [7]:
import json
import re
import numpy as np
from tqdm import tqdm
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import torch
from sklearn.metrics.pairwise import cosine_similarity

# 加载停用词
stop_words = set(stopwords.words('english'))
Vector_size = 1400
Window_size = 1000
Min_Count = 10
DataNum = '5'
Dir = './data/W2V/'
Model_Dir = './model/W2V/'

def load_jsonl(file_path):
    """加载 JSONL 文件并返回字典列表。"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [json.loads(line) for line in file]

def clean_text(text):
    """清洗文本，去掉 HTML 标签并转换为小写。"""
    text = re.sub(r'<.*?>', ' ', text)  # 去掉所有 HTML 标签
    text = re.sub(r'\s+', ' ', text)  # 合并多个空格为一个空格
    return text.lower()  # 转换为小写

def segment_text(text):
    """对文本进行分词并去掉停用词。"""
    cleaned_text = clean_text(text)
    words = word_tokenize(cleaned_text)
    return [word for word in words if word not in stop_words]

def preprocess_documents(documents):
    """将文档信息存储到字典中。"""
    doc_dict = {}
    for doc in tqdm(documents, desc="Processing documents"):
        doc_id = doc['document_id']
        doc_text = doc['document_text']
        doc_dict[doc_id] = doc_text
    return doc_dict

def get_vector_mean(model, words):
    """获取词向量的均值，忽略不在词汇表中的词。"""
    valid_words = [word for word in words if word in model.wv]
    if not valid_words:
        return None  # 如果没有有效词，则返回 None
    return torch.mean(torch.tensor(model.wv[valid_words]), dim=0).unsqueeze(0)

def preprocess_questions(questions, model):
    """将问题、答案和参考文档ID存储到字典中，同时生成问题向量。"""
    question_dict = {}
    for question in tqdm(questions, desc="Processing questions"):
        question_text = question['question']
        answer_text = question['answer']
        reference_doc_ids = question.get('document_id', [])
        
        # 计算问题向量
        question_vector = get_vector_mean(model, segment_text(question_text))
        
        question_dict[question_text] = {
            'answer': answer_text,
            'document_id': reference_doc_ids,
            'vector': question_vector  # 存储问题向量
        }
    return question_dict

def train_word2vec(documents):
    """训练 Word2Vec 模型，并显示进度。"""
    tokenized_docs = []
    for doc in tqdm(documents.values(), desc="Tokenizing documents"):
        tokenized_docs.append(segment_text(doc))
    
    model = Word2Vec(sentences=tokenized_docs, vector_size=Vector_size, window=Window_size, min_count=Min_Count, workers=32)
    return model

def Candidate_Calculation(question_vector, doc_dict):
    # 将文档向量转换为 NumPy 数组
    doc_vectors_array = torch.stack(list(doc_vectors.values())).numpy()
    doc_vectors_array = doc_vectors_array.squeeze()  # 去掉多余的维度

    # for question_text, question_data in tqdm(question_vectors.items(), desc="Validating accuracy"):
    #     question_vector = question_data['vector'].detach().numpy()  # 获取问题向量

    # 计算相似度
    similarities = cosine_similarity(question_vector, doc_vectors_array)
    
    # 找到最匹配的五个文档
    top_5_indices = similarities.argsort()[0][-5:][::-1]
    
    # 根据相对index找到doc_id
    top_5_doc_ids = [list(doc_dict.keys())[i] for i in top_5_indices]
    
    # 输出结果
    return top_5_doc_ids

def validate_accuracy(question_dict):
    # 判断 ['document_id_top5'] 是否包含 ['document_id_answer']
    correct = 0
    for question_text, question_data in tqdm(question_dict.items(), desc="Validating accuracy"):
        if question_data['document_id_answer'] in question_data['document_id_top5']:
            correct += 1
    accuracy = correct / len(question_dict) if question_dict else 0
    return accuracy

# 主程序
documents = load_jsonl('./data/documents'+DataNum+'.jsonl')
questions = load_jsonl('./data/train'+DataNum+'.jsonl')

# 预处理文档和问题
doc_dict = preprocess_documents(documents)
model = train_word2vec(doc_dict)
question_dict = preprocess_questions(questions, model)

# 生成词语到词向量的映射
word_translation_mapping = {word: word for word in model.wv.index_to_key}

# 生成文档向量
doc_vectors = {}
for doc_id, doc_text in tqdm(doc_dict.items(), desc="Generating document vectors"):
    doc_vec_mean = get_vector_mean(model, segment_text(doc_text))
    if doc_vec_mean is not None:
        doc_vectors[doc_id] = doc_vec_mean

save_model_path = Model_Dir + 'W2V'+ str(DataNum) + '_' + str(Vector_size) + '_' + str(Window_size) + '_' + str(Min_Count)+'.model'
model.save(save_model_path)
print(f"Word2Vec model saved to {save_model_path}")
save_doc_vector_path = Dir + 'doc_vector'+ str(DataNum) + '_' + str(Vector_size) + '_' + str(Window_size) + '_' + str(Min_Count) + '.jsonl'
save_ques_vector_path = Dir + 'ques_vector'+ str(DataNum) + '_' + str(Vector_size) + '_' + str(Window_size) + '_' + str(Min_Count) + '.jsonl'

# 保存文档向量到 JSONL
with open(save_doc_vector_path, 'w', encoding='utf-8') as doc_file:
    for doc_id, doc_text in doc_dict.items():
        doc_vector = doc_vectors.get(doc_id).detach().numpy().tolist() if doc_id in doc_vectors else None
        doc_entry = {
            'document_id': doc_id,
            'document_text': doc_text,
            'document_vector': doc_vector
        }
        doc_file.write(json.dumps(doc_entry) + '\n')

# 保存问题向量到 JSONL，并将整个字典保存到变量 val_dict
# dictionary 
val_dict = {}
with open(save_ques_vector_path, 'w', encoding='utf-8') as ques_file:
    for question_text, question_data in question_dict.items():
        ques_entry = {
            'question_text': question_text,
            'question_answer': question_data['answer'],
            'document_id_answer': question_data['document_id'],
            'document_id_top5': Candidate_Calculation(question_data['vector'].detach().numpy(), doc_dict),
            'question_vector': question_data['vector'].detach().numpy().tolist() if question_data['vector'] is not None else None
        }
        val_dict[question_text] = ques_entry
        ques_file.write(json.dumps(ques_entry) + '\n')
# 保存单词翻译表到 JSONL
translation_path = Dir + 'word_translation'+ str(DataNum) + '_' + str(Vector_size) + '_' + str(Window_size) + '_' + str(Min_Count) + '.jsonl'
with open(translation_path, 'w', encoding='utf-8') as trans_file:
    for word, translation in word_translation_mapping.items():
        translation_entry = {
            'word': word,
            'translation': translation  # 将其替换为实际翻译
        }
        trans_file.write(json.dumps(translation_entry) + '\n')
# 验证准确性
print(validate_accuracy(val_dict))

Processing documents: 100%|██████████| 5/5 [00:00<?, ?it/s]
Tokenizing documents: 100%|██████████| 5/5 [00:00<00:00, 17.01it/s]
Processing questions: 100%|██████████| 5/5 [00:00<00:00, 4988.47it/s]
Generating document vectors: 100%|██████████| 5/5 [00:00<00:00, 11.36it/s]


Word2Vec model saved to ./model/W2V/W2V5_1400_1000_10.model


Validating accuracy: 100%|██████████| 5/5 [00:00<?, ?it/s]

1.0





In [4]:
def load_word_translation(file_path):
    """加载单词翻译表并返回字典。"""
    with open(file_path, 'r', encoding='utf-8') as file:
        return {json.loads(line)['word']: json.loads(line)['translation'] for line in file}
def load_model(file_path):
    """加载 Word2Vec 模型。"""
    return Word2Vec.load(file_path)
def print_vocabulary(model):
    """打印模型的词汇表。"""
    vocab = model.wv.key_to_index  # 获取词汇表
    print("模型词汇表中的单词数量:", len(vocab))
    print("词汇表中的单词:")
    for word in vocab.keys():
        print(word)
def translate_text(vectors, model, translation_mapping, top_n=5):
    """
    根据给定的词向量找到最相近的单词，并返回其翻译。
    """
    translated_words = []
    try:
        # 找到与给定向量最相近的单词
        for vector in vectors:
            # 确保向量是有效的
            if vector is not None and len(vector) == model.vector_size:
                similar_words = model.wv.similar_by_vector(vector, topn=top_n)

                # 获取翻译
                for word, _ in similar_words:
                    translation = translation_mapping.get(word, word)  # 查找翻译
                    translated_words.append(translation)
            else:
                print("无效的向量，跳过该向量。")
        
        return ' '.join(translated_words)  # 返回翻译后的文本

    except IndexError:
        print("没有找到相似的单词，返回空字符串。")
        return ""
    except Exception as e:
        print(f"发生错误: {e}")
        return ""
def calculate_accuracy(original_texts, translated_texts):
    """计算翻译的正确率，简单地比较原文与翻译的相等性。"""
    correct_count = sum(1 for orig, trans in zip(original_texts, translated_texts) if orig == trans)
    return correct_count / len(original_texts) if original_texts else 0

# 加载单词翻译表
translation_mapping = load_word_translation(translation_path)

# 翻译文档并计算正确率
original_texts = []
original_vectors = []
translated_texts = []

model = load_model(save_model_path)
with open(save_doc_vector_path, 'r', encoding='utf-8') as doc_file:
    for line in doc_file:
        doc_entry = json.loads(line)
        doc_id = doc_entry['document_id']
        doc_text = doc_entry['document_text']
        doc_vector = doc_entry['document_vector']

        # 保存原始文本
        original_texts.append(doc_text)

        # 翻译文档文本
        translated_text = translate_text(doc_vector,model, translation_mapping)
        translated_texts.append(translated_text)

        original_vectors.append(doc_vector)
        print(doc_vector)

        # 这里可以选择保存翻译后的文档
        # 例如：保存到一个文件或其他数据结构

# 计算翻译的正确率
print(original_texts)
print(translated_texts)
# print_vocabulary(model)
# print(original_vectors)
accuracy = calculate_accuracy(original_texts, translated_texts)

print(f"翻译的正确率是: {accuracy:.2f}")

发生错误: "Key '0.02195710875093937' not present in vocabulary"
[[0.02195710875093937, 0.1433168202638626, 0.09306761622428894, 0.08938513696193695, -0.01815202832221985, -0.4823739528656006, 0.15793529152870178, 0.320736825466156, 0.12258078902959824, -0.04552391171455383, -0.1496233493089676, -0.20496433973312378, -0.22340601682662964, -0.41342025995254517, -0.21848057210445404, -0.13991309702396393, -0.07562611997127533, 0.12344617396593094, 0.14609169960021973, 0.04339278116822243, 0.2358282208442688, 0.19401125609874725, 0.0685826912522316, 0.2151585966348648, -0.19797833263874054, -0.004186511971056461, -0.22602994740009308, 0.31535759568214417, 0.1701056808233261, 0.27195608615875244, -0.16137711703777313, 0.05192309990525246, -0.10296069830656052, -0.46769991517066956, -0.12570713460445404, 0.4032790958881378, 0.10703115165233612, 0.19693638384342194, 0.1606924682855606, -0.287717342376709, -0.36753925681114197, 0.4062115252017975, -0.09784172475337982, -0.03067733347415924, -0.001