In [6]:
import hashlib
import json
import os  # Make sure to import os
import re

import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# English stop-word lookup; segment_text drops every token found in this set
stop_words = {*stopwords.words('english')}
# Suffix spliced into the data file names of the main program (empty string = unsuffixed files)
DataNum = ''
# NOTE(review): not referenced anywhere in the visible part of this file
CHUNK_SIZE = 1000

def load_jsonl(file_path):
    """Load a JSONL file and return its records as a list.

    Every non-blank line is parsed as one JSON value. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list with one parsed value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # line.strip() is falsy for empty / whitespace-only lines
        return [json.loads(line) for line in file if line.strip()]

def clean_text(text):
    """Return text with HTML-like tags blanked, whitespace collapsed, lowercased.

    Each ``<...>`` span on a single line is replaced by a space, every run of
    whitespace is then squeezed into one space, and the result is lowercased.
    Leading/trailing spaces produced by the substitutions are kept.
    """
    # Inner call blanks the tags; outer call merges the resulting whitespace runs
    return re.sub(r'\s+', ' ', re.sub(r'<.*?>', ' ', text)).lower()

def segment_text(text):
    """Split text into tokens with ``word_tokenize`` and drop stop words.

    A token is discarded when it is a member of the module-level
    ``stop_words`` set; the membership test uses the token exactly as
    produced by the tokenizer. The surviving tokens keep their order.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return kept_tokens

def preprocess_documents(documents):
    """Map each document's ``document_id`` to its ``document_text``.

    Iterates ``documents`` under a tqdm progress bar. When the same
    ``document_id`` occurs more than once, the last occurrence wins.
    """
    progress = tqdm(documents, desc="Processing documents")
    return {
        record['document_id']: record['document_text']
        for record in progress
    }

def preprocess_questions(questions, question_tfidf_matrix):
    """Index questions by their text, attaching answer, reference IDs and vector.

    Args:
        questions: Iterable of dicts with a ``'question'`` and an
            ``'answer'`` key and an optional ``'document_id'`` key.
        question_tfidf_matrix: Object indexable by position; row ``i`` is
            stored as the vector of the ``i``-th question (kept as-is, no
            densification).

    Returns:
        A dict mapping question text to a dict with the keys ``'answer'``,
        ``'document_id'`` (``[]`` when the question has none) and
        ``'vector'``. Because the key is the question text, a repeated
        question overwrites the earlier entry.

    Raises:
        KeyError: If a question lacks the ``'question'`` or ``'answer'`` key.
    """
    # enumerate() has no length, so tqdm cannot draw a bar / ETA unless the
    # total is passed explicitly; fall back to None for unsized iterables.
    total = len(questions) if hasattr(questions, '__len__') else None

    question_dict = {}
    for idx, question in tqdm(enumerate(questions), total=total, desc="Processing questions"):
        question_text = question['question']
        answer_text = question['answer']
        reference_doc_ids = question.get('document_id', [])

        # Store the sparse row directly without converting to dense
        question_dict[question_text] = {
            'answer': answer_text,
            'document_id': reference_doc_ids,  # Reference document IDs
            'vector': question_tfidf_matrix[idx],  # Row for this question
        }
    return question_dict

def compute_tfidf(text, vectorizer):
    """Fit ``vectorizer`` on ``text`` and return the matrix with its feature names.

    Returns a 2-tuple: the result of ``vectorizer.fit_transform(text)``
    followed by the result of ``vectorizer.get_feature_names_out()``.
    """
    fitted_matrix = vectorizer.fit_transform(text)
    feature_names = vectorizer.get_feature_names_out()
    return fitted_matrix, feature_names

def Candidate_Calculation(question_vector, doc_vectors, top_k=5):
    """Return the row indices of the documents most similar to a question.

    Args:
        question_vector: Vector of a single question; it is reshaped to one
            row before the comparison.
        doc_vectors: Matrix with one row per document, compared against the
            question with ``cosine_similarity``.
        top_k: Maximum number of indices to return (default 5, the value
            that used to be hard-coded). Fewer are returned when there are
            fewer documents.

    Returns:
        A 1-D array of row indices into ``doc_vectors``, ordered from the
        highest to the lowest cosine similarity.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    question_vector = question_vector.reshape(1, -1)  # Ensure a single 2D row
    similarities = cosine_similarity(question_vector, doc_vectors)
    # argsort is ascending: reverse first, then truncate. Slicing from the
    # front also behaves for top_k == 0 (a "[-0:]" slice would return all).
    ranked_indices = similarities.argsort()[0][::-1]
    return ranked_indices[:top_k]

def validate_accuracy(question_dict, doc_dict, doc_vectors):
    """Measure how often a reference document is among the top candidates.

    For every question the candidate indices returned by
    ``Candidate_Calculation`` are translated to document IDs through the
    key order of ``doc_dict``; the question counts as correct when any of
    its reference document IDs is among them.

    Args:
        question_dict: Mapping of question text to a dict holding a
            ``'vector'`` (object with ``toarray()``) and a ``'document_id'``
            entry that is either one ID or a list of IDs.
        doc_dict: Mapping whose key order matches the row order of
            ``doc_vectors``.
        doc_vectors: Matrix with one row per document.

    Returns:
        The fraction of questions counted as correct, or ``0`` when
        ``question_dict`` is empty.
    """
    # Build the index -> document ID table once; rebuilding the key list for
    # every candidate of every question costs O(len(doc_dict)) per lookup.
    doc_ids = list(doc_dict)

    correct = 0
    total = len(question_dict)

    for question_data in tqdm(question_dict.values(), desc="Validating accuracy"):
        question_vector = question_data['vector']  # Sparse row
        top_5_indices = Candidate_Calculation(question_vector.toarray(), doc_vectors)
        top_5_doc_ids = [doc_ids[i] for i in top_5_indices]

        # The reference may be a single ID or a list of IDs; treat both alike
        reference_doc_ids = question_data['document_id']
        if not isinstance(reference_doc_ids, list):
            reference_doc_ids = [reference_doc_ids]
        if any(doc_id in top_5_doc_ids for doc_id in reference_doc_ids):
            correct += 1

    accuracy = correct / total if total > 0 else 0
    return accuracy

def save_sparse_vector(vector, file_path):
    """Write a sparse vector to ``file_path`` as a compact JSON object.

    The JSON object has three keys: ``'indices'`` (column indices of the
    non-zero entries), ``'values'`` (the matching values, same order and
    same length as ``'indices'``) and ``'shape'`` (the vector's shape).
    Row indices are not stored.

    Args:
        vector: SciPy sparse matrix (anything offering ``tocoo()`` and
            ``shape``); intended for a single-row matrix.
        file_path: Destination path; the file is overwritten.

    Raises:
        OSError: If the file cannot be written.
    """
    coo = vector.tocoo()
    # Take indices and values from the same masked view. Pairing nonzero()
    # with vector.data misaligns them whenever the matrix stores explicit
    # zeros, because nonzero() drops those entries while .data keeps them.
    nonzero_mask = coo.data != 0
    entry = {
        'indices': coo.col[nonzero_mask].tolist(),
        'values': coo.data[nonzero_mask].tolist(),
        'shape': [int(dim) for dim in vector.shape],
    }
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(entry, f)

# Main program: vectorize documents and questions with one shared TF-IDF
# vocabulary, persist the vectors, then report top-5 retrieval accuracy.
documents = load_jsonl(f'./data/documents{DataNum}.jsonl')
questions = load_jsonl(f'./data/train{DataNum}.jsonl')

# Preprocess documents and questions
doc_dict = preprocess_documents(documents)
vectorizer = TfidfVectorizer(tokenizer=segment_text)
# Fit on documents and questions together so both share the same feature space
all_text = list(doc_dict.values()) + [q['question'] for q in questions]
tfidf_matrix = vectorizer.fit_transform(all_text)

# Split TF-IDF matrix into documents and questions
# (documents come first in all_text, so the first num_doc rows are theirs)
num_doc = len(doc_dict)
doc_tfidf_matrix = tfidf_matrix[:num_doc]
question_tfidf_matrix = tfidf_matrix[num_doc:]

# Process questions to include TF-IDF vectors
question_dict = preprocess_questions(questions, question_tfidf_matrix)

# Create directory for vector files if it doesn't exist
vector_dir = f'./data/TF_IDF/vector{DataNum}'
os.makedirs(vector_dir, exist_ok=True)

# Save document vectors: one JSON file per vector plus a JSONL index that
# records the document and the path of its vector file
save_doc_vector_path = f'./data/TF_IDF/doc_vector{DataNum}.jsonl'
with open(save_doc_vector_path, 'w', encoding='utf-8') as doc_file:
    for i, (doc_id, doc_text) in enumerate(doc_dict.items()):
        doc_vector = doc_tfidf_matrix[i]  # Row i belongs to the i-th key of doc_dict
        vector_file_path = f'{vector_dir}/vector_{doc_id}.json'
        save_sparse_vector(doc_vector, vector_file_path)
        doc_entry = {
            'document_id': doc_id,
            'document_text': doc_text,
            'document_vector_path': vector_file_path  # Store path to vector file
        }
        doc_file.write(json.dumps(doc_entry) + '\n')

# Save question vectors the same way
save_ques_vector_path = f'./data/TF_IDF/ques_vector{DataNum}.jsonl'
with open(save_ques_vector_path, 'w', encoding='utf-8') as ques_file:
    for question_text, question_data in question_dict.items():
        # The built-in hash() of a str is salted per process, so it would give
        # different (possibly negative) file names on every run; a content
        # digest keeps the file names stable across runs.
        question_digest = hashlib.sha1(question_text.encode('utf-8')).hexdigest()
        vector_file_path = f'{vector_dir}/vector_ques{question_digest}.json'
        save_sparse_vector(question_data['vector'], vector_file_path)
        ques_entry = {
            'question_text': question_text,
            'question_answer': question_data['answer'],
            'document_id_answer': question_data['document_id'],
            'question_vector_path': vector_file_path  # Store path to vector file
        }
        ques_file.write(json.dumps(ques_entry) + '\n')

# Validate accuracy
accuracy = validate_accuracy(question_dict, doc_dict, doc_tfidf_matrix)
print("Accuracy:", accuracy)

Processing documents: 100%|██████████| 12138/12138 [00:00<00:00, 1517133.89it/s]
Processing questions: 8000it [00:00, 13985.96it/s]
Validating accuracy: 100%|██████████| 8000/8000 [22:34<00:00,  5.91it/s]

Accuracy: 0.58675



