# In [None]:
import json
import hashlib
import sys
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def load_data(input_file):
    """Read *input_file* and return its contents, which must be a JSON array.

    A message is printed and the process exits with status 1 when the file
    does not exist, cannot be parsed as JSON, or its top-level value is not
    a list.
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
    except FileNotFoundError:
        print(f"File '{input_file}' not found.")
        sys.exit(1)
    except json.JSONDecodeError:
        print("Error in JSON format. Please check the input file format.")
        sys.exit(1)

    # Every later pipeline step iterates over entries, so only a list is accepted
    if isinstance(records, list):
        return records

    print("The JSON data is not in a list format. Please ensure the input file is a JSON array.")
    sys.exit(1)

def save_data(data, output_file):
    """Write *data* to *output_file* as UTF-8 JSON indented by four spaces.

    Non-ASCII characters are written as-is rather than escaped. If anything
    goes wrong while opening or writing the file, the error is printed and
    the process exits with status 1.
    """
    try:
        with open(output_file, mode='w', encoding='utf-8') as sink:
            json.dump(data, sink, indent=4, ensure_ascii=False)
    except Exception as err:
        print(f"Error occurred while saving the file: {err}")
        sys.exit(1)

def contains_keywords(text, keywords):
    """Return True when at least one keyword occurs in *text* (case-sensitive substring match)."""
    return any(keyword in text for keyword in keywords)

def remove_entries_with_keywords(data, keywords):
    """Drop Q&A entries whose 'instruction' or 'output' contains any keyword.

    Missing fields are treated as empty strings. Returns a tuple
    ``(kept_entries, number_removed)``.
    """
    kept = []
    removed = 0
    for record in data:
        question = record.get('instruction', '')
        answer = record.get('output', '')
        # The answer is only inspected when the question is clean
        if contains_keywords(question, keywords) or contains_keywords(answer, keywords):
            removed += 1
            continue
        kept.append(record)
    return kept, removed

def remove_duplicate_questions(data):
    """
    Drop entries whose 'instruction' text exactly repeats an earlier entry's.

    Each instruction is fingerprinted with MD5 and the fingerprints are kept
    in a set, so the first occurrence is retained and later ones are removed.
    A missing 'instruction' is treated as an empty string.
    Returns a tuple ``(kept_entries, number_removed)``.
    """
    unique_entries = []
    fingerprints = set()
    removed = 0
    for record in data:
        question = record.get('instruction', '')
        digest = hashlib.md5(question.encode('utf-8')).hexdigest()
        if digest not in fingerprints:
            fingerprints.add(digest)
            unique_entries.append(record)
        else:
            removed += 1
    return unique_entries, removed

def remove_short_qnas(data, min_question_length=10, min_answer_length=20):
    """Drop entries whose question or answer is shorter than the given minimum.

    Lengths are measured in characters; missing fields count as empty
    strings. Returns a tuple ``(kept_entries, number_removed)``.
    """
    kept = []
    removed = 0
    for record in data:
        question = record.get('instruction', '')
        answer = record.get('output', '')
        if len(question) >= min_question_length and len(answer) >= min_answer_length:
            kept.append(record)
        else:
            removed += 1
    return kept, removed

def remove_similar_questions(data, similarity_threshold=0.8):
    """
    Use sentence embeddings and cosine similarity to remove similar Q&A entries.

    The 'instruction' of every entry (empty string when missing) is embedded
    with the 'all-MiniLM-L6-v2' SentenceTransformer model. For every pair of
    entries (i, j) with i < j whose cosine similarity is strictly greater than
    ``similarity_threshold``, the later entry j is removed, so the earliest
    entry of any similar pair is always the one that survives.

    Args:
        data: List of Q&A entries (dicts).
        similarity_threshold: Pairs scoring above this value count as similar.

    Returns:
        A tuple ``(filtered_data, deleted_count)``.
    """
    # Fewer than two entries cannot form a pair; return before loading the
    # model or computing a similarity matrix on empty input.
    if len(data) < 2:
        return list(data), 0

    instructions = [entry.get('instruction', '') for entry in data]
    model = SentenceTransformer('all-MiniLM-L6-v2')  # Choose an appropriate pre-trained model
    embeddings = model.encode(instructions, convert_to_tensor=False)

    cosine_sim = cosine_similarity(embeddings)

    # Only the strict upper triangle is inspected, so each pair is seen once
    # (as row < column) and the diagonal of self-similarities is skipped.
    upper_tri = np.triu_indices(len(cosine_sim), k=1)
    is_similar = cosine_sim[upper_tri] > similarity_threshold

    # The mask is aligned with the flattened upper triangle; map it back to
    # entry indices through the column array, which holds the later entry
    # of each pair.
    later_indices = upper_tri[1]
    to_delete = set(later_indices[is_similar].tolist())

    filtered_data = [entry for idx, entry in enumerate(data) if idx not in to_delete]
    deleted_count = len(to_delete)
    return filtered_data, deleted_count

def main():
    """Run the Q&A cleaning pipeline and print a summary.

    Loads 'aggregated_qna.json', removes entries containing boilerplate
    keywords, exact duplicate questions and semantically similar questions,
    then writes the result to 'cleaned_qna.json' and prints the counts.
    The short-entry filter (step 4) is currently disabled.
    """
    # Configuration parameters
    input_file = 'aggregated_qna.json'          # Input file name
    output_file = 'cleaned_qna.json'       # Output file name
    keywords = [
        'It seems that',
        'in the study',
        'It seems',
        'To provide a more comprehensive answer',
        'It appears that',
        'main findings'
    ]  # List of keywords to filter, case-sensitive
    # min_question_length = 10               # Minimum length of the question (characters)
    # min_answer_length = 20                 # Minimum length of the answer (characters)
    similarity_threshold = 0.8              # Similarity threshold

    # Step 1: Load data
    data = load_data(input_file)
    original_count = len(data)

    # Step 2: Remove entries with keywords
    data, deleted_keywords = remove_entries_with_keywords(data, keywords)

    # Step 3: Remove duplicate Q&A entries
    data, deleted_duplicates = remove_duplicate_questions(data)

    # # Step 4: Remove short Q&A entries (disabled)
    # data, deleted_short = remove_short_qnas(data, min_question_length, min_answer_length)

    # Step 5: Remove similar Q&A entries
    data, deleted_similar = remove_similar_questions(data, similarity_threshold)

    # Count the retained records
    retained_count = len(data)
    # Step 4 is disabled, so there is no short-entry count to include; add
    # `deleted_short` back here if that step is re-enabled.
    total_deleted = deleted_keywords + deleted_duplicates + deleted_similar

    # Step 6: Save the cleaned data
    save_data(data, output_file)

    # Output statistics
    print(f"Original record count: {original_count} entries")
    print(f"Deleted records containing keywords: {deleted_keywords} entries")
    print(f"Deleted duplicate records: {deleted_duplicates} entries")
    # print(f"Deleted short records: {deleted_short} entries")
    print(f"Deleted similar records: {deleted_similar} entries")
    print(f"Total deleted records: {total_deleted} entries")
    print(f"Retained records: {retained_count} entries")
    print(f"Cleaned data has been saved to '{output_file}'.")

# Run the cleaning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
