# Claim Decomposition

## My Prompt

In [None]:
import json
import ollama
from tqdm import tqdm

# Input claims and output location for this prompt variant.
# Forward slashes are used on purpose: inside a normal string literal a
# backslash followed by 'f' is the form-feed escape, so the previous
# backslash-separated input path silently pointed at a non-existent file, and
# backslash + 'q' is an invalid escape sequence. '/' is accepted as a path
# separator on Windows as well as on POSIX systems.
INPUT_FILE = 'data/filtered_claims.json'
OUTPUT_FILE = 'data/questions_decomposed_my_prompt.json'
# Ollama model tag handed to ollama.chat(); the commented line is a smaller
# alternative that can be swapped in.
MODEL_NAME = "deepseek-r1:32b"
# MODEL_NAME = "qwen2.5:7b"

# System prompt for the custom ("My Prompt") claim-decomposition variant. It
# instructs the model to produce exactly three self-contained verification
# questions per claim as JSON, and embeds two worked examples.
# NOTE(review): the prompt describes both input and output as JSON *lists*,
# whereas process_claim_test() sends one claim per request — confirm which
# output shape the downstream code expects.
# NOTE(review): the examples assume claims carry `claim`, `crawled_date` and
# `country_of_origin` keys — verify against the input file.
SYSTEM_PROMPT = """
# Role
You are an expert fact-checker. Your task is "Claim Decomposition." You will receive a list of claims with metadata. For each claim, you must generate exactly 3 sub-questions that, if answered, would verify the truthfulness of the claim.

# Input Format
A JSON list of objects containing the claim and available metadata (date/country).

# Output Format
Provide a strictly valid JSON response containing a list of objects. Each object must correspond to the input claims in the exact same order.
Use the following schema:
[
  {
    "claim": "The original claim text",
    "questions": [
      "Question 1",
      "Question 2",
      "Question 3"
    ]
  }
]

# Instructions
1.  **Analyze the Claim:** Identify the specific entities (who/what), the action (what happened), and the exact values or predicates (how much/details).
    * Metadata Usage: If there are any ambiguities in the claim that can be clarified using the provided metadata, use that information, like `crawled_date` or `country_of_origin`. (e.g., "this year" -> specific year).
2.  **Formulate Verification Questions:** Create 3 questions that verify the factual accuracy of these components.
    * Numerical Precision: If the claim contains a specific number, quantity, or ranking (e.g., "3000 crores", "1%", "5th largest"), ensure questions verify those figures.
    * Decompose the Predicate: If the claim implies a legal or definitions-based contradiction (e.g., "Underage marriage"), ensure one question verifies the definitions/laws and another verifies the actual event.

# Strict Constraints
* **Self-Containment:** Every question must be fully understandable without the original claim. Replace all pronouns (e.g., "he," "it," "the bill," "the year") with the specific full names and entities provided in the text.
* **Metadata-Grounded Entity Resolution:** Use only the entities and figures found in the claim. Do not introduce external names or laws. However, you MUST resolve relative time and location phrases (e.g., "today," "this country") into absolute terms (e.g., "March 7, 2024," "India") using the provided `crawled_date` and `country_of_origin`.
* **Verification Intent:** Focus on the factual truth of the event or figure. Frame questions to ask if the facts are true (e.g., "Did X occur?") rather than asking what the text "says" or "claims."

# Examples

## Example 1
Input:
{
  "claim": "The Mayor of Paris was fined 90,000 euros for breaching gender parity staffing rules in 2020.",
  "crawled_date": "2020-12-16",
  "country_of_origin": "france"
}
Generated Questions:
1. "Was the Mayor of Paris fined exactly 90,000 euros in the year 2020?"
2. "Was the fine against the Mayor of Paris issued specifically for a breach of gender parity staffing rules?"
3. "Are there official records from 2020 confirming a 90,000 euro fine against the Mayor of Paris related to staffing regulations?"

## Example 2
Input:
{
    "claim": "As per today's news, farmers are protesting all over country against the increase of tax rates from 4% to 5%.",
    "crawled_date": "2024-03-07",
    "country_of_origin": "india" 
}
Generated Questions:
1. "Are there reports from March 7, 2024, regarding farmers in India protesting against an increase in tax rates from 4% to 5%?"
2. "Is there evidence of nationwide farmer protests occurring in India in March 2024, specifically regarding tax rate changes?"
3. "Are there official reports stating the increase in tax rates from 4% to 5% for farmers as of March 2024?"

# Task
Now, process the following input JSON and output ONLY the JSON response:
"""

def process_claim_test(claim_data):
    """Ask the configured Ollama model to decompose a single claim.

    The claim is serialised to JSON and sent as the user message, with
    SYSTEM_PROMPT as the system message and JSON-formatted output requested.

    Args:
        claim_data: JSON-serialisable object describing one claim.

    Returns:
        The model reply parsed from JSON, or None when any step (request,
        reply access, JSON parsing) raises. The error is printed, never
        re-raised, so one bad claim cannot abort a long run.
    """
    try:
        user_message = f"Process this claim: {json.dumps(claim_data)}"
        conversation = [
            {'role': 'system', 'content': SYSTEM_PROMPT},
            {'role': 'user', 'content': user_message},
        ]
        reply = ollama.chat(model=MODEL_NAME, messages=conversation, format='json')
        parsed_reply = json.loads(reply['message']['content'])
    except Exception as e:
        print(f"Error processing claim: {e}")
        return None

    return parsed_reply

def main():
    """Decompose every claim in INPUT_FILE and write the results to OUTPUT_FILE.

    Prints an error and returns early when INPUT_FILE does not exist. Claims
    for which process_claim_test() yields a falsy result (None on error, or
    an empty reply) are left out of the output.
    """
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as source:
            all_claims = json.load(source)
    except FileNotFoundError:
        print(f"Error: Could not find '{INPUT_FILE}'. Please check the file name.")
        return

    # Process everything by default; lower this (e.g. to 10) for a quick trial run.
    limit = len(all_claims)
    batch = all_claims[:limit]

    print(f"Starting run on {len(batch)} claims using {MODEL_NAME}...")

    results = []
    for entry in tqdm(batch, desc="Processing Claims"):
        decomposed = process_claim_test(entry)
        if not decomposed:
            # The failure has already been reported by process_claim_test.
            continue
        results.append(decomposed)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as sink:
        json.dump(results, sink, indent=2, ensure_ascii=False)

    print(f"\nDone! Processed {len(results)} claims. Saved to '{OUTPUT_FILE}'.")


# Run the pipeline when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

Starting run on 1 claims using qwen2.5:7b...


Processing Claims: 100%|██████████| 1/1 [01:40<00:00, 100.93s/it]


Done! Processed 1 claims. Saved to 'questions_decomposed_my_prompt.json'.





## LIS Prompt

In [None]:
import json
import ollama
from tqdm import tqdm

# Input claims and output location for the LIS prompt variant.
# Forward slashes are used on purpose: inside a normal string literal a
# backslash followed by 'f' is the form-feed escape, so the previous
# backslash-separated input path silently pointed at a non-existent file, and
# backslash + 'q' is an invalid escape sequence. '/' is accepted as a path
# separator on Windows as well as on POSIX systems.
INPUT_FILE = 'data/filtered_claims.json'
OUTPUT_FILE = 'data/questions_decomposed_LIS_prompt.json'
# Ollama model tag handed to ollama.chat(); the commented line is a smaller
# alternative that can be swapped in.
MODEL_NAME = "deepseek-r1:32b"
# MODEL_NAME = "qwen2.5:7b"

# System prompt for the "LIS Prompt" claim-decomposition variant. It asks the
# model to interpret the claim and then emit exactly three stand-alone
# questions, returned in the JSON schema shown under "Output Format".
# NOTE(review): the instructions ask for an interpretation and a
# missing-information analysis, but the output schema only has `claim` and
# `questions` fields — confirm whether those intermediate steps are meant to
# appear in the reply.
# NOTE(review): the prompt describes both input and output as JSON *lists*,
# whereas process_claim_test() sends one claim per request — confirm which
# output shape the downstream code expects.
SYSTEM_PROMPT = """
# Role
You are an expert fact-checker. Your task is "Claim Decomposition."

# Input Format
A JSON list of objects containing the claim and available metadata (date/country).

# Output Format
Provide a strictly valid JSON response containing a list of objects. Each object must correspond to the input claims in the exact same order.
Use the following schema:
[
  {
    "claim": "The original claim text",
    "questions": [
      "Question 1",
      "Question 2",
      "Question 3"
    ]
  }
]

# Instructions
You are a fact-checker. Your general motivation is to verify a given claim. You are at the beginning of the fact-checking process, meaning you have just received the claim, optionally with some additional
metadata (such as the date of the claim or the author), if available. Your task now is to prepare the fact-check. That means:
1. Begin with an interpretation of the claim. As part of the interpretation, list the key points of the claim as a list of reformulated sub-claims.
2. Then, analyze what information is missing.
3. Finally, present a complete, numbered list of EXACTLY 3 questions: These are questions that explore the truthfulness of the claim and that we need to answer in order to factually verify the claim.

IMPORTANT:
Follow these rules:
* Do not use external knowledge. Use ONLY the information provided in the claim text to formulate the entities in the question.
* Phrase each question so that it can be understood independently and without additional context. Be explicit and do not use pronouns or generic terms in place of names or objects.
* STOP after generating the 3rd question.

# Examples
Claim: "The new Food Bill in New Zealand bans gardening"
Good Question: "Did the New Zealand government pass a food bill that restricted gardening activities for its citizens?"
Bad Question: "Did the government pass a bill?"
Bad Question: "Did the bill restrict activities?"

# Task
Now, process the following input JSON and output ONLY the JSON response:

"""

def process_claim_test(claim_data):
    """Send one claim to the Ollama model and return its parsed JSON reply.

    SYSTEM_PROMPT is used as the system message; the claim, serialised to
    JSON, forms the user message. JSON-formatted output is requested.

    Args:
        claim_data: JSON-serialisable object describing one claim.

    Returns:
        The decoded reply, or None if building the request, calling the model
        or decoding the reply raised. The error is printed rather than
        propagated.
    """
    try:
        system_turn = {'role': 'system', 'content': SYSTEM_PROMPT}
        user_turn = {
            'role': 'user',
            'content': f"Process this claim: {json.dumps(claim_data)}",
        }
        answer = ollama.chat(
            model=MODEL_NAME,
            messages=[system_turn, user_turn],
            format='json',
        )
        decoded = json.loads(answer['message']['content'])
    except Exception as e:
        print(f"Error processing claim: {e}")
        return None

    return decoded

def main():
    """Run the LIS-prompt decomposition over INPUT_FILE and save to OUTPUT_FILE.

    Prints an error and returns early when INPUT_FILE is missing. Only truthy
    results from process_claim_test() are kept; failed claims are omitted
    from the output.
    """
    try:
        with open(INPUT_FILE, 'r', encoding='utf-8') as claims_fh:
            all_claims = json.load(claims_fh)
    except FileNotFoundError:
        print(f"Error: Could not find '{INPUT_FILE}'. Please check the file name.")
        return

    # Full run by default; use a small number here (e.g. 10) for a smoke test.
    max_claims = len(all_claims)
    selected = all_claims[:max_claims]

    print(f"Starting run on {len(selected)} claims using {MODEL_NAME}...")

    # tqdm is consumed lazily, so the progress bar advances once per claim.
    attempts = (process_claim_test(item) for item in tqdm(selected, desc="Processing Claims"))
    results = [outcome for outcome in attempts if outcome]

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out_fh:
        json.dump(results, out_fh, indent=2, ensure_ascii=False)

    print(f"\nDone! Processed {len(results)} claims. Saved to '{OUTPUT_FILE}'.")


# Run the pipeline when executed as a script or notebook cell.
if __name__ == "__main__":
    main()

Starting run on 10 claims using qwen2.5:7b...


Processing Claims: 100%|██████████| 10/10 [05:09<00:00, 30.93s/it]


Done! Processed 10 claims. Saved to 'questions_decomposed_LIS_prompt.json'.





# Build indexes

In [None]:
import json
import numpy as np
import bm25s
from Stemmer import Stemmer
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
from pathlib import Path
import re

## Configuration

# Source corpus and the directory that receives the built indexes.
CORPUS_FILE = Path("data") / "corpus_evidence_unified.json"
INDEX_DIR = Path("indexes")

# Artefacts written by this cell: the BM25 index, the FAISS index, and the
# list of document ids.
SPARSE_INDEX_PATH = INDEX_DIR / "bm25s_index"
DENSE_INDEX_PATH = INDEX_DIR / "faiss_index.bin"
DOC_IDS_PATH = INDEX_DIR / "doc_ids.json"

# Sentence-embedding model used for the dense index.
DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

# Create the index directory up front so the later save calls have a target.
INDEX_DIR.mkdir(exist_ok=True)

## Load and preprocess corpus

# Read the whole corpus into memory.
# NOTE(review): presumably a mapping of document id -> document text; confirm
# against the corpus file.
with open(CORPUS_FILE, 'r', encoding='utf-8') as f:
    corpus_dict = json.load(f)

# Split into two parallel lists. Dicts keep insertion order, so doc_ids[i]
# is the key of documents[i].
doc_ids = [*corpus_dict]
documents = [*corpus_dict.values()]

## Build sparse index

def preprocess_text(text):
    """Lower-case *text* and split it into word tokens.

    A token is a maximal run of regex word characters (letters, digits,
    underscore); every other character acts as a separator.

    Returns:
        A list of token strings, empty when *text* contains no word characters.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# English stemmer applied to every token before indexing.
stemmer = Stemmer('english')

# Tokenise lazily through map() so the tqdm bar still advances per document,
# then stem each token list.
corpus_tokens = [
    stemmer.stemWords(tokens)
    for tokens in map(preprocess_text, tqdm(documents))
]

# Build the BM25 index over the stemmed tokens and save it.
retriever = bm25s.BM25()
retriever.index(corpus_tokens)
retriever.save(SPARSE_INDEX_PATH)

## Build dense index

model = SentenceTransformer(DENSE_MODEL)

# Embed every document; the cast is needed because FAISS requires float32.
doc_embeddings = model.encode(
    documents,
    show_progress_bar=True,
    convert_to_numpy=True,
).astype('float32')

# Flat inner-product index sized to the embedding width.
# NOTE(review): inner product equals cosine similarity only for unit-length
# vectors — confirm the chosen model normalises its output.
embedding_dim = doc_embeddings.shape[-1]
index = faiss.IndexFlatIP(embedding_dim)
index.add(doc_embeddings)
faiss.write_index(index, str(DENSE_INDEX_PATH))

# Save the ids in the same order as the vectors that were added to the index.
with open(DOC_IDS_PATH, 'w') as f:
    json.dump(doc_ids, f)

# Evidence Retrieval

In [None]:
import json
import numpy as np
import pandas as pd
import bm25s
from Stemmer import Stemmer
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
from pathlib import Path
import re
from collections import defaultdict

## Configuration

# Directories used by the retrieval step.
INDEX_DIR = Path("indexes")
DATA_DIR = Path("data")
OUTPUT_DIR = Path("output")

# Index artefacts.
SPARSE_INDEX_PATH = INDEX_DIR / "bm25s_index"
DENSE_INDEX_PATH = INDEX_DIR / "faiss_index.bin"
DOC_IDS_PATH = INDEX_DIR / "doc_ids.json"

# Inputs: decomposed questions and the evidence corpus.
CLAIMS_FILE = DATA_DIR / "final_decomposed_questions_list.csv"
CORPUS_FILE = DATA_DIR / "corpus_evidence_unified.json"

# Output: make sure the directory exists before any result file is written.
OUTPUT_DIR.mkdir(exist_ok=True)
RESULTS_FILE = OUTPUT_DIR / "retrieval_results_top10.csv"

# Sentence-embedding model name.
DENSE_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'

# Number of claims to process; switch to the commented line for the full file.
# NUM_CLAIMS_TO_PROCESS = len(pd.read_csv(CLAIMS_FILE))
NUM_CLAIMS_TO_PROCESS = 100
# Retrieval depth (the results file name refers to "top10").
TOP_K = 10