In [1]:
import re
import json


In [31]:
def clean_answer(ans):
    """Clean the answer string according to rules."""
    if ans is None:
        return None  # Keep nulls as-is

    # --- Remove ANY "Ans." or "Answer" patterns (aggressive matching) ---
    # Handles: Ans. A, Ans:A, Ans 'B', Ans is 'C', Ans- D, ans.1, ans: i.e.,
    # Also catches: Answe:, ANS--, ans = 'D', Answer: A
    ans = re.sub(
        r"\bAns(?:wer)?\b\s*[\.:=\-–—]*\s*(is)?\s*['\"]?\(?[a-dA-D0-9]+\)?['\"]?\s*(i\.?\s*e\.?,?)?\s*",
        "",
        ans,
        flags=re.IGNORECASE
    )

    # --- Remove leading option markers like "(a)", "'a'", "a)", "1)", etc. ---
    ans = re.sub(r"^\s*\(?[a-dA-D0-9]\)?\s*(i\.?\s*e\.?,?)?\s*", "", ans)

    # --- Fix missing space after punctuation ---
    ans = re.sub(r"([.,;:!?])([A-Za-z])", r"\1 \2", ans)

    # --- Replace multiple dots with a single dot ---
    ans = re.sub(r"\.{2,}", ".", ans)

    # --- Normalize multiple spaces ---
    ans = re.sub(r"\s+", " ", ans)

    # --- Strip leading/trailing spaces & quotes ---
    ans = ans.strip(" '\"\t\r\n")

    return ans if ans else None  # Set to None if empty


In [33]:
def clean_jsonl(input_path, output_path):
    with open(input_path, 'r', encoding='utf-8') as infile, \
         open(output_path, 'w', encoding='utf-8') as outfile:

        for line in infile:
            if not line.strip():
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Skipping malformed line: {e}")
                continue

            if "answer" in obj:
                obj["answer"] = clean_answer(obj["answer"])

            outfile.write(json.dumps(obj, ensure_ascii=False) + "\n")

In [35]:
# Example usage
input_file = "C:\\Users\\adiga\\OneDrive\\Desktop\\For training\\unified_training.jsonl"
output_file = "C:\\Users\\adiga\\OneDrive\\Desktop\\For training\\merged_corpus_cleaned_Final.jsonl"
clean_jsonl(input_file, output_file)
print(f"Cleaned file saved to {output_file}")

Cleaned file saved to C:\Users\adiga\OneDrive\Desktop\For training\merged_corpus_cleaned_Final.jsonl


In [37]:
import re
import json

def scan_for_ans_patterns(input_path, report_path):
    ans_pattern = re.compile(r"\bAns\b", re.IGNORECASE)
    leftovers = []

    with open(input_path, 'r', encoding='utf-8') as infile:
        for idx, line in enumerate(infile, start=1):
            if not line.strip():
                continue

            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            ans_text = obj.get("answer")
            if ans_text and ans_pattern.search(ans_text):
                leftovers.append({
                    "line_number": idx,
                    "id": obj.get("id"),
                    "answer": ans_text.strip()
                })

    with open(report_path, 'w', encoding='utf-8') as outfile:
        json.dump(leftovers, outfile, indent=2, ensure_ascii=False)

    print(f"Found {len(leftovers)} possible leftover 'Ans.' patterns. Report saved to {report_path}")

# Example usage
input_file = "C:\\Users\\adiga\\OneDrive\\Desktop\\For training\\merged_corpus_cleaned_Final.jsonl"   # already cleaned file
output_report = "C:\\Users\\adiga\\OneDrive\\Desktop\\For training\\leftover_ans_patterns.json"

scan_for_ans_patterns(input_file, output_report)


Found 2672 possible leftover 'Ans.' patterns. Report saved to C:\Users\adiga\OneDrive\Desktop\For training\leftover_ans_patterns.json
