In [55]:
#working
import os
import re
import json
import csv
import fitz  # PyMuPDF for PDF text extraction
from pathlib import Path

# Locations of the codebook PDF, the response CSV, and the JSON output folder
codebook_pdf = "anes/anes_timeseries_2024_userguidecodebook_20250219-2.pdf"
data_csv = "anes/anes_timeseries_2024_csv_20250219.csv"
output_dir = "respondents"

# Create the output folder up front; an already-existing folder is fine
os.makedirs(output_dir, exist_ok=True)

def clean_text(text):
    """Collapse every run of whitespace into one space and trim both ends."""
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

def remove_prefixes(text):
    """Strip one leading tag such as PRE:, POST: or WEB: from a summary.

    Matching is case-insensitive. Only a tag at the very start of the string
    is removed, together with the whitespace that follows it; the result is
    then trimmed on both ends.
    """
    prefix_pattern = r'^(PRE:|PAPER:|WEB:|POST:|RESTRICTED:|FTF:|CASI:|CAPI:|VIDEO:|PAPI:)\s*'
    without_tag = re.sub(prefix_pattern, '', text, flags=re.IGNORECASE)
    return without_tag.strip()


def extract_codebook_data(pdf_path):
    """Extract variable codes, summaries, question text and value labels from the codebook PDF.

    Only entries laid out as ``V###### PRE: <summary>`` followed by the
    section headings ``Question``, ``Value Labels`` and ``Survey Question``
    (each on its own line) are captured; anything else is skipped.

    Args:
        pdf_path: Path to the codebook PDF, passed to ``fitz.open``.

    Returns:
        A dict mapping each variable code (e.g. ``"V241063"``) to a dict with
        the keys ``"summary"``, ``"question_text"`` and ``"possible_answers"``.
        ``"possible_answers"`` is a list of ``{"code": str, "text": str}``
        dicts in the order they appear in the Value Labels section. If a
        variable code matches more than once, the last match wins.
    """
    # Open the PDF in a context manager so the handle is released even if
    # text extraction raises part-way through.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)

    # DOTALL lets the summary, question and value-label groups span lines.
    var_pattern = re.compile(
        r"(V\d{6})\s+PRE:\s*(.*?)\nQuestion\n(.*?)\nValue Labels\n(.*?)\nSurvey Question",
        re.DOTALL
    )
    # "<code>. <label>" where the code may be negative (e.g. "-9. Refused").
    answer_pattern = re.compile(r"^(-?\d+)\.\s*(.*)$")

    questions = {}

    for match in var_pattern.finditer(text):
        var_code = match.group(1).strip()
        # Summaries and questions may wrap across lines in the PDF; join them.
        summary = match.group(2).replace("\n", " ").strip()
        question_text = match.group(3).replace("\n", " ").strip()

        answers = []
        for line in match.group(4).strip().split("\n"):
            # Strip each line so surrounding whitespace from the PDF layout
            # neither blocks the match nor ends up inside the label text.
            ans_match = answer_pattern.match(line.strip())
            if ans_match:
                answers.append({"code": ans_match.group(1), "text": ans_match.group(2)})

        questions[var_code] = {
            "summary": summary,
            "question_text": question_text,
            "possible_answers": answers
        }

    print(f"✅ Extracted {len(questions)} questions from the Codebook PDF.")
    return questions


def generate_json(data_csv, questions, output_dir):
    """Write one JSON file per CSV row, pairing each question with the respondent's answer.

    Args:
        data_csv: Path to the UTF-8 CSV file; the ``V240001`` column supplies
            the respondent ID used in the output filename.
        questions: Dict mapping variable codes to dicts with the keys
            ``"summary"``, ``"question_text"`` and ``"possible_answers"``
            (a list of ``{"code": ..., "text": ...}`` dicts).
        output_dir: Directory that receives ``respondent_<id>.json`` files.
            It is created if it does not exist yet.

    For every question, ``respondent_answer`` is the label whose code equals
    the stripped CSV value, the raw stripped value when no label matches, or
    ``None`` when the row has no value for that variable. Rows that share a
    respondent ID overwrite each other's file.
    """
    # Do not rely on the caller having created the target directory.
    os.makedirs(output_dir, exist_ok=True)

    # Build each code -> label lookup once instead of rescanning the answer
    # list for every row. setdefault keeps the first label for a repeated
    # code, which matches a first-match scan of the list.
    label_lookup = {}
    for var_code, question_data in questions.items():
        labels = {}
        for ans in question_data["possible_answers"]:
            labels.setdefault(ans["code"], ans["text"])
        label_lookup[var_code] = labels

    with open(data_csv, newline='', encoding='utf-8') as csvfile:
        for row in csv.DictReader(csvfile):
            # DictReader fills missing trailing fields with None, so guard
            # before calling .strip() on any cell.
            raw_id = row.get("V240001", "unknown")
            respondent_id = "unknown" if raw_id is None else raw_id.strip()

            responses = []
            for var_code, question_data in questions.items():
                raw_value = row.get(var_code)
                if raw_value is None:
                    # Column absent from the CSV, or the row was cut short.
                    response_text = None
                else:
                    response = raw_value.strip()
                    # Fall back to the raw code when the codebook has no label.
                    response_text = label_lookup[var_code].get(response, response)

                responses.append({
                    "variable_code": var_code,
                    "question_summary": question_data["summary"],
                    "full_question_text": question_data["question_text"],
                    "possible_answers": question_data["possible_answers"],
                    "respondent_answer": response_text
                })

            respondent_record = {"respondent_id": respondent_id, "responses": responses}

            out_path = os.path.join(output_dir, f"respondent_{respondent_id}.json")
            with open(out_path, 'w', encoding='utf-8') as jsonfile:
                json.dump(respondent_record, jsonfile, indent=2)

    print("✅ Generated JSON files for respondents.")

# Run both stages: parse the codebook, then write one JSON file per respondent
codebook_data = extract_codebook_data(pdf_path=codebook_pdf)
generate_json(data_csv=data_csv, questions=codebook_data, output_dir=output_dir)


✅ Extracted 404 questions from the Codebook PDF.
✅ Generated JSON files for respondents.
