In [4]:
import os
import sys
from pathlib import Path

import pandas as pd

# Project root for imports.
# The default is the original author's local checkout. Set the WVS_PROJECT_ROOT
# environment variable to point the notebook at a checkout in another location
# without editing this cell.
PROJECT_ROOT = Path(os.environ.get("WVS_PROJECT_ROOT", r"F:\New folder\Research\Project 1"))

# Make the project's `src` package importable. The membership check keeps
# sys.path free of duplicate entries when the cell is re-run.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.utils.io import write_jsonl

# Build a per-respondent values_norms.jsonl from (at most) 100 rows of the cleaned CSV.
_processed_dir = PROJECT_ROOT.joinpath("data", "processed")
CSV_PATH = _processed_dir / "wvs_india_q1_q26_clean.csv"
OUT_JSONL = _processed_dir / "values_norms.jsonl"

# Read the cleaned survey responses.
_df = pd.read_csv(CSV_PATH)

# With 100 or more rows, draw a fixed-seed random sample of exactly 100 so the
# selection is repeatable; a smaller file is used whole, in its original order.
# Either way the index is renumbered from 0.
if len(_df) >= 100:
    _sample = _df.sample(n=100, random_state=42)
else:
    _sample = _df
_sample = _sample.reset_index(drop=True)

# Define stems and options consistent with existing file.
# Every *_cols list holds (cleaned-CSV column, question stem) pairs and every
# *_options list holds that block's answer labels in presentation order.
# Stems that share a sentence frame are generated from a template so the
# wording cannot drift between items.

# Q1–Q6: importance-in-life items.
_importance_options = ["Very important", "Rather important", "Not very important", "Not at all important"]
_importance_cols = [
    (_column, f"How important {_verb} {_topic} in your life?")
    for _column, _verb, _topic in [
        ("Q1_family_importance", "is", "Family"),
        ("Q2_friends_importance", "are", "Friends"),
        ("Q3_leisure_time_importance", "is", "Leisure time"),
        ("Q4_politics_importance", "is", "Politics"),
        ("Q5_work_importance", "is", "Work"),
        ("Q6_religion_importance", "is", "Religion"),
    ]
]

# Q7–Q17: child qualities. Q7–Q11 use the long wording, Q12–Q17 the short one.
_q7_17_options = ["Mentioned", "Not mentioned"]
_q7_17_cols = [
    (_column, f"Is '{_quality}' among the top five qualities children should learn at home?")
    for _column, _quality in [
        ("Q7_child_good_manners_mentioned", "Good manners"),
        ("Q8_child_independence_mentioned", "Independence"),
        ("Q9_child_hard_work_mentioned", "Hard work"),
        ("Q10_child_responsibility_mentioned", "Feeling of responsibility"),
        ("Q11_child_imagination_mentioned", "Imagination"),
    ]
] + [
    (_column, f"Is '{_quality}' among the top five qualities?")
    for _column, _quality in [
        ("Q12_child_tolerance_respect_mentioned", "Tolerance and respect for other people"),
        ("Q13_child_thrift_saving_mentioned", "Thrift, saving money and things"),
        ("Q14_child_determination_mentioned", "Determination, perseverance"),
        ("Q15_child_religious_faith_mentioned", "Religious faith"),
        ("Q16_child_unselfishness_mentioned", "Not being selfish (unselfishness)"),
        ("Q17_child_obedience_mentioned", "Obedience"),
    ]
]

# Q18–Q26: groups the respondent would not like as neighbours.
_q18_26_options = ["Would not like as neighbours", "Would like as neighbours"]
_q18_26_cols = [
    (_column, f"Would you not like to have {_group} as neighbours?")
    for _column, _group in [
        ("Q18_neighbor_drug_addicts_not_like", "Drug addicts"),
        ("Q19_neighbor_diff_race_not_like", "People of a different race"),
        ("Q20_neighbor_people_with_AIDS_not_like", "People who have AIDS"),
        ("Q21_neighbor_immigrants_foreign_workers_not_like", "Immigrants/foreign workers"),
        ("Q22_neighbor_homosexuals_not_like", "Homosexuals"),
        ("Q23_neighbor_diff_religion_not_like", "People of a different religion"),
        ("Q24_neighbor_heavy_drinkers_not_like", "Heavy drinkers"),
        ("Q25_neighbor_unmarried_couples_not_like", "Unmarried couples living together"),
        ("Q26_neighbor_diff_language_not_like", "People who speak a different language"),
    ]
]

# Question blocks in output order: (column/stem pairs, option labels, coded?).
# A "coded" block stores numeric codes that are translated to labels below;
# the Q1–Q6 block already stores the label text and is emitted as-is.
_blocks = (
    (_importance_cols, _importance_options, False),  # Q1–Q6: categorical strings as-is
    (_q7_17_cols, _q7_17_options, True),             # Q7–Q17: 1 -> Mentioned, else Not mentioned
    (_q18_26_cols, _q18_26_options, True),           # Q18–Q26: 1 -> Would not like, else Would like
)

# One record per (respondent, answered question). `respondent_id` is the row
# position inside `_sample` (its index was reset when the sample was drawn),
# not an identifier taken from the survey file.
_records = []
for _rid, _row in _sample.iterrows():
    for _cols, _options, _coded in _blocks:
        for _col, _stem in _cols:
            _val = _row[_col]
            if pd.isna(_val):
                # Unanswered item: emit nothing for this respondent/question.
                continue
            if _coded:
                # Only the code 1 selects the first option; every other
                # non-null code falls through to the second option.
                _ans = _options[0] if int(_val) == 1 else _options[1]
            else:
                _ans = str(_val)
            _records.append({
                "stem": _stem,
                "options": _options,
                "answer": _ans,
                "meta": {
                    "source": "WVS India 2023",
                    # Column names start with the question number, e.g. "Q7_...".
                    "question_id": _col.split("_")[0],
                    "respondent_id": int(_rid),
                },
            })

# Write JSONL
# NOTE(review): the majority-vote cell in this notebook also writes a
# values_norms.jsonl under data/processed; if both paths resolve to the same
# file, whichever cell runs last wins — confirm that is intended.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
write_jsonl(str(OUT_JSONL), _records)
len(_records)


2537

# 01_build_values_dataset — Build values_norms.jsonl from WVS India

This notebook converts the cleaned WVS India Q1–Q26 data into a small discriminative values dataset:
- Reads: `data/processed/wvs_india_q1_q26_clean.parquet` (falls back to `data/processed/wvs_india_q1_q26_clean.csv` if the parquet file is missing)
- Writes: `data/processed/values_norms.jsonl`

Each item contains: `stem`, `options`, `answer`, `meta`. Answers are set to the majority response in the survey (per question); ties are broken by the order in which the options are listed.


In [1]:
from pathlib import Path
import json
import pandas as pd

# Inputs and outputs live in the project-level data/processed folder. The path
# is relative, so this presumably expects the kernel to be running from the
# notebooks/ directory.
DATA_DIR = Path("../data/processed")
CLEAN_PARQUET = DATA_DIR.joinpath("wvs_india_q1_q26_clean.parquet")
CLEAN_CSV = DATA_DIR.joinpath("wvs_india_q1_q26_clean.csv")
OUT_JSONL = DATA_DIR.joinpath("values_norms.jsonl")

# Prefer the parquet export, fall back to the CSV, and fail early with a
# pointer to the cleaning notebook when neither file is present.
if CLEAN_PARQUET.exists():
    df = pd.read_parquet(CLEAN_PARQUET)
elif CLEAN_CSV.exists():
    df = pd.read_csv(CLEAN_CSV)
else:
    raise FileNotFoundError(
        "Run notebooks/04_clean_wvs_india.ipynb first to generate the cleaned file."
    )

# Helper: choose majority value with deterministic tie-break by option order

def majority_label(series, ordered_options):
    """Return the most frequent non-null value in ``series``.

    Ties for the highest count are broken by position in ``ordered_options``
    (the earliest listed option wins), which keeps the result deterministic.

    Args:
        series: pandas Series of responses; null entries are ignored.
        ordered_options: option labels in tie-break priority order.

    Returns:
        The winning label, or ``None`` when ``series`` holds no non-null
        values. If the most frequent value is not one of ``ordered_options``,
        that value is returned as-is.
    """
    counts = series.value_counts(dropna=True)
    # A categorical series reports every declared category, including ones
    # never observed, so an all-null column gives a non-empty, all-zero result.
    # Treat that as "no data" rather than declaring the first option the winner.
    if counts.empty or counts.max() == 0:
        return None
    max_count = counts.max()
    # Walk the options in priority order; the first one holding the top count wins.
    for opt in ordered_options:
        if counts.get(opt, 0) == max_count:
            return opt
    # The top value is not a known option: fall back to the raw most-frequent value.
    return counts.idxmax()

# Answer options for each question block, in presentation order.
likert_opts = ["Very important", "Rather important", "Not very important", "Not at all important"]
binary_mentioned_opts = ["Mentioned", "Not mentioned"]
binary_neighbor_opts = ["Would not like as neighbours", "Would like as neighbours"]

# Question stems keyed by cleaned-data column name, assembled block by block.
# Stems sharing a sentence frame are generated from one template.
stems = {}

# Q1–Q6
stems.update(
    (column, f"How important {verb} {topic} in your life?")
    for column, verb, topic in [
        ("Q1_family_importance", "is", "Family"),
        ("Q2_friends_importance", "are", "Friends"),
        ("Q3_leisure_time_importance", "is", "Leisure time"),
        ("Q4_politics_importance", "is", "Politics"),
        ("Q5_work_importance", "is", "Work"),
        ("Q6_religion_importance", "is", "Religion"),
    ]
)

# Q7–Q11 (long wording)
stems.update(
    (column, f"Is '{quality}' among the top five qualities children should learn at home?")
    for column, quality in [
        ("Q7_child_good_manners_mentioned", "Good manners"),
        ("Q8_child_independence_mentioned", "Independence"),
        ("Q9_child_hard_work_mentioned", "Hard work"),
        ("Q10_child_responsibility_mentioned", "Feeling of responsibility"),
        ("Q11_child_imagination_mentioned", "Imagination"),
    ]
)

# Q12–Q17 (short wording)
stems.update(
    (column, f"Is '{quality}' among the top five qualities?")
    for column, quality in [
        ("Q12_child_tolerance_respect_mentioned", "Tolerance and respect for other people"),
        ("Q13_child_thrift_saving_mentioned", "Thrift, saving money and things"),
        ("Q14_child_determination_mentioned", "Determination, perseverance"),
        ("Q15_child_religious_faith_mentioned", "Religious faith"),
        ("Q16_child_unselfishness_mentioned", "Not being selfish (unselfishness)"),
        ("Q17_child_obedience_mentioned", "Obedience"),
    ]
)

# Q18–Q26
stems.update(
    (column, f"Would you not like to have {group} as neighbours?")
    for column, group in [
        ("Q18_neighbor_drug_addicts_not_like", "Drug addicts"),
        ("Q19_neighbor_diff_race_not_like", "People of a different race"),
        ("Q20_neighbor_people_with_AIDS_not_like", "People who have AIDS"),
        ("Q21_neighbor_immigrants_foreign_workers_not_like", "Immigrants/foreign workers"),
        ("Q22_neighbor_homosexuals_not_like", "Homosexuals"),
        ("Q23_neighbor_diff_religion_not_like", "People of a different religion"),
        ("Q24_neighbor_heavy_drinkers_not_like", "Heavy drinkers"),
        ("Q25_neighbor_unmarried_couples_not_like", "Unmarried couples living together"),
        ("Q26_neighbor_diff_language_not_like", "People who speak a different language"),
    ]
)

# Build JSONL entries (one per question) based on survey majorities.
# Each block lists its columns, its option labels, and how stored values map to
# those labels: None means the column already holds the label text (Q1–Q6);
# otherwise a {code: label} dict is applied first (Q7–Q26 store 1/0 codes).
# Codes missing from the dict become NaN under .map() and are therefore
# ignored by majority_label.
question_blocks = [
    (
        [
            "Q1_family_importance",
            "Q2_friends_importance",
            "Q3_leisure_time_importance",
            "Q4_politics_importance",
            "Q5_work_importance",
            "Q6_religion_importance",
        ],
        likert_opts,
        None,
    ),
    (
        [
            "Q7_child_good_manners_mentioned",
            "Q8_child_independence_mentioned",
            "Q9_child_hard_work_mentioned",
            "Q10_child_responsibility_mentioned",
            "Q11_child_imagination_mentioned",
            "Q12_child_tolerance_respect_mentioned",
            "Q13_child_thrift_saving_mentioned",
            "Q14_child_determination_mentioned",
            "Q15_child_religious_faith_mentioned",
            "Q16_child_unselfishness_mentioned",
            "Q17_child_obedience_mentioned",
        ],
        binary_mentioned_opts,
        {1: binary_mentioned_opts[0], 0: binary_mentioned_opts[1]},
    ),
    (
        [
            "Q18_neighbor_drug_addicts_not_like",
            "Q19_neighbor_diff_race_not_like",
            "Q20_neighbor_people_with_AIDS_not_like",
            "Q21_neighbor_immigrants_foreign_workers_not_like",
            "Q22_neighbor_homosexuals_not_like",
            "Q23_neighbor_diff_religion_not_like",
            "Q24_neighbor_heavy_drinkers_not_like",
            "Q25_neighbor_unmarried_couples_not_like",
            "Q26_neighbor_diff_language_not_like",
        ],
        binary_neighbor_opts,
        {1: binary_neighbor_opts[0], 0: binary_neighbor_opts[1]},
    ),
]

records = []
for block_cols, block_opts, code_labels in question_blocks:
    for col in block_cols:
        responses = df[col] if code_labels is None else df[col].map(code_labels)
        records.append({
            "stem": stems[col],
            "options": block_opts,
            # None (written as JSON null) when the column has no usable answers.
            "answer": majority_label(responses, block_opts),
            # Column names start with the question number, e.g. "Q7_...".
            "meta": {"source": "WVS India 2023", "question_id": col.split("_")[0]},
        })

# Persist the records as UTF-8 JSON Lines (one object per line, non-ASCII
# characters kept verbatim), creating the output folder if it does not exist.
OUT_JSONL.parent.mkdir(parents=True, exist_ok=True)
with OUT_JSONL.open("w", encoding="utf-8") as out_file:
    out_file.writelines(json.dumps(rec, ensure_ascii=False) + "\n" for rec in records)

# Cell output: number of records written and the destination path.
len(records), str(OUT_JSONL)


(26, '..\\data\\processed\\values_norms.jsonl')