# 04_clean_wvs_india — Clean WVS (India) Q1–Q26

This notebook cleans the India subset of WVS Wave 7 for questions Q1–Q26 using `data/raw/F00017093-WVS_Wave_7_India_Csv_v6.0.csv` and exports:
- `data/processed/wvs_india_q1_q26_clean.parquet` (wide, cleaned)
- `data/processed/wvs_india_q1_q26_tidy.csv` (long, tidy)
- `data/processed/wvs_india_q1_q26_sample100.csv` (100-record sample)

Steps:
1) Load CSV and select Q1–Q26 plus minimal metadata.
2) Handle missing codes (< 0 → NaN).
3) Recode:
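   - Q1–Q6: Likert importance codes 1–4 → labelled categories (`very_important`, `rather_important`, `not_very_important`, `not_at_all_important`).
   - Q7–Q17: source code 1 → mentioned (1); source code 2 → not mentioned (0).
   - Q18–Q26: source code 1 → “not like as neighbour” (1); source code 2 → “like” (0).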
4) Rename columns to descriptive names.
5) Save cleaned outputs and a 100-record sample.


In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Project root. Set the WVS_PROJECT_ROOT environment variable to run this
# notebook on another machine; when it is unset (or empty) the original
# local absolute path is used, so existing runs behave exactly as before.
PROJECT_ROOT = Path(os.environ.get("WVS_PROJECT_ROOT") or r"F:\New folder\Research\Project 1")

# Input: raw WVS Wave 7 India CSV export
RAW_PATH = PROJECT_ROOT / "data" / "raw" / "F00017093-WVS_Wave_7_India_Csv_v6.0.csv"

# Outputs: all three files are written to the same processed-data folder
PROCESSED_DIR = PROJECT_ROOT / "data" / "processed"
OUT_CLEAN_WIDE = PROCESSED_DIR / "wvs_india_q1_q26_clean.parquet"
OUT_TIDY_LONG = PROCESSED_DIR / "wvs_india_q1_q26_tidy.csv"
OUT_SAMPLE100 = PROCESSED_DIR / "wvs_india_q1_q26_sample100.csv"

# Question columns to clean: "Q1" .. "Q26"
QUESTIONS = [f"Q{i}" for i in range(1, 27)]

# Optional metadata columns; only those actually present in the CSV are kept
META_CANDIDATES = [
    "N_REGION_ISO",  # state/region ISO code if present
    "G_TOWNSIZE2",   # town size bucket
    "H_URBRURAL",    # urban/rural
    "W_WEIGHT",      # design/analysis weight
    "PWGHT",         # post-stratification weight, if present
]



In [2]:
# Read the raw WVS export: semicolon-separated with double-quoted fields
print(f"Loading from: {RAW_PATH}")
print(f"File exists: {RAW_PATH.exists()}")
df = pd.read_csv(RAW_PATH, sep=';', quotechar='"', low_memory=False)

# Column selection: whichever metadata candidates exist, then Q1..Q26
available_meta = [name for name in META_CANDIDATES if name in df.columns]
keep_cols = [*available_meta, *QUESTIONS]

# Stop early if any question column is absent from the file
missing = [name for name in QUESTIONS if name not in df.columns]
if missing:
    raise ValueError(f"Missing expected question columns: {missing}")

df = df.loc[:, keep_cols].copy()

# Question columns: coerce to numeric (unparseable → NaN), then blank out
# negative codes, which this notebook treats as missing
for q in QUESTIONS:
    numeric = pd.to_numeric(df[q], errors='coerce')
    df[q] = numeric
    df.loc[numeric < 0, q] = np.nan

df.head(3)


FileNotFoundError: [Errno 2] No such file or directory: 'data\\raw\\F00017093-WVS_Wave_7_India_Csv_v6.0.csv'

In [None]:
# Define recodes and descriptive names

# Q1–Q6: source codes 1..4 → importance labels. Any value that is not a key
# here (including NaN) comes out of Series.map as missing.
likert_map = {
    1: "very_important",
    2: "rather_important",
    3: "not_very_important",
    4: "not_at_all_important",
}

# Explicit, ordered category set for the importance scale. Plain
# astype("category") would sort the labels alphabetically, treat them as
# unordered, and silently drop any level that happens not to occur in the data.
# The order follows the source codes: very_important < ... < not_at_all_important.
likert_dtype = pd.CategoricalDtype(categories=list(likert_map.values()), ordered=True)

# Q7–Q26: source code 1 → 1, source code 2 → 0; anything else becomes missing
binary_map = {1: 1, 2: 0}

rename_map = {
    # Q1–Q6: importance
    "Q1": "Q1_family_importance",
    "Q2": "Q2_friends_importance",
    "Q3": "Q3_leisure_time_importance",
    "Q4": "Q4_politics_importance",
    "Q5": "Q5_work_importance",
    "Q6": "Q6_religion_importance",
    # Q7–Q17: child qualities (mentioned)
    "Q7":  "Q7_child_good_manners_mentioned",
    "Q8":  "Q8_child_independence_mentioned",
    "Q9":  "Q9_child_hard_work_mentioned",
    "Q10": "Q10_child_responsibility_mentioned",
    "Q11": "Q11_child_imagination_mentioned",
    "Q12": "Q12_child_tolerance_respect_mentioned",
    "Q13": "Q13_child_thrift_saving_mentioned",
    "Q14": "Q14_child_determination_mentioned",
    "Q15": "Q15_child_religious_faith_mentioned",
    "Q16": "Q16_child_unselfishness_mentioned",
    "Q17": "Q17_child_obedience_mentioned",
    # Q18–Q26: neighbours not liked (1) vs liked (0)
    "Q18": "Q18_neighbor_drug_addicts_not_like",
    "Q19": "Q19_neighbor_diff_race_not_like",
    "Q20": "Q20_neighbor_people_with_AIDS_not_like",
    "Q21": "Q21_neighbor_immigrants_foreign_workers_not_like",
    "Q22": "Q22_neighbor_homosexuals_not_like",
    "Q23": "Q23_neighbor_diff_religion_not_like",
    "Q24": "Q24_neighbor_heavy_drinkers_not_like",
    "Q25": "Q25_neighbor_unmarried_couples_not_like",
    "Q26": "Q26_neighbor_diff_language_not_like",
}

# Renamed column groups, one per recode rule
likert_cols = [rename_map[f"Q{i}"] for i in range(1, 7)]
binary_cols = [rename_map[f"Q{i}"] for i in range(7, 27)]

# Apply renaming (returns a new frame; `df` itself is left untouched)
clean = df.rename(columns=rename_map)

# Recode Q1–Q6 to ordered, labeled categories
for q in likert_cols:
    clean[q] = clean[q].map(likert_map).astype(likert_dtype)

# Recode Q7–Q26 to binary 1/0; nullable Int64 keeps missing values as <NA>
# instead of forcing the column to float
for q in binary_cols:
    clean[q] = clean[q].map(binary_map).astype("Int64")

clean.head(3)


In [None]:
# Write the wide cleaned table, creating the output folder if needed
OUT_CLEAN_WIDE.parent.mkdir(parents=True, exist_ok=True)
clean.to_parquet(OUT_CLEAN_WIDE, index=False)

# Long layout: one row per (record, question), with the metadata columns
# repeated alongside each response
id_vars = available_meta
value_vars = [rename_map[name] for name in QUESTIONS]
tidy = pd.melt(
    clean,
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="question",
    value_name="response",
)
tidy.to_csv(OUT_TIDY_LONG, index=False)

# 100-record simple random sample with a fixed seed; a frame with fewer than
# 100 rows is copied whole instead
if len(clean) >= 100:
    sample100 = clean.sample(n=100, random_state=42)
else:
    sample100 = clean.copy()
sample100.to_csv(OUT_SAMPLE100, index=False)

len(clean), len(sample100)
