In [14]:
"""
================================================================================
Meta-Analysis Data Preparation Pipeline
================================================================================

Workflow Overview:
-----------------
STEP 1: Clean & Standardize Raw Data
    ├── 1.1 Data Import and Initial Standardization
    ├── 1.2 Harmonization of Indexing and Sample Inclusion
    ├── 1.3 Normalization of Missing and Irregular Values
    └── 1.4 Extraction and Structuring of Effect-Size Measures

STEP 2: Merge & Validate Final Dataset
    ├── 2.1 Merge moderator and effect-size data
    ├── 2.2 Verify Study_ID and Effect_ID consistency
    └── 2.3 Export final dataset

Input Files:
-----------
- moderator_raw.csv: Study-level moderator variables (N studies × K moderators)
- effect_size_raw_results.csv: Effect-size estimates (Hedges' g, SE, 95% CI)

Output File:
-----------
- Meta_ready_cleaned.csv: Final merged, cleaned, analysis-ready dataset

Dependencies:
------------
- numpy (>=1.20.0)
- pandas (>=1.3.0)

================================================================================
"""

import numpy as np
import pandas as pd

# Suppress pandas warnings for cleaner output
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)


################################################################################
# STEP 1: CLEAN & STANDARDIZE RAW DATA
################################################################################
"""
This step implements comprehensive data cleaning and standardization procedures
to ensure data quality, consistency, and compatibility for meta-analytic synthesis.

Sub-steps:
    1.1 Data Import and Initial Standardization
    1.2 Harmonization of Indexing and Sample Inclusion  
    1.3 Normalization of Missing and Irregular Values
    1.4 Extraction and Structuring of Effect-Size Measures
"""

# ==============================================================================
# 1.1 Data Import and Initial Standardization
# ==============================================================================
print("=" * 80)
print("STEP 1: CLEAN & STANDARDIZE RAW DATA")
print("=" * 80)
print("\n" + "─" * 80)
print("1.1 Data Import and Initial Standardization")
print("─" * 80)


def _strip_cell(value):
    """Return *value* without surrounding whitespace when it is a string.

    Non-string cells (such as the NaN placeholders pandas uses for empty
    fields) are passed through untouched.
    """
    return value.strip() if isinstance(value, str) else value


# Load raw data files with explicit type specification
# - dtype=str: Preserves original formatting and prevents automatic type inference
# - encoding="utf-8-sig": Handles Byte Order Mark (BOM) from Excel-exported CSVs
moderator_raw = pd.read_csv("moderator_raw.csv", dtype=str, encoding="utf-8-sig")
effect_size_raw = pd.read_csv("effect_size_raw_results.csv", dtype=str, encoding="utf-8-sig")

print(f"✓ Moderator variables loaded: {moderator_raw.shape[0]} rows × {moderator_raw.shape[1]} columns")
print(f"✓ Effect-size data loaded:    {effect_size_raw.shape[0]} rows × {effect_size_raw.shape[1]} columns")

# Trim leading/trailing whitespace from all string cells
# Critical for preventing merge failures due to inconsistent formatting
# NOTE: the cleaning is applied column by column through Series.map rather
# than DataFrame.applymap. applymap is deprecated since pandas 2.1 (renamed to
# DataFrame.map), whereas Series.map is available on every pandas version this
# script targets, so the step keeps working across upgrades.
moderator_raw = moderator_raw.apply(lambda column: column.map(_strip_cell))
effect_size_raw = effect_size_raw.apply(lambda column: column.map(_strip_cell))
print("✓ Whitespace normalized across all string fields")


# ==============================================================================
# 1.2 Harmonization of Indexing and Sample Inclusion
# ==============================================================================
print("\n" + "─" * 80)
print("1.2 Harmonization of Indexing and Sample Inclusion")
print("─" * 80)

# The 'index' column is the key shared by both files. Cast it to str and trim
# it in each frame so the keys compare as identical text.
for frame in (moderator_raw, effect_size_raw):
    frame["index"] = frame["index"].astype(str).str.strip()

# Collect the distinct keys of each file; only keys present in BOTH files are
# retained for the analysis sample.
indices_moderator = set(moderator_raw["index"])
indices_effect = set(effect_size_raw["index"])
common_indices = indices_moderator.intersection(indices_effect)

print(f"  Indices in moderator file:    {len(indices_moderator):3d}")
print(f"  Indices in effect-size file:  {len(indices_effect):3d}")
print(f"  Common indices (retained):    {len(common_indices):3d}")

# Keys that appear in one file only are reported (sorted) and then dropped.
only_moderator = sorted(indices_moderator.difference(indices_effect))
only_effect = sorted(indices_effect.difference(indices_moderator))

if only_moderator:
    print(f"\n  ⚠️  Excluded (moderator only): {only_moderator}")
if only_effect:
    print(f"  ⚠️  Excluded (effect-size only): {only_effect}")

# Keep only the rows whose key is shared, as independent copies, so both
# frames describe the same set of effect sizes before the merge.
keep_moderator = moderator_raw["index"].isin(common_indices)
keep_effect = effect_size_raw["index"].isin(common_indices)
moderator_raw = moderator_raw.loc[keep_moderator].copy()
effect_size_raw = effect_size_raw.loc[keep_effect].copy()

print(f"\n✓ Sample harmonization complete:")
print(f"  - Moderator dataset:   {moderator_raw.shape[0]:3d} effect sizes")
print(f"  - Effect-size dataset: {effect_size_raw.shape[0]:3d} effect sizes")


# ==============================================================================
# 1.3 Normalization of Missing and Irregular Values
# ==============================================================================
print("\n" + "─" * 80)
print("1.3 Normalization of Missing and Irregular Values")
print("─" * 80)

# --- 1.3.1 Standardize Missing Value Codes ---
print("  [1.3.1] Standardizing missing value representations...")

# Every spelling below is treated as "no usable value" and becomes NaN.
# Matching is on whole cell values and is case-sensitive, which is why the
# "mixed" and "N/A" variants are listed individually.
missing_codes = [
    "NG",      # Not Given
    "N/R",     # Not Reported
    "n/a",     # Not Applicable (lowercase)
    "N/A",     # Not Applicable (uppercase)
    "NA",      # Not Available
    "NaN",     # NaN written out as text
    "",        # Empty string
    "-",       # Dash placeholder
    "Mixed",   # Mixed/heterogeneous categories
    "mixed",   # Mixed (lowercase)
    "MIXED",   # Mixed (uppercase)
]

# The substitution covers every column of both frames.
moderator_raw = moderator_raw.replace(to_replace=missing_codes, value=np.nan)
effect_size_raw = effect_size_raw.replace(to_replace=missing_codes, value=np.nan)
print(f"  ✓ {len(missing_codes)} missing value codes normalized to NaN")


# --- 1.3.2 Standardize Yes/No Values ---
print("  [1.3.2] Standardizing Yes/No categorical values...")

# Build the variant → canonical lookup ("Yes" / "No") from the accepted
# spellings of each answer.
yes_no_map = {}
for canonical, variants in (
    ("Yes", ("yes", "YES", "y", "Y")),
    ("No", ("no", "NO", "n", "N")),
):
    yes_no_map.update(dict.fromkeys(variants, canonical))

# Like the missing-code step, this is applied to every column of both frames.
moderator_raw = moderator_raw.replace(to_replace=yes_no_map)
effect_size_raw = effect_size_raw.replace(to_replace=yes_no_map)
print("  ✓ Yes/No values standardized (yes/no/y/n → Yes/No)")


# --- 1.3.3 Standardize Age Group Categories ---
print("  [1.3.3] Standardizing Age_Group categories...")

if "Age_Group" in moderator_raw.columns:
    # Lowercase spellings are mapped onto the capitalized labels; the entry
    # with a trailing space covers an untrimmed "adolescent ".
    age_group_map = dict([
        ("adult", "Adult"),
        ("adolescent", "Adolescent"),
        ("adolescent ", "Adolescent"),
    ])
    age_group_clean = moderator_raw["Age_Group"].replace(age_group_map)
    moderator_raw["Age_Group"] = age_group_clean
    print("  ✓ Age_Group: adult → Adult, adolescent → Adolescent")


# --- 1.3.4 Standardize Proficiency Level Categories ---
print("  [1.3.4] Standardizing Proficiency_Level categories...")

if "Proficiency_Level" in moderator_raw.columns:
    # "intermediate" is capitalized. Compound or mixed levels cannot be
    # assigned to a single proficiency category, so they are set to NaN.
    proficiency_map = {
        "intermediate": "Intermediate",
        **dict.fromkeys(("Intermediate_Advanced", "mixed", "Mixed"), np.nan),
    }
    proficiency_clean = moderator_raw["Proficiency_Level"].replace(proficiency_map)
    moderator_raw["Proficiency_Level"] = proficiency_clean
    print("  ✓ Proficiency_Level: intermediate → Intermediate")
    print("  ✓ Mixed/compound levels set to NaN")


# ==============================================================================
# 1.4 Extraction and Structuring of Effect-Size Measures
# ==============================================================================
print("\n" + "─" * 80)
print("1.4 Extraction and Structuring of Effect-Size Measures")
print("─" * 80)

# --- 1.4.1 Transform Gender Ratio to Proportion ---
print("  [1.4.1] Converting gender ratio to proportion of females...")

if "Gender_Ratio_FM" in moderator_raw.columns:
    # Extract female and male counts from "numberF/numberM" format (e.g., "20F/12M")
    # Regex: ^\s*(\d+)\s*[Ff]\s*/\s*(\d+)\s*[Mm]\s*$
    #   - Captures: (female_count) F / (male_count) M
    # Cells that do not match the pattern yield NaN in both capture columns.
    gender_parts = moderator_raw["Gender_Ratio_FM"].str.extract(
        r"^\s*(\d+)\s*[Ff]\s*/\s*(\d+)\s*[Mm]\s*$"
    )

    # Convert the captured digits to numbers and compute females / (females + males).
    # A "0F/0M" entry gives 0/0, which pandas evaluates to NaN rather than raising.
    n_female = pd.to_numeric(gender_parts[0], errors="coerce")
    n_male = pd.to_numeric(gender_parts[1], errors="coerce")
    n_total = n_female + n_male
    proportion_female = (n_female / n_total).round(2)

    # Overwrite the column in place. Assigning to an existing column keeps its
    # position in the frame, so no column re-ordering is needed afterwards
    # (the previous "restore position" branch could never execute).
    moderator_raw["Gender_Ratio_FM"] = proportion_female

    print(f"  ✓ Gender_Ratio_FM → proportion of females (0-1 scale, 2 decimals)")


# --- 1.4.2 Transform Training Duration Variables ---
print("  [1.4.2] Processing training duration variables...")


def _parse_total_minutes(raw):
    """Convert one raw Training_TotalMinute cell to minutes as a float.

    Accepted inputs:
        - a plain number, e.g. "1170" → 1170.0
        - a two-factor product, e.g. "13*90" or "13 * 90" → 1170.0

    Missing cells, empty text, products with more or fewer than two factors,
    and anything that is not numeric return NaN. Results are rounded to two
    decimals.
    """
    if pd.isna(raw):
        return np.nan

    # Trim the ends, then drop internal spaces so "13 * 90" parses like "13*90"
    text = str(raw).strip().replace(" ", "")
    if text in ("", "nan"):
        return np.nan

    try:
        if "*" in text:
            factors = text.split("*")
            if len(factors) != 2:
                return np.nan
            return round(float(factors[0]) * float(factors[1]), 2)
        return round(float(text), 2)
    except ValueError:
        # float() rejects non-numeric text; only that failure means "not a
        # number", so nothing broader is swallowed here.
        return np.nan


# Convert Training_TotalMinute (handles arithmetic expressions)
if "Training_TotalMinute" in moderator_raw.columns:
    # astype(float) guarantees a float column even when the frame has no rows
    moderator_raw["Training_TotalMinute"] = (
        moderator_raw["Training_TotalMinute"].map(_parse_total_minutes).astype(float)
    )
    print("  ✓ Training_TotalMinute: expressions evaluated (e.g., '13*90' → 1170.0)")

# Convert Training_TotalWeeks to numeric (unparseable entries become NaN)
if "Training_TotalWeeks" in moderator_raw.columns:
    moderator_raw["Training_TotalWeeks"] = pd.to_numeric(
        moderator_raw["Training_TotalWeeks"], errors="coerce"
    )
    print("  ✓ Training_TotalWeeks: converted to numeric")


# --- 1.4.3 Create Treatment Duration Categorical Variable ---
print("  [1.4.3] Creating Treatment_Duration categorical variable...")

if "Training_TotalWeeks" in moderator_raw.columns:
    
    def categorize_duration(weeks):
        """
        Label a training length (in weeks) as Short, Medium, or Long.

        Boundaries as applied by the comparisons below:
            - Short:  weeks ≤ 4
            - Medium: 4 < weeks ≤ 8
            - Long:   weeks > 8

        Parameters:
        -----------
        weeks : float or NaN
            Number of training weeks

        Returns:
        --------
        str or NaN
            'Short', 'Medium', or 'Long'; NaN when `weeks` is missing
        """
        # Missing week counts stay missing
        if pd.isna(weeks):
            return np.nan

        # Walk the inclusive upper bounds in ascending order; the first bound
        # that is not exceeded decides the label.
        for upper_bound, label in ((4, "Short"), (8, "Medium")):
            if weeks <= upper_bound:
                return label
        return "Long"
    
    # Derive the duration label of every row from its week count
    duration_labels = moderator_raw["Training_TotalWeeks"].apply(categorize_duration)
    moderator_raw["Treatment_Duration"] = duration_labels

    # Re-order the columns so Treatment_Duration sits directly after
    # Training_TotalWeeks
    weeks_col_position = moderator_raw.columns.get_loc("Training_TotalWeeks")
    cols = list(moderator_raw.columns)
    cols.remove("Treatment_Duration")
    cols[weeks_col_position + 1:weeks_col_position + 1] = ["Treatment_Duration"]
    moderator_raw = moderator_raw[cols]

    # Summarize how many rows fall into each category (NaN counted separately)
    duration_counts = moderator_raw["Treatment_Duration"].value_counts(dropna=False)
    print(f"  ✓ Treatment_Duration created:")
    print(f"    - Short (1-4 weeks):   {duration_counts.get('Short', 0):2d} studies")
    print(f"    - Medium (5-8 weeks):  {duration_counts.get('Medium', 0):2d} studies")
    print(f"    - Long (≥9 weeks):     {duration_counts.get('Long', 0):2d} studies")

    # Only mention missing durations when there are any
    missing_count = moderator_raw["Treatment_Duration"].isna().sum()
    if missing_count > 0:
        print(f"    - Missing data:        {missing_count:2d} studies")


print("\n✓ STEP 1 COMPLETE: Raw data cleaned and standardized")
print("=" * 80)


################################################################################
# STEP 2: MERGE & VALIDATE FINAL DATASET
################################################################################
"""
This step merges the cleaned moderator and effect-size datasets, validates
data integrity, and exports the final analysis-ready dataset.

Sub-steps:
    2.1 Merge moderator and effect-size data
    2.2 Verify Study_ID and Effect_ID consistency  
    2.3 Export final dataset
"""

print("\n" + "=" * 80)
print("STEP 2: MERGE & VALIDATE FINAL DATASET")
print("=" * 80)


# ==============================================================================
# 2.1 Merge Moderator and Effect-Size Data
# ==============================================================================
print("\n" + "─" * 80)
print("2.1 Merge Moderator and Effect-Size Data")
print("─" * 80)

# Columns carried over from the effect-size file
effect_columns = [
    "index",        # Merge key (unique identifier)
    "Study_ID",     # Study identifier
    "Effect_ID",    # Effect-size identifier
    "Hedges_g",     # Effect-size estimate (bias-corrected)
    "SE",           # Standard error
    "Variance",     # Variance of effect size
    "CI_Lower",     # 95% CI lower bound
    "CI_Upper",     # 95% CI upper bound
]

# Take an independent copy of those columns and give the two ID columns an
# "_effect" suffix, so they can be compared with the moderator file's IDs
# once both sources sit side by side.
effect_clean = (
    effect_size_raw.loc[:, effect_columns]
    .copy()
    .rename(columns={
        "Study_ID": "Study_ID_effect",
        "Effect_ID": "Effect_ID_effect",
    })
)

print(f"  Effect-size data prepared: {effect_clean.shape[0]} rows × {effect_clean.shape[1]} columns")

# Inner join on 'index': only keys found in both frames survive.
# validate="1:1" makes pandas raise if a key is duplicated on either side.
merged_df = moderator_raw.merge(
    effect_clean,
    on="index",
    how="inner",
    validate="1:1",
)

print(f"✓ Datasets merged successfully:")
print(f"  - Final dimensions: {merged_df.shape[0]} rows × {merged_df.shape[1]} columns")


# ==============================================================================
# 2.2 Verify Study_ID and Effect_ID Consistency
# ==============================================================================
print("\n" + "─" * 80)
print("2.2 Verify Study_ID and Effect_ID Consistency")
print("─" * 80)

# Study_ID: compare the moderator-side value with the effect-size-side value
# as text, and collect the rows where they differ.
study_ids_differ = merged_df["Study_ID"].astype(str).ne(
    merged_df["Study_ID_effect"].astype(str)
)
study_mismatch = merged_df.loc[study_ids_differ]

if study_mismatch.empty:
    print("  ✓ Study_ID validation: All values match between datasets")
else:
    print(f"  ⚠️  Study_ID mismatches detected: {len(study_mismatch)} cases")
    print(study_mismatch[["index", "Study_ID", "Study_ID_effect"]])

# Effect_ID: same text comparison for the effect-size identifier.
effect_ids_differ = merged_df["Effect_ID"].astype(str).ne(
    merged_df["Effect_ID_effect"].astype(str)
)
effect_mismatch = merged_df.loc[effect_ids_differ]

if effect_mismatch.empty:
    print("  ✓ Effect_ID validation: All values match between datasets")
else:
    print(f"  ⚠️  Effect_ID mismatches detected: {len(effect_mismatch)} cases")
    print(effect_mismatch[["index", "Effect_ID", "Effect_ID_effect"]])

# The "_effect" copies were only needed for the comparison; the moderator
# file's ID columns are the ones kept in the final dataset.
merged_df = merged_df.drop(columns=["Study_ID_effect", "Effect_ID_effect"])
print(f"  ✓ Duplicate ID columns removed")


# ==============================================================================
# 2.3 Export Final Dataset
# ==============================================================================
print("\n" + "─" * 80)
print("2.3 Export Final Dataset")
print("─" * 80)

# Write the merged frame to disk
# - index=False: the pandas row labels are not written
# - encoding="utf-8-sig": a BOM is included for Excel compatibility
output_filename = "Meta_ready_cleaned.csv"
merged_df.to_csv(output_filename, index=False, encoding="utf-8-sig")

print(f"✓ Final dataset exported: {output_filename}")
print(f"  - Total effect sizes: {merged_df.shape[0]}")
print(f"  - Total variables:    {merged_df.shape[1]}")

# Recap of the transformations performed by the pipeline, one line each
print("\nKey transformations applied:")
for summary_line in (
    "  • Gender_Ratio_FM → Proportion of females (0-1 scale, 2 decimals)",
    "  • Training_TotalMinute → Numeric minutes (expressions evaluated)",
    "  • Treatment_Duration → Categorical (Short/Medium/Long)",
    "  • Missing values → Standardized to NaN",
    "  • Yes/No values → Standardized capitalization",
    "  • Only studies with complete data in both files included",
):
    print(summary_line)

# Closing banner
print("\n" + "=" * 80)
print("✅ DATA PREPARATION PIPELINE COMPLETE")
print("=" * 80)
print("Dataset is now ready for meta-analytic modeling.")
print("=" * 80)

STEP 1: CLEAN & STANDARDIZE RAW DATA

────────────────────────────────────────────────────────────────────────────────
1.1 Data Import and Initial Standardization
────────────────────────────────────────────────────────────────────────────────
✓ Moderator variables loaded: 29 rows × 24 columns
✓ Effect-size data loaded:    29 rows × 32 columns
✓ Whitespace normalized across all string fields

────────────────────────────────────────────────────────────────────────────────
1.2 Harmonization of Indexing and Sample Inclusion
────────────────────────────────────────────────────────────────────────────────
  Indices in moderator file:     29
  Indices in effect-size file:   29
  Common indices (retained):     29

✓ Sample harmonization complete:
  - Moderator dataset:    29 effect sizes
  - Effect-size dataset:  29 effect sizes

────────────────────────────────────────────────────────────────────────────────
1.3 Normalization of Missing and Irregular Values
─────────────────────────────────