In [1]:
import pandas as pd
import re
import json
import unicodedata

SKIP_COLUMNS = {
    "Postnatal_Maternal_Complications",
    "Neonatal__Fetal_Complications",
    "Antenatal_Peripartum_Maternal_Complications",
    "Mode_of_delivery2",
    "Any other Antenatal, Postnatal, or Delivery Complications both fetal and maternal"
}

def clean_feature_string(s):
    s = s.strip().lower()
    s = unicodedata.normalize('NFKD', s)
    s = re.sub(r'[\s_\-]+$', '', s)
    s = re.sub(r'^[\s_\-]+', '', s)
    s = re.sub(r'[^a-z0-9\s\(\)\[\]\{\}\.\'\"\/\-]+', '', s)
    s = s.strip()
    return s

def detect_multivalue_columns_strict(df):
    multivalue_cols = {}
    for col in df.columns:
        if col in SKIP_COLUMNS:
            continue
        has_multi = False
        unique_values = set()
        for val in df[col].dropna().astype(str):
            if re.search(r',| and ', val):
                has_multi = True
            for s in re.split(r'\s*,\s*|\s+and\s+', val):
                clean_val = clean_feature_string(s)
                if clean_val and clean_val.lower() != 'nan':
                    unique_values.add(clean_val)
        if has_multi and len(unique_values) > 1:
            multivalue_cols[col] = sorted(unique_values)
    return multivalue_cols

def expand_columns_simple(df, multivalue_cols):
    conversion_report = {}
    for col, values in multivalue_cols.items():
        conversion_report[col] = []
        for v in values:
            new_col = f"{col}__{v}"
            conversion_report[col].append(new_col)
            df[new_col] = df[col].astype(str).apply(
                lambda x: int(any(v == clean_feature_string(s) for s in re.split(r'\s*,\s*|\s+and\s+', x))) if pd.notna(x) else 0
            )
    return df.drop(columns=list(multivalue_cols.keys())), conversion_report

def update_json(json_path, multivalue_cols):
    try:
        with open(json_path, 'r') as f:
            data = json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        data = {}
    expanded_map = {}
    for col, values in multivalue_cols.items():
        expanded_map[col] = [f"{col}__{v}" for v in values]
    data['multi_value_column_expansion'] = expanded_map
    with open(json_path, 'w') as f:
        json.dump(data, f, indent=4)

def multi_value_column_expander_simple(
    input_csv: str = None,
    output_csv: str = None,
    json_path: str = None,
    df: pd.DataFrame = None
) -> pd.DataFrame:
    if df is None:
        if input_csv is None:
            raise ValueError("Must provide either input_csv or df")
        df = pd.read_csv(input_csv)
    else:
        df = df.copy()
    multivalue_cols = detect_multivalue_columns_strict(df)
    if not multivalue_cols:
        print("No multi-value columns detected.")
        return df
    print("Multi-value columns detected and will be expanded (with cleaned, lower-case values):")
    for col, values in multivalue_cols.items():
        print(f"  {col}: {values}")
    df_new, conversion_report = expand_columns_simple(df, multivalue_cols)
    print("\nCONVERSION SUMMARY:")
    for orig_col, new_cols in conversion_report.items():
        print(f"  '{orig_col}' was expanded into: {new_cols}")
    if output_csv:
        df_new.to_csv(output_csv, index=False)
        print(f"Expanded CSV saved to: {output_csv}")
    if json_path:
        update_json(json_path, multivalue_cols)
        print(f"JSON metadata updated at: {json_path}")
    return df_new

# Example usage in a notebook cell:
# expanded_df = multi_value_column_expander_simple(
#     input_csv='yourfile.csv',
#     output_csv='output.csv',
#     json_path='Mode_of_delivery2.json'
# )
expanded_df = multi_value_column_expander_simple(input_csv='Final Data For Model Training September 17.csv', output_csv='cleaned_data.csv', json_path='Mode_of_delivery2.json')
# expanded_df = multi_value_column_expander_simple(df=my_df)

Multi-value columns detected and will be expanded (with cleaned, lower-case values):
  First degree relative with following medical conditions: ['anemia', 'anomalous babies', 'asthma', 'bleeding disorder', 'cardiac arrest', 'cardiac defects', 'cardiac disease', 'congenital anomalies', 'deep vein thrombosis', 'diabetes mellitus (dm)', 'down syndrome', 'eclampsia', 'eclampsia /pre-eclampsia', 'epilepsy', 'hypertension', 'multiple pregnancy', 'neural tube defects', 'no significant problem', 'postpartum hemorrhage(pph)', 'pre-eclampsia', 'renal disease', 'thalasemia', 'thyroid disease', 'tuberculosis', 'twins']
  PreviouslyDiagnosedMedicalConditions: ['anemia', 'asthma', 'cardiac disease', 'deep vein thrombosis after last delivery', 'diabetes mellitus (dm)', 'eclampsia', 'epilepsy', 'hypertension', 'hypertension (htn)', 'multiple pregnancy', 'no previously diagnosed problem', 'obstetric cholestasis', 'postpartum hemorrhage(pph)', 'pre-eclampsia', 'renal disease', 'thyroid disease', 'tuberc

  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_col] = df[col].astype(str).apply(
  df[new_co


CONVERSION SUMMARY:
  'First degree relative with following medical conditions' was expanded into: ['First degree relative with following medical conditions__anemia', 'First degree relative with following medical conditions__anomalous babies', 'First degree relative with following medical conditions__asthma', 'First degree relative with following medical conditions__bleeding disorder', 'First degree relative with following medical conditions__cardiac arrest', 'First degree relative with following medical conditions__cardiac defects', 'First degree relative with following medical conditions__cardiac disease', 'First degree relative with following medical conditions__congenital anomalies', 'First degree relative with following medical conditions__deep vein thrombosis', 'First degree relative with following medical conditions__diabetes mellitus (dm)', 'First degree relative with following medical conditions__down syndrome', 'First degree relative with following medical conditions__eclamp