# Data Cleaning Notebook
This notebook handles cleaning of raw datasets.

In [None]:
import pandas as pd
import numpy as np

## Load Raw Data

In [26]:
file_path = "./cleaned_data_manual.csv"

cleaned_data_manual = pd.read_csv(file_path, encoding="latin1")

cleaned_data_manual.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,...,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information
0,1824,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,...,,,,,,,,Clean water,Pipe water,Construct pipe water.
1,1824,Male,Tenant,10,Secondary School,Business,Water Tank,500m1km,30 - 70,Twice per day,...,,,,,,,,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks
2,2534,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,...,,,,,,,,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...
3,1824,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,...,,,,,,,,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health
4,1824,Female,Child,20,No formal education,Farming,River/Lake,500m1km,25 - 30,Twice per day,...,,,,,,,,High cost of water,Setting up more piped water in every neighborh...,No


## Inspect Data

In [27]:
# Check general info: columns, data types, missing values
cleaned_data_manual.info()

# Check basic statistics for numeric columns
cleaned_data_manual.describe()

# Check for missing values per column
cleaned_data_manual.isnull().sum()

# Check for duplicate rows
cleaned_data_manual.duplicated().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 52 columns):
 #   Column                                     Non-Null Count  Dtype 
---  ------                                     --------------  ----- 
 0   age_group                                  121 non-null    object
 1   gender                                     121 non-null    object
 2     role_in_household                        121 non-null    object
 3     household_size                           121 non-null    int64 
 4     household_head_education                 121 non-null    object
 5    primary_income                            121 non-null    object
 6    main_drinking_water                       121 non-null    object
 7     distance_to_water_source                 121 non-null    object
 8    litres_per_person_per_day                 121 non-null    object
 9     bathing_frequency                        121 non-null    object
 10   weekly_water_cost                    

0

## Basic Cleaning Steps

In [117]:
# df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
partially_cleaned_dataset = cleaned_data_manual.copy()
partially_cleaned_dataset.head()

partially_cleaned_dataset.isnull().sum()

age_group                                      0
gender                                         0
  role_in_household                            0
  household_size                               0
  household_head_education                     0
 primary_income                                0
 main_drinking_water                           0
  distance_to_water_source                     0
 litres_per_person_per_day                     0
  bathing_frequency                            0
 weekly_water_cost                             1
  equitable_access                             0
household_shortage_frequency                   0
  household_shortage_seriousness               0
  shortage_reason                              0
  school_affiliation                           0
 school_role                                  26
school_type                                   26
school_ownership                              26
  school_student_population                   26
school_staff_populat

In [None]:
# Identify numeric-like columns automatically (will be False for 'once a day', etc.)
numeric_guess = partially_cleaned_dataset.apply(
    lambda col: pd.to_numeric(col, errors="coerce").notna().sum() > 0
)

numeric_cols = numeric_guess[numeric_guess].index.tolist()
categorical_cols = [
    col for col in partially_cleaned_dataset.columns if col not in numeric_cols
]

print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

Numeric columns: ['  household_size', ' weekly_water_cost', 'school_monthly_water_cost  ', ' health_monthly_water_cost ']
Categorical columns: ['age_group', 'gender ', '  role_in_household', '  household_head_education ', ' primary_income ', ' main_drinking_water', '  distance_to_water_source', ' litres_per_person_per_day', '  bathing_frequency', '  equitable_access  ', 'household_shortage_frequency', '  household_shortage_seriousness', '  shortage_reason', '  school_affiliation', ' school_role ', 'school_type ', 'school_ownership  ', '  school_student_population', 'school_staff_population ', ' school_has_potable_water', '  school_water_source', ' school_hours_water_available', 'school_equitable_access ', 'school_water_challenges', '  school_drinking_frequency', '  school_has_handwashing_facilities  ', ' school_handwashing_frequency', '  school_sleep_without_bathing', 'school_sleep_without_bathing_frequency', ' health_affiliation', ' health_facility_typehealth_facility_type', '  health

## Clean numeric columns

In [121]:
# Clean column names by stripping leading/trailing whitespace
partially_cleaned_dataset.columns = partially_cleaned_dataset.columns.str.strip()

# Manually specify numeric columns based on data understanding
numeric_cols = [
    "household_size",
    "weekly_water_cost",
    "school_monthly_water_cost",
    "health_monthly_water_cost",
]

# Categorical columns (everything else)
categorical_cols = [
    col for col in partially_cleaned_dataset.columns if col not in numeric_cols
]

# Clean numeric columns: remove non-numeric characters and convert to numeric type
for col in numeric_cols:
    partially_cleaned_dataset[col] = (
        partially_cleaned_dataset[col]
        .astype(str)
        .str.replace(r"[^0-9.]", "", regex=True)
    )
    partially_cleaned_dataset[col] = pd.to_numeric(
        partially_cleaned_dataset[col], errors="coerce"
    )
    partially_cleaned_dataset[col].fillna(
        partially_cleaned_dataset[col].median(), inplace=True
    )

## Handle categorical columns

In [122]:
# Columns related to school/health (missing means not affiliated)
# Columns with missing values
categorical_missing_cols = [
    "school_role",
    "school_type",
    "school_ownership",
    "school_student_population",
    "school_staff_population",
    "school_has_potable_water",
    "school_water_source",
    "school_hours_water_available",
    "school_monthly_water_cost",
    "health_sanitation_difficulty",
    "additional_information",
]

# Fill missing appropriately
for col in categorical_missing_cols:
    if col in partially_cleaned_dataset.columns:
        if "school" in col or "health" in col:
            partially_cleaned_dataset[col] = partially_cleaned_dataset[col].fillna(
                "Not Applicable"
            )
        else:  # general text
            partially_cleaned_dataset[col] = partially_cleaned_dataset[col].fillna(
                "Not Provided"
            )

# Check for any remaining missing values
print(partially_cleaned_dataset.isna().sum())


partially_cleaned_dataset.replace({r"\x96": "-"}, regex=True, inplace=True)

partially_cleaned_dataset.head()

age_group                                     0
gender                                        0
role_in_household                             0
household_size                                0
household_head_education                      0
primary_income                                0
main_drinking_water                           0
distance_to_water_source                      0
litres_per_person_per_day                     0
bathing_frequency                             0
weekly_water_cost                             0
equitable_access                              0
household_shortage_frequency                  0
household_shortage_seriousness                0
shortage_reason                               0
school_affiliation                            0
school_role                                   0
school_type                                   0
school_ownership                              0
school_student_population                     0
school_staff_population                 

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,not sure,Most frequently,Very serious,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,not always reliable,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,No,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,No,Occasionally,Moderately serious,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,not always reliable,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,Yes,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,No,Occasionally,Very serious,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,Yes,Frequently,Very serious,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,not always reliable,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,No,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,not sure,Frequently,Very serious,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,not always reliable,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,Maybe,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No


In [123]:
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
partially_cleaned_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121 entries, 0 to 120
Data columns (total 52 columns):
 #   Column                                    Non-Null Count  Dtype  
---  ------                                    --------------  -----  
 0   age_group                                 121 non-null    object 
 1   gender                                    121 non-null    object 
 2   role_in_household                         121 non-null    object 
 3   household_size                            121 non-null    int64  
 4   household_head_education                  121 non-null    object 
 5   primary_income                            121 non-null    object 
 6   main_drinking_water                       121 non-null    object 
 7   distance_to_water_source                  121 non-null    object 
 8   litres_per_person_per_day                 121 non-null    object 
 9   bathing_frequency                         121 non-null    object 
 10  weekly_water_cost                     

In [125]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,not sure,Most frequently,Very serious,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,not always reliable,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,No,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,No,Occasionally,Moderately serious,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,not always reliable,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,Yes,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,No,Occasionally,Very serious,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,Yes,Frequently,Very serious,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,not always reliable,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,No,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,not sure,Frequently,Very serious,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,not always reliable,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,Maybe,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No


In [131]:
import re


def convert_litres(value):
    """Convert textual water consumption entries into numeric litres per person per day."""

    if pd.isna(value):
        return np.nan

    s = str(value).strip().lower()

    # Normalize dashes
    s = re.sub(r"[\u2012\u2013\u2014\u2212~]", "-", s)
    s = s.replace("–", "-")  # extra normalization
    s = s.replace(",", "")  # remove commas in numbers

    # Case 1: ranges like "25 - 30"
    range_match = re.search(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)", s)
    if range_match:
        return (float(range_match.group(1)) + float(range_match.group(2))) / 2.0

    # Case 2: "more than 120" or "greater than 120"
    more_than_match = re.search(r"(?:more than|greater than)\s*(\d+(?:\.\d+)?)", s)
    if more_than_match:
        return float(more_than_match.group(1)) + 10.0

    # Case 3: "less than 20" -> small buffer below
    less_than_match = re.search(r"(?:less than|below)\s*(\d+(?:\.\d+)?)", s)
    if less_than_match:
        return max(0, float(less_than_match.group(1)) - 5.0)

    # Case 4: single numeric value
    single_num_match = re.search(r"(\d+(?:\.\d+)?)", s)
    if single_num_match:
        return float(single_num_match.group(1))

    # Fallback: cannot parse
    return np.nan


partially_cleaned_dataset["litres_per_person_per_day_num"] = partially_cleaned_dataset[
    "litres_per_person_per_day"
].apply(convert_litres)

# Quick check
print(partially_cleaned_dataset["litres_per_person_per_day_num"].describe())
print(partially_cleaned_dataset["litres_per_person_per_day_num"].isna().sum())

count    121.000000
mean      63.359504
std       37.823341
min       27.500000
25%       27.500000
50%       50.000000
75%       85.500000
max      130.000000
Name: litres_per_person_per_day_num, dtype: float64
0


In [140]:
# Clean text format
partially_cleaned_dataset["household_shortage_frequency"] = (
    partially_cleaned_dataset["household_shortage_frequency"]
    .astype(str)
    .str.strip()
    .str.lower()
)

# Map to ordered numeric scale
frequency_map = {
    "never": 0,
    "rarely": 1,
    "occasionally": 2,
    "frequently": 3,
    "most frequently": 4,
}

partially_cleaned_dataset["household_shortage_frequency"] = partially_cleaned_dataset[
    "household_shortage_frequency"
].map(frequency_map)

partially_cleaned_dataset["household_shortage_frequency"].head()

partially_cleaned_dataset["household_shortage_frequency"].unique()

array([4, 2, 3, 1, 0], dtype=int64)

In [141]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,not sure,4,Very serious,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,not always reliable,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,No,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,No,2,Moderately serious,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,not always reliable,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,Yes,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,No,2,Very serious,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,Yes,3,Very serious,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,not always reliable,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,No,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,not sure,3,Very serious,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,not always reliable,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,Maybe,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3


In [142]:
# Clean text
partially_cleaned_dataset["household_shortage_seriousness"] = (
    partially_cleaned_dataset["household_shortage_seriousness"]
    .astype(str)
    .str.strip()
    .str.lower()
)

# Map to numeric
seriousness_map = {
    "slightly serious": 0,
    "moderately serious": 1,
    "serious": 2,
    "very serious": 3,
}

partially_cleaned_dataset["household_shortage_seriousness"] = partially_cleaned_dataset[
    "household_shortage_seriousness"
].map(seriousness_map)

# Check
partially_cleaned_dataset["household_shortage_seriousness"].head()

0    3.0
1    1.0
2    3.0
3    3.0
4    3.0
Name: household_shortage_seriousness, dtype: float64

In [143]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,not sure,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,not always reliable,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,No,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,No,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,not always reliable,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,Yes,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,No,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,Yes,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,not always reliable,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,No,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,not sure,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,not always reliable,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,Maybe,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3


In [144]:
from collections import Counter

partially_cleaned_dataset["shortage_reason_list"] = partially_cleaned_dataset[
    "shortage_reason"
].str.split(", ")


# Flatten the lists into a single list
all_reasons = [
    reason.strip()
    for sublist in partially_cleaned_dataset["shortage_reason_list"].dropna()
    for reason in sublist
]
# Count each reason
reason_counts = Counter(all_reasons)
print(reason_counts)

Counter({'High cost of water': 80, 'Limited daily supply': 41, 'Long distance to water source': 41, 'Poor water quality': 28, 'Seasonal scarcity': 13, 'Water is life regardless of the cost': 1, 'Road problems during rainy season.': 1})


In [158]:
# Example mapping for categorical columns
equitable_mapping = {"yes": 1, "no": 0, "not sure": None, "unsure": None}

school_has_handwashing_mapping = {"yes": 1, "no": 0, "maybe": None}

school_has_potable_mapping = {
    "always available": 2,
    "not always reliable": 1,
    "not available": 0,
}

# Apply mapping directly to the original columns without creating new numeric columns
partially_cleaned_dataset["equitable_access"] = (
    partially_cleaned_dataset["equitable_access"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(equitable_mapping)
)

partially_cleaned_dataset["school_has_handwashing_facilities"] = (
    partially_cleaned_dataset["school_has_handwashing_facilities"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(school_has_handwashing_mapping)
)

partially_cleaned_dataset["school_has_potable_water"] = (
    partially_cleaned_dataset["school_has_potable_water"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(school_has_potable_mapping)
)

# Check the dataset
partially_cleaned_dataset[
    [
        "equitable_access",
        "school_has_handwashing_facilities",
        "school_has_potable_water",
    ]
].head()

Unnamed: 0,equitable_access,school_has_handwashing_facilities,school_has_potable_water
0,,0,1
1,0.0,1,1
2,0.0,,not applicable
3,1.0,0,1
4,,none,1


In [148]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water]
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water]
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon..."
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water]
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water]


In [149]:
def convert_distance_fixed(value):
    try:
        s = str(value).strip().lower()
    except (TypeError, ValueError):
        return np.nan

    # Normalize dashes
    s = re.sub(r"[\u2012\u2013\u2014\u2212]", "-", s)

    # Replace "km" with meters
    s = re.sub(r"(\d+(?:\.\d+)?)\s*km", lambda m: str(float(m.group(1)) * 1000), s)

    # Case 1: ranges like "500-1000"
    match = re.search(r"(\d+(?:\.\d+)?)\s*-\s*(\d+(?:\.\d+)?)", s)
    if match:
        low = float(match.group(1))
        high = float(match.group(2))
        return (low + high) / 2.0

    # Case 2: "more than 1000" etc.
    m = re.search(r"(?:more than|greater than)\s*(\d+(?:\.\d+)?)", s)
    if m:
        num = float(m.group(1))
        return num + 20.0

    # Case 3: single numeric
    m2 = re.search(r"(\d+(?:\.\d+)?)", s)
    if m2:
        return float(m2.group(1))

    return np.nan


# Apply the fixed function
partially_cleaned_dataset["distance_to_water_source_numeric"] = (
    partially_cleaned_dataset["distance_to_water_source"].apply(convert_distance_fixed)
)
partially_cleaned_dataset["distance_to_water_source_numeric"].head()

0       NaN
1     500.0
2    1020.0
3    1020.0
4     500.0
Name: distance_to_water_source_numeric, dtype: float64

In [150]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0


In [154]:
# Conversion rate
ssp_to_usd_rate = 1 / 4500  # 1 SSP = 1/4500 USD


# Function to extract numeric SSP values
def ssp_to_numeric(value):
    if pd.isna(value):
        return None
    s = str(value).replace(",", "").strip()
    match = re.search(r"(\d+(\.\d+)?)", s)
    if match:
        return float(match.group(1))
    return None


# Columns to convert
money_cols = [
    "weekly_water_cost",
    "school_monthly_water_cost",
    "health_monthly_water_cost",
]

for col in money_cols:
    # Extract pure numeric SSP value
    partially_cleaned_dataset[col + "_ssp"] = partially_cleaned_dataset[col].apply(
        ssp_to_numeric
    )

    # Convert SSP → USD
    partially_cleaned_dataset[col + "_usd"] = (
        partially_cleaned_dataset[col + "_ssp"] * ssp_to_usd_rate
    )

# Show results
print(
    partially_cleaned_dataset[
        [
            "weekly_water_cost_usd",
            "school_monthly_water_cost_usd",
            "health_monthly_water_cost_usd",
        ]
    ].head()
)

   weekly_water_cost_usd  school_monthly_water_cost_usd  \
0               0.000000                       0.000000   
1              10.000000                       0.111111   
2               3.111111                     111.111111   
3             166.666667                     111.111111   
4               7.777778                     222.222222   

   health_monthly_water_cost_usd  
0                      55.555556  
1                      55.555556  
2                      55.555556  
3                      55.555556  
4                      55.555556  


In [155]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556


In [159]:
# Function to convert ranges to midpoint
def convert_hours(value):
    try:
        s = str(value).strip().lower()
    except (TypeError, ValueError):
        return np.nan

    # Handle 'Less than 2 hours' or similar
    if "less than" in s:
        match = re.search(r"less than (\d+)", s)
        if match:
            return float(match.group(1)) / 2  # approximate midpoint
    # Handle 'Not applicable' or 'Not available'
    if "not" in s:
        return np.nan

    # Handle ranges like '2-6 hours'
    match = re.search(r"(\d+)\s*[-–]\s*(\d+)", s)
    if match:
        low = float(match.group(1))
        high = float(match.group(2))
        return (low + high) / 2.0

    # Handle single numeric values
    match = re.search(r"(\d+)", s)
    if match:
        return float(match.group(1))

    return np.nan


# Apply the conversion
partially_cleaned_dataset["school_hours_water_available_numeric"] = (
    partially_cleaned_dataset["school_hours_water_available"].apply(convert_hours)
)

# Check the first rows
partially_cleaned_dataset[
    ["school_hours_water_available", "school_hours_water_available_numeric"]
].head(10)

Unnamed: 0,school_hours_water_available,school_hours_water_available_numeric
0,2-6 hours,4.0
1,2-6 hours,4.0
2,Not Applicable,
3,Less than 2 hours,1.0
4,2-6 hours,4.0
5,2-6 hours,4.0
6,6-12 hours,9.0
7,2-6 hours,4.0
8,Less than 2 hours,1.0
9,2-6 hours,4.0


In [160]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0


In [None]:
# Mapping function for school_handwashing_frequency
def convert_handwashing_frequency(value):
    s = str(value).strip().lower()

    if s in ["never"]:
        return 0.0
    elif s in ["once", "1"]:
        return 1.0
    elif s in ["2-3 times", "2–3 times", "2-3", "2–3"]:
        return 2.5  # midpoint of 2 and 3
    elif s in ["more than 3 times", "more than 3"]:
        return 4.0
    else:
        # If already numeric
        try:
            return float(s)
        except Exception:
            return np.nan



# Create numeric column while keeping original
partially_cleaned_dataset["school_handwashing_frequency_numeric"] = (
    partially_cleaned_dataset["school_handwashing_frequency"].apply(
        convert_handwashing_frequency
    )
)

# Preview
partially_cleaned_dataset[
    ["school_handwashing_frequency", "school_handwashing_frequency_numeric"]
].head(10)

Unnamed: 0,school_handwashing_frequency,school_handwashing_frequency_numeric
0,Rarely,
1,Once,1.0
2,,
3,never,0.0
4,2-3 times,2.5
5,Once,1.0
6,Once,1.0
7,2-3 times,2.5
8,2-3 times,2.5
9,2-3 times,2.5


In [162]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5


In [163]:
# Mapping function for yes/no
def convert_yes_no(value):
    s = str(value).strip().lower()
    if s == "yes":
        return 1
    elif s == "no":
        return 0
    else:
        return np.nan


# Create numeric column without changing original
partially_cleaned_dataset["school_sleep_without_bathing_numeric"] = (
    partially_cleaned_dataset["school_sleep_without_bathing"].apply(convert_yes_no)
)

# Preview
partially_cleaned_dataset[
    ["school_sleep_without_bathing", "school_sleep_without_bathing_numeric"]
].head(10)

Unnamed: 0,school_sleep_without_bathing,school_sleep_without_bathing_numeric
0,Yes,1.0
1,Yes,1.0
2,,
3,Yes,1.0
4,Yes,1.0
5,Yes,1.0
6,Yes,1.0
7,Yes,1.0
8,Yes,1.0
9,Yes,1.0


In [164]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0


In [165]:
# Mapping for frequency
frequency_mapping = {
    "daily": 7,
    "a few times per week": 4,
    "a few times per month": 2,
    "rarely (once in a while)": 1,
}

# Create numeric column without changing original
partially_cleaned_dataset["school_sleep_without_bathing_frequency_numeric"] = (
    partially_cleaned_dataset["school_sleep_without_bathing_frequency"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(frequency_mapping)
)

# Preview
partially_cleaned_dataset[
    [
        "school_sleep_without_bathing_frequency",
        "school_sleep_without_bathing_frequency_numeric",
    ]
].head(10)

Unnamed: 0,school_sleep_without_bathing_frequency,school_sleep_without_bathing_frequency_numeric
0,A few times per week,4.0
1,A few times per month,2.0
2,,
3,Daily,7.0
4,A few times per week,4.0
5,A few times per week,4.0
6,A few times per month,2.0
7,A few times per week,4.0
8,A few times per month,2.0
9,Daily,7.0


In [167]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0


In [168]:
# Mapping for water reliability
reliability_mapping = {"unreliable": 0, "moderately reliable": 1, "very reliable": 2}

# Create numeric column without changing original
partially_cleaned_dataset["health_water_reliability_numeric"] = (
    partially_cleaned_dataset["health_water_reliability"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(reliability_mapping)
)

# Preview
partially_cleaned_dataset[
    ["health_water_reliability", "health_water_reliability_numeric"]
].head(10)

Unnamed: 0,health_water_reliability,health_water_reliability_numeric
0,,
1,,
2,,
3,,
4,,
5,Moderately reliable,1.0
6,,
7,,
8,,
9,Very reliable,2.0


In [169]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,


In [170]:
# Mapping for frequency
disruption_mapping = {
    "never": 0,
    "rarely": 1,
    "occasionally": 2,
    "frequently": 3,
    "daily": 3,
    "not applicable": None,
}

# Convert health_service_disruptions to numeric
partially_cleaned_dataset["health_service_disruptions_numeric"] = (
    partially_cleaned_dataset["health_service_disruptions"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(disruption_mapping)
)

# Preview
partially_cleaned_dataset[
    ["health_service_disruptions", "health_service_disruptions_numeric"]
].head(10)

Unnamed: 0,health_service_disruptions,health_service_disruptions_numeric
0,,
1,,
2,,
3,,
4,,
5,Rarely,1.0
6,,
7,,
8,,
9,frequently,3.0


In [171]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,


In [172]:
# Mapping for health shortage frequency
shortage_freq_mapping = {
    "daily": 7,
    "a few times per week": 3,
    "a few times per month": 1,
    "rarely": 0.5,
    "never": 0,
}

# Create numeric column
partially_cleaned_dataset["health_shortage_frequency_numeric"] = (
    partially_cleaned_dataset["health_shortage_frequency"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(shortage_freq_mapping)
)

# Preview
partially_cleaned_dataset[
    ["health_shortage_frequency", "health_shortage_frequency_numeric"]
].head(10)

Unnamed: 0,health_shortage_frequency,health_shortage_frequency_numeric
0,,
1,,
2,,
3,,
4,,
5,A few times per month,1.0
6,,
7,,
8,,
9,Daily,7.0


In [173]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric,health_shortage_frequency_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,,
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,,
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,,


In [174]:
from collections import Counter

# Split multiple responses and flatten the list
all_services = (
    partially_cleaned_dataset["health_services_affected"]
    .dropna()
    .apply(lambda x: [s.strip() for s in x.split(",")])
)
flat_list = [service for sublist in all_services for service in sublist]

# Count frequency of each service
service_counts = Counter(flat_list)

# Display
print(service_counts)

Counter({'Patient hygiene (washing': 16, 'bathing)': 16, 'Cleaning & sanitation': 9, 'Maternity services': 7, 'Laboratory services': 5, 'Surgery & operations': 4})


In [175]:
# Convert health_toilet_closures to numeric
partially_cleaned_dataset["health_toilet_closures_numeric"] = (
    partially_cleaned_dataset["health_toilet_closures"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace({"no": 0, "occasionally": 1, "frequently": 2})
)

# Preview
partially_cleaned_dataset[
    ["health_toilet_closures", "health_toilet_closures_numeric"]
].head(10)

Unnamed: 0,health_toilet_closures,health_toilet_closures_numeric
0,,
1,,
2,,
3,,
4,,
5,occasionally,1.0
6,,
7,,
8,,
9,frequently,2.0


In [176]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric,health_shortage_frequency_numeric,health_toilet_closures_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,,,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,,,
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,,,
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,,,


In [177]:
from collections import Counter

# Split multiple responses, remove extra spaces, and flatten
all_services = (
    partially_cleaned_dataset["health_services_affected"]
    .dropna()
    .apply(lambda x: [s.strip() for s in x.split(",")])
)
flat_list = [service for sublist in all_services for service in sublist]

# Count frequency of each unique service
service_counts = Counter(flat_list)

# Display the counts
for service, count in service_counts.items():
    print(f"{service}: {count}")

Patient hygiene (washing: 16
bathing): 16
Cleaning & sanitation: 9
Maternity services: 7
Surgery & operations: 4
Laboratory services: 5


In [178]:
# Mapping for health_handwashing_impact
impact_mapping = {"rarely affected": 0, "sometimes affected": 1, "always affected": 2}

# Convert the column to lowercase and strip spaces
partially_cleaned_dataset["health_handwashing_impact_numeric"] = (
    partially_cleaned_dataset["health_handwashing_impact"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(impact_mapping)
)

# Quick check
partially_cleaned_dataset[
    ["health_handwashing_impact", "health_handwashing_impact_numeric"]
].head(10)

Unnamed: 0,health_handwashing_impact,health_handwashing_impact_numeric
0,,
1,,
2,,
3,,
4,,
5,Sometimes affected,1.0
6,,
7,,
8,,
9,Always affected,2.0


In [179]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric,health_shortage_frequency_numeric,health_toilet_closures_numeric,health_handwashing_impact_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,,,,
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,,,,
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,,,,
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,,,,
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,,,,


In [180]:
# Mapping dictionaries
hygiene_mapping = {"slightly": 1, "significantly": 2}

sanitation_mapping = {
    "strongly disagree": 0,
    "neutral": 1,
    "agree": 2,
    "strongly agree": 3,
}

# Convert to numeric columns
partially_cleaned_dataset["health_hygiene_problems_numeric"] = (
    partially_cleaned_dataset["health_hygiene_problems"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(hygiene_mapping)
)

partially_cleaned_dataset["health_sanitation_difficulty_numeric"] = (
    partially_cleaned_dataset["health_sanitation_difficulty"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(sanitation_mapping)
)

# Preview
partially_cleaned_dataset[
    [
        "health_hygiene_problems",
        "health_hygiene_problems_numeric",
        "health_sanitation_difficulty",
        "health_sanitation_difficulty_numeric",
    ]
].head(10)

Unnamed: 0,health_hygiene_problems,health_hygiene_problems_numeric,health_sanitation_difficulty,health_sanitation_difficulty_numeric
0,,,Not Applicable,not applicable
1,,,Not Applicable,not applicable
2,,,Not Applicable,not applicable
3,,,Not Applicable,not applicable
4,,,Not Applicable,not applicable
5,slightly,1.0,Neutral,1
6,,,Not Applicable,not applicable
7,,,Not Applicable,not applicable
8,,,Not Applicable,not applicable
9,slightly,1.0,Strongly disagree,0


In [181]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric,health_shortage_frequency_numeric,health_toilet_closures_numeric,health_handwashing_impact_numeric,health_hygiene_problems_numeric,health_sanitation_difficulty_numeric
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,,,,,,not applicable
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,,,,,,not applicable
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,,,,,,not applicable
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,,,,,,not applicable
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,,,,,,not applicable


In [184]:
# Define keyword themes for each column
challenge_keywords = {
    "high_cost": ["high cost", "expensive", "price"],
    "shortage": ["shortage", "limited", "insufficient"],
    "quality": ["dirty water", "poor quality", "unhealthy"],
    "access": ["distance", "far", "hard to reach"],
    "infrastructure": ["roads", "pipeline", "facility", "lack"],
}

suggestion_keywords = {
    "pipes": ["pipe", "pipelines", "tap"],
    "boreholes": ["borehole"],
    "treatment": ["treat", "chlorine", "purification", "filter"],
    "more_taps": ["more taps", "additional taps"],
    "education": ["education", "awareness", "training"],
}


# Function to tag themes
def tag_keywords(text, keywords):
    if pd.isna(text):
        return {k: 0 for k in keywords}
    text_lower = str(text).lower()
    return {
        k: int(any(kw in text_lower for kw in kw_list))
        for k, kw_list in keywords.items()
    }


# Apply tagging for each column
challenge_tags = partially_cleaned_dataset["community_water_challenges"].apply(
    lambda x: pd.Series(tag_keywords(x, challenge_keywords))
)

suggestion_tags = partially_cleaned_dataset["community_water_suggestions"].apply(
    lambda x: pd.Series(tag_keywords(x, suggestion_keywords))
)

additional_tags = partially_cleaned_dataset["additional_information"].apply(
    lambda x: pd.Series(tag_keywords(x, {**challenge_keywords, **suggestion_keywords}))
)

# Combine tags with the original dataset
partially_cleaned_dataset = pd.concat(
    [
        partially_cleaned_dataset,
        challenge_tags,
        suggestion_tags,
        additional_tags.add_prefix("additional_"),
    ],
    axis=1,
)

# Quick checks
print(partially_cleaned_dataset.head())

# Summarize counts across dataset
print(
    "Challenge counts:\n",
    partially_cleaned_dataset[
        ["high_cost", "shortage", "quality", "access", "infrastructure"]
    ].sum(),
)
print(
    "Suggestion counts:\n",
    partially_cleaned_dataset[
        ["pipes", "boreholes", "treatment", "more_taps", "education"]
    ].sum(),
)
print(
    "Additional information counts:\n",
    partially_cleaned_dataset.filter(like="additional_").sum(),
)

  age_group  gender  role_in_household  household_size  \
0     18-24    Male              Child               4   
1     18-24    Male             Tenant              10   
2     25-34    Male  Head of household               2   
3     18-24    Male              Child              19   
4     18-24  Female              Child              20   

  household_head_education primary_income main_drinking_water  \
0      No formal education        Farming           Rainwater   
1         Secondary School       Business          Water Tank   
2          Tertiary school       Business          Water Tank   
3         Secondary School   Salaried job          Water Tank   
4      No formal education        Farming          River/Lake   

  distance_to_water_source litres_per_person_per_day  \
0               Rain water                   30 - 70   
1                 500m-1km                   30 - 70   
2            More than 1km                  101 -120   
3            More than 1km          

In [185]:
partially_cleaned_dataset.head()

Unnamed: 0,age_group,gender,role_in_household,household_size,household_head_education,primary_income,main_drinking_water,distance_to_water_source,litres_per_person_per_day,bathing_frequency,weekly_water_cost,equitable_access,household_shortage_frequency,household_shortage_seriousness,shortage_reason,school_affiliation,school_role,school_type,school_ownership,school_student_population,school_staff_population,school_has_potable_water,school_water_source,school_monthly_water_cost,school_hours_water_available,school_equitable_access,school_water_challenges,school_drinking_frequency,school_has_handwashing_facilities,school_handwashing_frequency,school_sleep_without_bathing,school_sleep_without_bathing_frequency,health_affiliation,health_facility_typehealth_facility_type,health_facility_ownership,health_daily_patients,health_staff_count,health_water_source,health_water_reliability,health_monthly_water_cost,health_shortage_frequency,health_shortage_coping,health_service_disruptions,health_services_affected,health_toilet_closures,health_sanitation_management,health_handwashing_impact,health_hygiene_problems,health_sanitation_difficulty,community_water_challenges,community_water_suggestions,additional_information,litres_per_person_per_day_num,household_shortage_frequency_num,shortage_reason_list,distance_to_water_source_numeric,weekly_water_cost_ssp,weekly_water_cost_usd,school_monthly_water_cost_ssp,school_monthly_water_cost_usd,health_monthly_water_cost_ssp,health_monthly_water_cost_usd,school_has_potable_water_numeric,school_has_potable_water_clean,school_hours_water_available_numeric,school_handwashing_frequency_numeric,school_sleep_without_bathing_numeric,school_sleep_without_bathing_frequency_numeric,health_water_reliability_numeric,health_service_disruptions_numeric,health_shortage_frequency_numeric,health_toilet_closures_numeric,health_handwashing_impact_numeric,health_hygiene_problems_numeric,health_sanitation_difficulty_numeric,high_cost,shortage,quality,access,infrastructure,pipes,boreholes,treatment,more_taps,education,additional_high_cost,additional_shortage,additional_quality,additional_access,additional_infrastructure,additional_pipes,additional_boreholes,additional_treatment,additional_more_taps,additional_education,high_cost.1,shortage.1,quality.1,access.1,infrastructure.1,pipes.1,boreholes.1,treatment.1,more_taps.1,education.1,additional_high_cost.1,additional_shortage.1,additional_quality.1,additional_access.1,additional_infrastructure.1,additional_pipes.1,additional_boreholes.1,additional_treatment.1,additional_more_taps.1,additional_education.1
0,18-24,Male,Child,4,No formal education,Farming,Rainwater,Rain water,30 - 70,Once per day,0.0,,4,3.0,High cost of water,Yes,Student,Secondary,Community-based,Less than 500,Less than 50,1,Borehole / Well,0.0,2-6 hours,Yes,Poor water quality,2-3 times,0,Rarely,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Clean water,Pipe water,Construct pipe water.,50.0,4,[High cost of water],,0.0,0.0,0.0,0.0,250000.0,55.555556,,1,4.0,,1.0,4.0,,,,,,,not applicable,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,18-24,Male,Tenant,10,Secondary School,Business,Water Tank,500m-1km,30 - 70,Twice per day,45000.0,0.0,2,1.0,High cost of water,Yes,Student,University/College,Public,Less than 500,Less than 50,1,Water tank,500.0,2-6 hours,Maybe,Water is expensive,2-3 times,1,Once,Yes,A few times per month,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of purchasing water,Pipe,Poor roads affect the movement of water tanks,50.0,2,[High cost of water],500.0,45000.0,10.0,500.0,0.111111,250000.0,55.555556,,1,4.0,1.0,1.0,2.0,,,,,,,not applicable,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,25-34,Male,Head of household,2,Tertiary school,Business,Water Tank,More than 1km,101 -120,Three or more times per day,14000.0,0.0,2,3.0,"High cost of water, Limited daily supply, Long...",No,Not Applicable,Not Applicable,Not Applicable,Not Applicable,Not Applicable,not applicable,Not Applicable,500000.0,Not Applicable,,,,,,,,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Provide of quality and equitable water for all.,There is a need for installation of water pipe...,110.5,2,"[High cost of water, Limited daily supply, Lon...",1020.0,14000.0,3.111111,500000.0,111.111111,250000.0,55.555556,,not applicable,,,,,,,,,,,not applicable,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0
3,18-24,Male,Child,19,Secondary School,Salaried job,Water Tank,More than 1km,25 - 30,Three or more times per day,750000.0,1.0,3,3.0,High cost of water,Yes,Student,University/College,Private,"More than 2,000",50-100,1,Water tank,500000.0,Less than 2 hours,Yes,Poor water quality,Rarely,0,never,Yes,Daily,No,,,,,,,250000.0,,,,,,,,,Not Applicable,Water shortages,Provision of water pipes and water treatments,We actually need clean water for our health,27.5,3,[High cost of water],1020.0,750000.0,166.666667,500000.0,111.111111,250000.0,55.555556,,1,1.0,0.0,1.0,7.0,,,,,,,not applicable,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0
4,18-24,Female,Child,20,No formal education,Farming,River/Lake,500m-1km,25 - 30,Twice per day,35000.0,,3,3.0,High cost of water,Yes,Student,University/College,Public,"1,001-2,000",More than 200,1,Piped water,1000000.0,2-6 hours,Maybe,Long queues,More than 3 times,none,2-3 times,Yes,A few times per week,No,,,,,,,250000.0,,,,,,,,,Not Applicable,High cost of water,Setting up more piped water in every neighborh...,No,27.5,3,[High cost of water],500.0,35000.0,7.777778,1000000.0,222.222222,250000.0,55.555556,,1,4.0,2.5,1.0,4.0,,,,,,,not applicable,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [186]:
# Save the fully cleaned dataset to CSV
partially_cleaned_dataset.to_csv(
    "final_cleaned_dataset.csv", index=False, encoding="utf-8"
)