## This notebook uses the micro-data sample from the Ghana census 2021 to construct a stratification frame, both at the district level and at the spatial unit level. 

# 0. Import Packages

In [None]:
import pandas as pd
import shapely.wkt
import pickle
import os
import re

# Lift pandas' display limits so DataFrames are shown without truncating rows or columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

# 1. Read Data

In [None]:
# Pre-processed micro-data sample, loaded from Parquet.
# Later cells group this frame by District, Age, Female, SchoolMiddleSchoolOrGreater,
# RWI, NDC_%, NPP_% and Age_coded, and read a 'Weight' column from the grouped result.
df_census = pd.read_parquet('../data/df_census_complete.parquet') 
df_census.head()

In [None]:
# Read subdistricts to spatial unit mapping
# NOTE(review): unpickling can execute arbitrary code, so this file must come from a trusted source.
with open('../data/subdistrict_to_spatial_unit_dict.pickle', 'rb') as handle:
    # The 'polygon' column is dropped straight after loading; later cells use the
    # 'subdistrict' and 'spatial_unit' columns of the remaining frame.
    df_subdistricts = pd.read_pickle(handle).drop(columns=['polygon'])
df_subdistricts.head()

In [None]:
# RWI per spatial unit. The CSV stores each spatial unit as WKT text, so the
# column is parsed back into geometry objects before it is used as a merge key.
df_rwi = pd.read_csv('../data/df_rwi_unit.csv')
df_rwi['spatial_unit'] = df_rwi['spatial_unit'].map(shapely.wkt.loads)
df_rwi.head()

In [None]:
# Dataframe with averaged RWI at the district level.
# It is merged on 'District' later and serves as the fallback for spatial units
# that have no RWI of their own.
df_districts_rwi = pd.read_csv('../data/df_districts_rwi.csv')
df_districts_rwi.head()

In [None]:
# Population estimates per spatial unit. The spatial units are stored as WKT text
# in the CSV and are parsed back into geometry objects here.
df_pop_density = pd.read_csv('../data/gha_pop_density.csv')
df_pop_density['spatial_unit'] = df_pop_density['spatial_unit'].map(shapely.wkt.loads)
df_pop_density.head()

In [None]:
# Read dataframe of regions and their respective populations and the districts within them

def format_district_name(district: str) -> str:
    """Return the compact form of a district name.

    Every space is removed first. Then a trailing ``Municipal`` and the
    trailing ``MetropolitanArea(<code>)`` markers for STMA, CCMA, AMA, KMA
    and TMA are stripped, each tried once, in that order.
    """
    # Suffix patterns, applied one after another in exactly this order.
    trailing_markers = (
        r'Municipal$',
        r'MetropolitanArea\(STMA\)$',
        r'MetropolitanArea\(CCMA\)$',
        r'MetropolitanArea\(AMA\)$',
        r'MetropolitanArea\(KMA\)$',
        r'MetropolitanArea\(TMA\)$',
    )

    compact_name = ''.join(district.split(' '))
    for marker in trailing_markers:
        compact_name = re.sub(marker, '', compact_name)

    return compact_name

# One row per region: its census population and the list of its districts.
# Every CSV column other than 'region' and 'population' is treated as a district name.
df_regions = pd.read_csv('../data/census_regions.csv')
# Strip comma thousands separators before converting the population to int.
df_regions['population'] = df_regions['population'].apply(lambda x: int(str(x).replace(',', '')))
# Collapse the district columns into one formatted list per region in a single pass.
# pd.notna skips the empty cells of regions that have fewer districts than there are
# columns, without relying on the string form of NaN.
df_regions['districts'] = pd.Series(
    [
        [format_district_name(name) for name in row if pd.notna(name)]
        for row in df_regions.drop(columns=['region', 'population']).values.tolist()
    ],
    index=df_regions.index,
)
df_regions = df_regions[['region', 'population', 'districts']]
df_regions.head()

In [None]:
# Print each region's list of formatted district names, one list per line.
# Iterating the column directly does not depend on the index being 0..n-1.
for region_districts in df_regions['districts']:
    print(region_districts)

# 2. Create Stratification Frame at the District Level

In [None]:
# Collapse the micro-data into one row per stratum: each distinct combination of the
# columns below forms a group, and all remaining columns are summed within it.
# NOTE(review): groupby drops rows whose key contains NaN by default; confirm that
# none of the grouping columns have missing values.
grouped_data = df_census.groupby([
    'District', 
    'Age',
    'Female',
    'SchoolMiddleSchoolOrGreater', 
    'RWI', 
    'NDC_%', 
    'NPP_%', 
    'Age_coded'
]).sum()

# After the groupby the keys live in the index; reset_index() turns them back into
# columns for the CSV export only. grouped_data itself keeps the grouped index.
grouped_data.reset_index().to_csv('../data/outputs/df_stratification_frame.csv', index=False)
grouped_data.head()

# 3. Create Stratification Frame at the Spatial Unit Level

In [None]:
# Drop 'RWI' (it is at the district level here) and 'Age_coded'; df_census itself is left unchanged.
df_census_i = df_census.drop(columns=['RWI', 'Age_coded'])

In [None]:
# Average covariates by district (one output row per District, kept as a column).
# NOTE(review): this is an unweighted mean over rows. A 'Weight' column is used later
# in the notebook, so confirm that an unweighted average is what is intended here.
df_grouped = df_census_i.groupby('District', as_index=False)[['Age', 'Female',
                                          'SchoolMiddleSchoolOrGreater',
                                          'NDC_%', 'NPP_%']].mean()
df_grouped.head()

In [None]:
# Merge spatial units to micro-data by district: each spatial unit receives the averaged
# covariates of the district named in its 'subdistrict' column.
# The left join keeps units whose subdistrict has no match in df_grouped; their
# covariates and 'District' are NaN, and the 'subdistrict' name is dropped afterwards.
df_subdistricts_i = df_subdistricts.merge(df_grouped, how='left', left_on='subdistrict', right_on='District').drop(columns=['subdistrict'])
df_subdistricts_i.head()

In [None]:
# Add RWI at the spatial unit level.
# Left join on 'spatial_unit': units with no row in df_rwi keep NaN in the columns
# that come from df_rwi.
df_subdistricts_rwi = df_subdistricts_i.merge(df_rwi, how='left', on='spatial_unit')
df_subdistricts_rwi.head()

In [None]:
# Fall back to the district-level RWI wherever a spatial unit has no RWI of its own.
# The code relies on both frames carrying an 'RWI' column: the merge then names the
# unit-level one 'RWI_x' and the district-level one 'RWI_y'.
df_subdistricts_rwi_ = (
    df_subdistricts_rwi
    .merge(df_districts_rwi, how='left', on='District')
    .assign(RWI_x=lambda frame: frame['RWI_x'].fillna(frame['RWI_y']))
    .drop(columns=['RWI_y'])
    .rename(columns={'RWI_x': 'RWI'})
)

df_subdistricts_rwi_.to_csv('../data/outputs/df_stratification_frame_unit.csv', index=False)
df_subdistricts_rwi_.head()

# 4. Get Population Estimates across Strata for each Spatial Unit

In this section, we use the stratification frame at the spatial unit level and the population estimates at the spatial unit level to get population estimates across each stratum for every spatial unit. 

In [None]:
# merge population estimates to each spatial unit
# Left join on 'spatial_unit' that brings in only the 'pop_2020' column; units that are
# absent from df_pop_density get NaN.
df_subdistricts_rwi_ = df_subdistricts_rwi_.merge(df_pop_density[['spatial_unit', 'pop_2020']], how='left', on='spatial_unit')
df_subdistricts_rwi_.head()

In [None]:
# Per-district mean of the spatial-unit populations, used to fill in units whose
# pop_2020 is missing with the average of the other units in the same district.
df_pop_avgs = (
    df_subdistricts_rwi_
    .groupby('District', as_index=False)['pop_2020']
    .mean()
    .rename(columns={'pop_2020': 'pop_2020_avg'})
)
df_subdistricts_rwi_ = (
    df_subdistricts_rwi_
    .merge(df_pop_avgs, how='left', on='District')
    .assign(pop_2020=lambda frame: frame['pop_2020'].fillna(frame['pop_2020_avg']))
    .drop(columns=['pop_2020_avg'])
)
df_subdistricts_rwi_.to_csv('../data/outputs/df_stratification_frame_unit_population.csv', index=False)
df_subdistricts_rwi_.head()

In [None]:
def get_spatial_unit_population(spatial_unit):
    """
    Helper function to get population estimate and ID of spatial unit.

    The unit is looked up in the module-level ``df_pop_density`` frame by
    equality on its ``spatial_unit`` column; the first matching row is used.

    Parameters:
        spatial_unit: Spatial unit object (Polygon object)

    Returns:
        (pop, ID): ``pop_2020`` and ``ID`` of the first matching row, or
        ``False`` when no row matches the spatial unit.

    Raises:
        KeyError: if ``df_pop_density`` lacks one of the columns read here.
            Such schema errors are no longer reported as "unit not found".
    """
    # Filter once and reuse the result for both columns.
    matches = df_pop_density[df_pop_density['spatial_unit'] == spatial_unit]

    # "No such unit" is the only condition reported as False; any other failure
    # propagates instead of being hidden by a bare except.
    if matches.empty:
        return False

    # Read each column separately so every value keeps its own column dtype.
    pop = matches['pop_2020'].values[0]
    ID = matches['ID'].values[0]

    return (pop, ID)


def get_spatial_units(district: str):
    """
    Look up the spatial units mapped to a district.

    Parameters:
        district: Name matched exactly against the 'subdistrict' column of the
            module-level df_subdistricts frame.

    Returns:
        A list holding the 'spatial_unit' value of every matching row
        (empty when no row matches).
    """
    in_district = df_subdistricts['subdistrict'] == district
    return df_subdistricts.loc[in_district, 'spatial_unit'].values.tolist()

In [None]:
# for each district, calculate the population estimates for each stratum at the spatial unit level 
total_population_dict = {}  # Dictionary to store total population for each subregion (not filled in by this cell)

# Folder receiving one Parquet file per district; created up front so the first
# write cannot fail on a missing directory.
district_output_dir = "../data/outputs/district_dfs/" # choose folder name
os.makedirs(district_output_dir, exist_ok=True)

# Turn the grouping keys back into columns. Guarded so that re-running the cell does
# not reset an already flat frame and add a stray 'index' column.
if 'District' not in grouped_data.columns:
    grouped_data = grouped_data.reset_index()

for d in grouped_data.District.unique():
    # drop=True renumbers the rows 0..k-1 without creating an 'index' column that
    # would have to be removed again before saving.
    subregion_group = grouped_data[grouped_data['District'] == d].reset_index(drop=True)
    # Share of the district's total weight held by each stratum.
    subregion_weights = (subregion_group['Weight'] / subregion_group['Weight'].sum()).values

    est_pop_dict = {}
    for unit in get_spatial_units(d):
        # Look each unit up once; False means it has no population entry.
        unit_population = get_spatial_unit_population(unit)
        if not unit_population:
            continue
        total_population, ID = unit_population
        # One column per spatial unit: its population split across the strata.
        est_pop_dict["est_pop_" + str(ID)] = subregion_weights * total_population

    # Stratum columns first, followed by one est_pop_<ID> column per spatial unit.
    df_subregion = pd.concat([subregion_group, pd.DataFrame(est_pop_dict)], axis=1)

    # save each district as a new dataframe 
    df_subregion.to_parquet(district_output_dir + d.replace('-',''))
    

# 5. Population Validation


In [None]:
# Verify population counts with district and region

# Folder holding the per-district Parquet files
folder_path = '../data/outputs/district_dfs/'

# Columns describing the stratum itself; they are left out of the totals, so only the
# remaining columns of each file are summed
columns_to_exclude = ['Age', 'Female', 'SchoolMiddleSchoolOrGreater', 'Weight', 'RWI', 'NDC_%', 'NPP_%', 'Age_coded', 'District'] 

# Maps each file name in the folder to the grand total of its remaining columns
sum_results = {}

for filename in os.listdir(folder_path):
    district_frame = pd.read_parquet(os.path.join(folder_path, filename))
    remaining_columns = district_frame.drop(columns=columns_to_exclude)

    # The first sum() totals each column; the second adds those column totals together
    sum_results[filename] = remaining_columns.sum().sum()


In [None]:
# def convert_district_string(input_string):
#     if input_string == "Komenda Edina Eguafo Abirem Municipal":
#         return 'Komenda-Edina-Eguafo-Abirem-'
    
#     if input_string == 'Abura Asebu Kwamankese':
#         return 'Abura-Asebu-Kwamankese'

#     # Replace '/' with '-'
#     replaced_string = input_string.replace('-', '')
#     replaced_string = replaced_string.replace('/', '-')
    
#     # Remove all spaces
#     transformed_string = replaced_string.replace(' ', '')
    
#     # Remove the trailing 'Municipal' and 'MetropolitanArea(STMA)' if it exists
#     result_string = re.sub(r'Municipal$', '', transformed_string)
#     result_string = re.sub(r'MetropolitanArea\(STMA\)$', '', result_string)
#     result_string = re.sub(r'MetropolitanArea\(CCMA\)$', '', result_string)
    
#     return result_string

# Region population validation: add up the estimated totals of every district in a
# region and compare the result with the region's census population.
for i, r in df_regions.iterrows(): 
    summed_pop = 0
    for district in r['districts']:
        # Printed before the lookup so a missing district file is easy to identify.
        print(district)
        # summed_pop += sum_results[convert_district_string(district)]
        # summed_pop += sum_results[district.replace("/", "&")]
        summed_pop += sum_results[district]
    
    actual_pop = r['population']
    # Express the gap relative to the census figure, which is the reference value.
    # Dividing by the estimate instead understates over-estimates, overstates
    # under-estimates, and fails when a region's estimate adds up to zero.
    pct_change = round(((summed_pop - actual_pop) / actual_pop) * 100, 2)
    print(r['region'], "\nActual:", actual_pop, "\nEstimated:", round(summed_pop), "\n% change: " + str(pct_change) + "%\n")