# LSMS urban calculations

For processed LSMS data, calculate DEGURBA urbanization calculation

In [1]:
import sys, os, math
import rasterio, pygeohash

import pandas as pd
import geopandas as gpd
import GOSTrocks.rasterMisc as rMisc

from tqdm.notebook import tqdm
from shapely.geometry import Point
from GOSTrocks.misc import tPrint


In [2]:
base_folder = "C:/WBG/Work/Projects/LSMS_Urban"
data_folder = os.path.join(base_folder, "data")
output_folder = os.path.join(base_folder, "output")
lsms_folder = os.path.join(data_folder, "Data")

for c_folder in [data_folder, output_folder, lsms_folder]:
    if not os.path.exists(c_folder):
        os.makedirs(c_folder)

m_crs = 6933

# Download LSMS data if not already present
data_download_url = "https://github.com/lsms-worldbank/LSMS-ISA-harmonised-dataset-on-agricultural-productivity-and-welfare/releases/download/v2.0/Data.zip"
data_zip_path = os.path.join(data_folder, "LSMS_data.zip")
if not os.path.exists(data_zip_path):
    import urllib.request
    print("Downloading LSMS data...")
    urllib.request.urlretrieve(data_download_url, data_zip_path)
    print("Download complete.")

    import zipfile
    with zipfile.ZipFile(data_zip_path, 'r') as zip_ref:
        zip_ref.extractall(data_folder)

# Download and unzip urbanization data if not already present
local_source_folder = "C:/WBG/Work/data/URBAN/"
urban_data_url = "https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_SMOD_GLOBE_R2023A/GHS_SMOD_E{year}_GLOBE_R2023A_54009_1000/V2-0/GHS_SMOD_E{year}_GLOBE_R2023A_54009_1000_V2_0.zip"  
out_zip_file = os.path.join(local_source_folder, "ZIPPED_SMOD", "GHS_SMOD_E{year}_GLOBE_R2023A_54009_1000_V2_0.zip")
urban_pop_url  = "https://jeodpp.jrc.ec.europa.eu/ftp/jrc-opendata/GHSL/GHS_POP_GLOBE_R2023A/GHS_POP_E{year}_GLOBE_R2023A_54009_1000/V1-0/GHS_POP_E{year}_GLOBE_R2023A_54009_1000_V1_0.zip"                
out_pop_zip_file = os.path.join(local_source_folder, "ZIPPED_SMOD", "GHS_POP_E{year}_GLOBE_R2023A_54009_1000_V1_0.zip")
out_folder = os.path.join(local_source_folder, "SMOD")
out_pop_folder = os.path.join(local_source_folder, "SMOD_POP")
ghs_smod_path = os.path.join(local_source_folder, "SMOD", "GHS_SMOD_E{year}_GLOBE_R2023A_54009_1000_V2_0.tif")
ghs_pop_path = os.path.join(local_source_folder, "SMOD_POP", "GHS_POP_E{year}_GLOBE_R2023A_54009_1000_V1_0.tif")

for year in [2005, 2010, 2015, 2020, 2025, 2030]:
    if not os.path.exists(out_zip_file.format(year=year)):
        print(f"Downloading GHS_SMOD data for {year}...")
        urllib.request.urlretrieve(urban_data_url.format(year=year), out_zip_file.format(year=year))
        print("Download complete.")

        with zipfile.ZipFile(out_zip_file.format(year=year), 'r') as zip_ref:
            zip_ref.extractall(out_folder)
    if not os.path.exists(out_pop_zip_file.format(year=year)):
        print(f"Downloading GHS_POP data for {year}...")
        urllib.request.urlretrieve(urban_pop_url.format(year=year), out_pop_zip_file.format(year=year))
        print("Download complete.")

        with zipfile.ZipFile(out_pop_zip_file.format(year=year), 'r') as zip_ref:
            zip_ref.extractall(out_folder)
    

In [3]:
os.listdir(lsms_folder)

['250716_hh_interview_date_AFECE.dta',
 'Household_dataset.dta',
 'Individual_dataset.dta',
 'Plotcrop_dataset.dta',
 'Plot_dataset.dta']

In [4]:
# Read household data and create GeoDataFrame
household_data = pd.read_stata(os.path.join(lsms_folder, "Household_dataset.dta"))
hh_data_geoms = [Point(xy) for xy in zip(household_data['lon_modified'], household_data['lat_modified'])]
household_gdf = gpd.GeoDataFrame(household_data, geometry=hh_data_geoms, crs=4326)
# Remove empty and invalid geometries
household_gdf = household_gdf[~household_gdf['geometry'].is_empty]
household_gdf = household_gdf[household_gdf['geometry'].is_valid]

# Create location_wave column that is designed to match with GHS-SMOD data
household_gdf['location_wave'] = household_gdf.apply(lambda x: str(x['country']) + "_" + str(x['wave']), axis=1)

# The interview date data is in a separate file not currently published with the main LSMS dataset
interview_date = pd.read_stata(os.path.join(lsms_folder, "250716_hh_interview_date_AFECE.dta"))
household_gdf = household_gdf.merge(interview_date[['hh_id_merge', 'household_interview_day']], on='hh_id_merge', how='left')
household_gdf['year'] = household_gdf['household_interview_day'].dt.year

# round the year of the interview to the GHS-SMOD data. Round down to nearest 5 years starting from 2010
# If the year is within 1 year of the next 5-year interval, round up instead
def round_ghs_year(year):
    if year < 2006:
        return 2005
    lower_bound = 2010 + 5 * ((year - 2010) // 5)
    upper_bound = lower_bound + 5
    if year - lower_bound <= 1:
        return lower_bound
    elif upper_bound - year <= 1:
        return upper_bound
    else:
        return lower_bound
    
household_gdf['ghs_smod_year'] = household_gdf['year'].apply(round_ghs_year)    
household_gdf.head()

One or more strings in the dta file could not be decoded using utf-8, and
so the fallback encoding of latin-1 is being used.  This can happen when a file
has been incorrectly encoded by Stata or some other software. You should verify
the string values returned are correct.
  household_data = pd.read_stata(os.path.join(lsms_folder, "Household_dataset.dta"))


Unnamed: 0,country,wave,hh_id_merge,hh_id_obs,season,pw,ea_id_merge,ea_id_obs,strataid,urban,...,totcons_LCU,totcons_USD,cons_quint,hh_asset_index,HDDS,geometry,location_wave,household_interview_day,year,ghs_smod_year
0,Ethiopia,1.0,1010101601162,1000001.0,1.0,1887.646118,10101088801601,1000002.0,7.0,No,...,4228.026367,301.780086,4.0,5.598236,5.0,POINT (37.89088 14.35382),Ethiopia_1.0,2012-01-09,2012.0,2010.0
1,Ethiopia,1.0,1010101601002,1000013.0,1.0,2236.134521,10101088801601,1000002.0,7.0,No,...,2336.555908,166.774278,2.0,0.241174,6.0,POINT (37.89088 14.35382),Ethiopia_1.0,2012-01-17,2012.0,2010.0
2,Ethiopia,1.0,1010101601017,1000014.0,1.0,2236.134521,10101088801601,1000002.0,7.0,No,...,2122.807129,151.517721,1.0,-0.369163,4.0,POINT (37.89088 14.35382),Ethiopia_1.0,2012-01-21,2012.0,2010.0
3,Ethiopia,1.0,1010101601034,1000015.0,1.0,2236.134521,10101088801601,1000002.0,7.0,No,...,6116.216309,436.551744,4.0,-0.175312,5.0,POINT (37.89088 14.35382),Ethiopia_1.0,2012-01-16,2012.0,2010.0
4,Ethiopia,1.0,1010101601049,1000016.0,1.0,2236.134521,10101088801601,1000002.0,7.0,No,...,3479.487305,248.352278,3.0,-0.175312,5.0,POINT (37.89088 14.35382),Ethiopia_1.0,2012-01-09,2012.0,2010.0


In [5]:
# The two MALI waves do not have dates in the interview dataset. Based on the table of information
# in the published paper, these will be set to 2015.0
household_gdf.loc[(household_gdf['country'] == 'Mali'), 'ghs_smod_year'] = 2015

# Calculate min and max interview years per country-wave to help with imputation
xx = household_gdf.groupby('location_wave').apply(lambda x: [x['household_interview_day'].min().year, x['household_interview_day'].max().year])
wave_dates = pd.DataFrame(xx.tolist(), index=xx.index, columns=['min_year', 'max_year'])
wave_dates.reset_index(inplace=True)

# For the remaining NAN values, look up based on country and wave
for idx, row in household_gdf[household_gdf['ghs_smod_year'].isna()].iterrows():
    c_wave = row['location_wave']
    c_min_year = wave_dates.loc[wave_dates['location_wave'] == c_wave, 'min_year'].values[0]
    household_gdf.loc[idx, 'ghs_smod_year'] = round_ghs_year(c_min_year)

# Check if there are any remaining NAN values
household_gdf.loc[household_gdf['ghs_smod_year'].isna(), ['country', 'wave', 'year', 'ghs_smod_year']]

  xx = household_gdf.groupby('location_wave').apply(lambda x: [x['household_interview_day'].min().year, x['household_interview_day'].max().year])


Unnamed: 0,country,wave,year,ghs_smod_year


In [6]:
# Create geohash and buffered geometries for zonal stats
household_gdf['geohash'] = household_gdf.apply(lambda row: pygeohash.encode(row['lat_modified'], row['lon_modified'], precision=7), axis=1) 
household_gdf_buffered = household_gdf.to_crs(epsg=m_crs)
household_gdf_buffered['geometry'] = household_gdf_buffered.geometry.buffer(5000)
household_gdf_buffered.loc[household_gdf_buffered['urban'] == "Yes", 'geometry'] = household_gdf.loc[household_gdf['urban'] == "Yes"].geometry.buffer(2000)

# concatenate geohash and ghs_smod_year for unique identification of household and interview
household_gdf_buffered['geohash_year'] = household_gdf_buffered.apply(lambda x: str(x['geohash']) + "_" + str(x['ghs_smod_year']), axis=1)

# Get a list of unique locations for zonal stats based on geohash_smod_year
unique_geohashes = household_gdf_buffered['geohash_year'].unique()

print(f"Calculating zonal stats for {len(unique_geohashes)} unique geohash-year combinations of {household_gdf_buffered.shape[0]} households")


  household_gdf_buffered.loc[household_gdf_buffered['urban'] == "Yes", 'geometry'] = household_gdf.loc[household_gdf['urban'] == "Yes"].geometry.buffer(2000)


Calculating zonal stats for 14123 unique geohash-year combinations of 293669 households


In [7]:
unq_locations = household_gdf_buffered['geohash_year'].duplicated(keep='first')
zonal_households_gdf = household_gdf_buffered[~unq_locations].copy()
zonal_households_gdf.loc[:,['geohash_year', 'ghs_smod_year', 'geocoords_id', 'lat_modified', 'lon_modified', 'geometry']]

Unnamed: 0,geohash_year,ghs_smod_year,geocoords_id,lat_modified,lon_modified,geometry
0,sfdcwjp_2010.0,2010.0,1001574.0,14.353816,37.890876,"POLYGON ((3660949.711 1812566.502, 3660925.635..."
12,sfe1m3p_2010.0,2010.0,1001569.0,14.288590,38.210252,"POLYGON ((3691765.075 1804497.319, 3691740.999..."
24,sfe272x_2010.0,2010.0,1001542.0,14.109761,38.473835,"POLYGON ((2038.474 14.11, 2028.843 -181.925, 2..."
36,sf7qg8h_2010.0,2010.0,1001519.0,13.844084,38.480325,"POLYGON ((3717823.426 1749447.24, 3717799.35 1..."
48,sfe00k6_2010.0,2010.0,1001535.0,14.086276,37.983187,"POLYGON ((3669856.382 1779454.599, 3669832.305..."
...,...,...,...,...,...,...
288661,kxvu9fd_2010.0,2010.0,7000074.0,-0.600329,30.665941,"POLYGON ((2030.666 -0.6, 2021.035 -196.635, 19..."
289521,s8j3mmu_2010.0,2010.0,7000263.0,0.252456,30.120001,"POLYGON ((2030.12 0.252, 2020.489 -195.782, 19..."
291545,s8j8vyn_2010.0,2010.0,7000235.0,0.165164,30.496117,"POLYGON ((2947456.857 21070.508, 2947432.78 20..."
291597,s8n6wgx_2010.0,2010.0,7000527.0,0.459744,31.595730,"POLYGON ((3053554.443 58650.454, 3053530.367 5..."


In [8]:
# Group zonal households by ghs_smod_year to process each year's raster separately
zonal_stats_list = []
zonal_pop_list = []
for ghs_year in tqdm(zonal_households_gdf['ghs_smod_year'].unique()):
    if not math.isnan(ghs_year):
        ghs_smod_path_year = ghs_smod_path.format(year=str(int(ghs_year)))
        ghs_pop_path_year = ghs_pop_path.format(year=str(int(ghs_year)))

        in_smod = rasterio.open(ghs_smod_path_year)
        in_pop = rasterio.open(ghs_pop_path_year)

        zonal_subset = zonal_households_gdf[zonal_households_gdf['ghs_smod_year'] == ghs_year]
        
        # Generate zonal statistics for buffered household locations for this SMOD year
        out_ghs_columns = ['GHS_SMOD_11', 'GHS_SMOD_12', 'GHS_SMOD_13', 'GHS_SMOD_21', 'GHS_SMOD_22', 'GHS_SMOD_23', 'GHS_SMOD_30']
        zonal_stats = rMisc.zonalStats(zonal_subset, in_smod, rastType='C', unqVals=[11,12,13,21,22,23,30], reProj=True)
        zonal_stats_df = pd.DataFrame(zonal_stats, columns=out_ghs_columns)
        zonal_stats_df['g_id'] = zonal_subset['geohash'].values
        
        for cCol in out_ghs_columns:
            zonal_households_gdf.loc[zonal_subset.index, cCol] = zonal_stats_df[cCol].values

        # Clip population and SMOD rasters to the zonal subset in order to calculate urban population
        pop_data, pop_meta = rMisc.clipRaster(in_pop, zonal_subset, crop=False)
        ghs_data, ghs_meta = rMisc.clipRaster(in_smod, zonal_subset, crop=False)

        # Loop through the urban classes and calculate the urban population
        for urb_class in ['11', '12', '13', '21', '22', '23', '30']:
            out_col = f'pop_urb_{urb_class}'
            out_ghs_columns.append(out_col)
            if not out_col in zonal_households_gdf.columns:            
                urb_mask = (ghs_data == int(urb_class)).astype(rasterio.uint8)
                pop_masked = pop_data * urb_mask

                with rMisc.create_rasterio_inmemory(pop_meta, pop_masked) as temp_pop:
                    urb_pop = rMisc.zonalStats(zonal_subset, temp_pop, rastType='N', minVal=0, reProj=True)
                    urb_pop_df = pd.DataFrame(urb_pop, columns=["SUM", "MIN", "MAX", "MEAN"])
                    zonal_households_gdf.loc[zonal_subset.index, out_col] = urb_pop_df['SUM'].values

out_ghs_columns.append('geohash_year')
households_results = pd.merge(zonal_households_gdf.loc[:, out_ghs_columns], household_gdf_buffered, on="geohash_year", how='left')

import warnings
def try_get_max(row, col_list, prefix):
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=FutureWarning)
        try:
            return row[col_list].idxmax().replace(prefix, '')
        except:
            return -1


# Calculate the dominant urban class based on population and area
urban_pop_cols = [f'pop_urb_{c}' for c in ['11', '12', '13', '21', '22', '23', '30']]
households_results['dominant_urban_pop'] = households_results[urban_pop_cols].apply(lambda row: try_get_max(row, urban_pop_cols, 'pop_urb_'), axis=1)

urban_area_cols = ['GHS_SMOD_11', 'GHS_SMOD_12', 'GHS_SMOD_13', 'GHS_SMOD_21', 'GHS_SMOD_22', 'GHS_SMOD_23', 'GHS_SMOD_30']
households_results['dominant_urban_area'] = households_results[urban_area_cols].apply(lambda row: try_get_max(row, urban_area_cols, 'GHS_SMOD_'), axis=1)

# Calculate total population and urban population
households_results['total_pop'] = households_results[urban_pop_cols].sum(axis=1)
households_results['urban_pop'] = households_results['pop_urb_30'] + households_results['pop_urb_23']


  0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
households_results.head()

Unnamed: 0,GHS_SMOD_11,GHS_SMOD_12,GHS_SMOD_13,GHS_SMOD_21,GHS_SMOD_22,GHS_SMOD_23,GHS_SMOD_30,pop_urb_11,pop_urb_12,pop_urb_13,...,geometry,location_wave,household_interview_day,year,ghs_smod_year,geohash,dominant_urban_pop,dominant_urban_area,total_pop,urban_pop
0,74.0,5.0,0.0,0.0,0.0,0.0,0.0,1015.656867,1246.945977,0.0,...,"POLYGON ((3660949.711 1812566.502, 3660925.635...",Ethiopia_1.0,2012-01-09,2012.0,2010.0,sfdcwjp,12,11,2262.602844,0.0
1,74.0,5.0,0.0,0.0,0.0,0.0,0.0,1015.656867,1246.945977,0.0,...,"POLYGON ((3660949.711 1812566.502, 3660925.635...",Ethiopia_1.0,2012-01-17,2012.0,2010.0,sfdcwjp,12,11,2262.602844,0.0
2,74.0,5.0,0.0,0.0,0.0,0.0,0.0,1015.656867,1246.945977,0.0,...,"POLYGON ((3660949.711 1812566.502, 3660925.635...",Ethiopia_1.0,2012-01-21,2012.0,2010.0,sfdcwjp,12,11,2262.602844,0.0
3,74.0,5.0,0.0,0.0,0.0,0.0,0.0,1015.656867,1246.945977,0.0,...,"POLYGON ((3660949.711 1812566.502, 3660925.635...",Ethiopia_1.0,2012-01-16,2012.0,2010.0,sfdcwjp,12,11,2262.602844,0.0
4,74.0,5.0,0.0,0.0,0.0,0.0,0.0,1015.656867,1246.945977,0.0,...,"POLYGON ((3660949.711 1812566.502, 3660925.635...",Ethiopia_1.0,2012-01-09,2012.0,2010.0,sfdcwjp,12,11,2262.602844,0.0


In [None]:
# Export results to CSV and GeoPackage
out_urban_cols = ['hh_id_merge', 'geohash','urban','year','ghs_smod_year','geometry','geocoords_id', 'lat_modified', 'lon_modified', 'dominant_urban_pop', 'dominant_urban_area','total_pop','urban_pop'] + urban_pop_cols + urban_area_cols
households_results.loc[:,out_urban_cols].to_csv(os.path.join(output_folder, "LSMS_household_urban_classifications.csv"), index=False)
households_results = gpd.GeoDataFrame(households_results, geometry='geometry', crs=4326)
households_results.loc[:,].to_file(os.path.join(output_folder, "LSMS_household_urban_classifications.gpkg"), driver='GPKG')

In [14]:
households_results.loc[:,out_urban_cols].columns

Index(['hh_id_merge', 'geohash', 'urban', 'year', 'ghs_smod_year', 'geometry',
       'geocoords_id', 'lat_modified', 'lon_modified', 'dominant_urban_pop',
       'dominant_urban_area', 'total_pop', 'urban_pop', 'pop_urb_11',
       'pop_urb_12', 'pop_urb_13', 'pop_urb_21', 'pop_urb_22', 'pop_urb_23',
       'pop_urb_30', 'GHS_SMOD_11', 'GHS_SMOD_12', 'GHS_SMOD_13',
       'GHS_SMOD_21', 'GHS_SMOD_22', 'GHS_SMOD_23', 'GHS_SMOD_30'],
      dtype='object')

# TT to nearest city (UCDB)

zonal_households_gdf.total_bounds