In [1]:
import gzip
from string import capwords
from pathlib import Path
import warnings

import geopandas as gpd
import numpy as np
import pandas as pd
import seaborn as sns
import rasterio.mask
import rasterio.warp
import requests_cache
import wbgapi as wb
import pygadm
from sklearn.linear_model import LinearRegression
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)
sns.set_theme()
requests_cache.install_cache()
# research years
years = list(range(2013, 2021))

BASE_DIR = Path("/Volumes/TOSHIBA")

dst_dir = BASE_DIR / "nightlight"
research_area = ["UGA", "TZA", "KEN", "MOZ"]
# Zanzibar is not in research area
Zanzibar = ("Mjini Magharibi", "Kusini Unguja", "Kaskazini Unguja", "Kusini Pemba", "Kaskazini Pemba")
columns = ["GID_0", "NAME", *years]
ids = ["GID_0", "NAME", "YEAR"]

def get_geometry() -> gpd.GeoDataFrame:
    # use this instead of GADM since it has 136 districts
    uga_geom = gpd.read_file("https://data.unhcr.org/en/documents/download/83043")[["d", "geometry"]].rename(columns={"d": "NAME"})
    uga_geom["GID_0"] = "UGA"

    return pd.concat([
        (
            pygadm.get_items(admin=["TZA", "KEN", "MOZ"], content_level=1)
            [["GID_0", "NAME_1", "geometry"]]
            .rename(columns={"NAME_1": "NAME"})
        ),
        uga_geom,
        (
            pygadm.get_items(admin=research_area, content_level=0)
            .rename(columns={"NAME_0": "NAME"})
        ),
    ]).reset_index(drop=True)

if (geom_path := Path("data/geometry.json")).exists():
    geo = gpd.read_file(geom_path, driver="GeoJSON")
else:
    geo = get_geometry()
    geo.to_file(geom_path, driver="GeoJSON")

In [2]:
wbdf = wb.data.DataFrame(["NY.GDP.MKTP.CN", "NY.GDP.MKTP.PP.KD"], research_area, years).rename(columns=lambda name: int(name.replace("YR", "")))
# GDP current LCU
CN = wbdf.loc[(slice(None), "NY.GDP.MKTP.CN"),].droplevel(1)
# GDP constant 2017 international $
KD = wbdf.loc[(slice(None), "NY.GDP.MKTP.PP.KD"),].droplevel(1)
# factor to convert current LCU to constant 2017
factor = KD / CN
ken = (
        pd.read_excel("data/gdp.xlsx", sheet_name="KEN", index_col=0)
        .rename(capwords)
        # rename for merging to geom
        .rename({
            "Elgeyo Marakwet": "Elgeyo-Marakwet",
            "Kisi": "Kisii",
            "Muranga": "Murang'a",
            "Tharaka Nithi": "Tharaka-Nithi",
            "Uasin-gishu": "Uasin Gishu",
            "Total": "Kenya",
        })
        * 1e6
        * factor.loc["KEN"]
)
ken = ken.rename_axis("NAME").reset_index()
ken["GID_0"] = "KEN"

tza = pd.read_excel("data/gdp.xlsx", sheet_name="TZA", skiprows=1, index_col=0) * 1e6
tza_pop = tza / pd.read_excel("data/gdp.xlsx", sheet_name="TZA_per_capita", index_col=0)
tza *= factor.loc["TZA"]
tza = tza.rename_axis("NAME").reset_index()
tza["GID_0"] = "TZA"
tza_pop = tza_pop.rename_axis("NAME").reset_index()
tza_pop["GID_0"] = "TZA"

moz_renames = {
    "Maputo Cidade": "Maputo City",
    "Maputo Província": "Maputo",
    "Zambézia": "Zambezia",
    "Niassa": "Nassa",
    "Moçambique": "Mozambique"
}
moz = pd.read_excel("data/gdp.xlsx", sheet_name="MOZ", index_col=0).rename(moz_renames) * 1e6
moz_pop = moz / pd.read_excel("data/gdp.xlsx", sheet_name="MOZ_per_capita", index_col=0).rename(moz_renames)
moz *= factor.loc["MOZ"]
# Drop region sum
moz = moz.drop(['Sul', 'Centro', 'Norte']).rename_axis("NAME").reset_index()
moz["GID_0"] = "MOZ"
moz_pop = moz_pop.drop(["Norte", "Centro", "Sul"]).rename_axis("NAME").reset_index()
moz_pop["GID_0"] = "MOZ"
gdp = pd.concat([ken, tza, moz], ignore_index=True)[columns]
gdp.loc[len(gdp.index)] = ["UGA", "Uganda", *KD.loc["UGA"]]
gdp = gdp.melt(id_vars=ids[:2], var_name=ids[2], value_name="GDP")
gdp = gdp.replace({"Mainland Tanzania": "Tanzania"})
gdp = gdp.set_index(["GID_0", "NAME", "YEAR"]).sort_index()
gdp.loc[("TZA", "Tanzania"), "GDP"] = KD.loc["TZA"].values

In [3]:
# Kenya population estimating
def calc_pop(geom):
    pop_map = rasterio.mask.mask(ppp, geom, crop=True)[0]
    return pop_map[pop_map > 0].sum()

if (ken_pop_path := Path("data/ken_pop.xlsx")).exists():
    ken_pop = pd.read_excel(ken_pop_path)
else:
    def calc_pop(geom):
        pop_map = rasterio.mask.mask(ppp, geom, crop=True)[0]
        return pop_map[pop_map > 0].sum()

    ken_pop = pd.DataFrame()

    for year in years:
        ppp = rasterio.open(BASE_DIR / "worldpop" / f"ken_ppp_{year}.tiff")
        ken_pop[year] = geo.loc[geo["GID_0"] == "KEN", "geometry"].apply(calc_pop)
    ken_pop["NAME"] = geo.loc[geo["GID_0"] == "KEN", "NAME"]
    ken_pop["GID_0"] = "KEN"
    ken_pop = ken_pop.set_index("NAME")
    ken_pop_census = pd.read_excel("data/gdp.xlsx", sheet_name="KEN_pop", index_col=0)
    pop_factor = ken_pop_census["Total"] / ken_pop[2019]
    for year in years:
        ken_pop[year] *= pop_factor

    pop_factor = ken_pop_census["Total"] / ken_pop[2019]
    for year in years:
        ken_pop[year] *= pop_factor
    ken_pop = ken_pop.reset_index()
    ken_pop.to_excel("ken_pop.xlsx", index=False)
uga_pop_2015 = (
    pd.read_excel(
        "data/Rural_Urban_Population_for_the_146_Districts_in_Uganda.xlsx",
        index_col=0,
        skiprows=(1,),
    )
    .rename(
        columns={
                    f"Unnamed: {(year - 2014) * 3}": f"{year}_Urban" for year in range(2015, 2031)
                } | {
                    f"Unnamed: {(year - 2014) * 3 + 1}": f"{year}_Rural" for year in range(2015, 2031)
                }
    )
    .set_index("District")
    .rename(index={
        "KampalaCityAuthority": "Kampala",
    })
)
merge_dict = {
    "AruaCity": "Arua",
    "FortPortal City": "Kabarole",
    "Lira City": "Lira",
    "Jinja City": "Jinja",
    "Mbarara City": "Mbarara",
    "Gulu City": "Gulu",
    "Hoima City": "Hoima",
    "Soroti City": "Soroti",
    "Masaka City": "Masaka",
    "MbaleCity": "Mbale",
}
for from_, to_ in merge_dict.items():
    uga_pop_2015.loc[to_] += uga_pop_2015.loc[from_]
uga_pop_2015 = uga_pop_2015.drop(merge_dict.keys())
uga_pop_2014 = pd.concat([
    pd.read_excel(
        "data/Census_Population_counts_(2002_and_2014)_by_Region,_District_and_Mid-Year_Population_projections_(2015-2021).xlsx",
        usecols=(1, 3),
        index_col=0,
        skiprows=(0, 1, 2, 3),
    ).dropna().drop("National"),
    pd.DataFrame(data=((93_593 + 105_710,),), columns=(2014,), index=("Terego",))
])
uga_pop_2014.loc["Arua"] -= uga_pop_2014.loc["Terego"]
uga_pop = uga_pop_2014.rename(str.strip).rename({"Kyakwanzi": "Kyankwanzi"}).merge(uga_pop_2015, left_index=True, right_index=True, how="outer")[range(2014, 2021)]
X = np.arange(2014, 2021).reshape((-1, 1))
uga_pop[2013] = uga_pop.apply(lambda row: LinearRegression().fit(X, row).predict([[2013]])[0], axis=1)
uga_pop["GID_0"] = "UGA"
uga_pop = uga_pop.rename_axis("NAME").reset_index()[["GID_0", "NAME", *years]]
uga_pop.loc[len(uga_pop.index)] = ["UGA", "Uganda", *uga_pop[years].sum()]
pop = pd.concat([moz_pop, tza_pop, ken_pop, uga_pop])[columns]
pop = pop.replace({"Mainland Tanzania": "Tanzania"})
pop = pop.melt(id_vars=ids[:2], var_name=ids[2], value_name="POP").set_index(["GID_0", "NAME", "YEAR"]).sort_index()
pop.loc[("TZA", "Songwe", [2013, 2014]), "POP"] = ([5_423_178 * 1e6 / 1_950_609, 6_145_692 * 1e6 / 2_150_338] - pop.loc[("TZA", "Mbeya", [2013, 2014]), "POP"]).values
pop.loc[("TZA", "Katavi", 2013), "POP"] = 2_483_395 * 1e6 / 1_533_513 - pop.loc[("TZA", "Rukwa", 2013), "POP"]

In [4]:
if (ntl_path := Path("data/ntl.parquet")).exists():
    ntl = pd.read_parquet(ntl_path)
else:
    ntl = []
    for gz in (BASE_DIR / "nighttime_light" / "annual" / "v20").glob("**/VNL_*.average_masked.tif.gz"):
        year = int(gz.name.split("_")[3][:4])
        if year not in years:
            continue
        print(year)
        if not (tif := gz.parent / gz.stem).exists():
            with gzip.open(gz, "rb") as f:
                tif.write_bytes(f.read())
        with rasterio.open(tif) as src:
            def calc_ntl(row):
                crop = rasterio.mask.mask(src, [row["geometry"]], crop=True)[0]
                crop[crop < 0] = 0
                return pd.Series(
                    [row["GID_0"], row["NAME"], year, crop.sum(), np.log1p(crop).sum(), np.count_nonzero(crop)],
                    index=["GID_0", "NAME", "YEAR", "NTL", "logNTL", "NTLc"]
                )
            ntl.append(geo.apply(calc_ntl, axis=1))
    ntl = pd.concat(ntl)
    ntl.set_index(ids).to_parquet(ntl_path)
ntl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NTL,logNTL,NTLc
GID_0,NAME,YEAR,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TZA,Arusha,2013,2808.747803,1399.397461,2343
TZA,Dar es Salaam,2013,21267.341797,5945.849609,4253
TZA,Dodoma,2013,2561.721680,1213.898193,2258
TZA,Geita,2013,1129.858154,489.925537,920
TZA,Iringa,2013,1325.369385,756.524597,1718
...,...,...,...,...,...
UGA,Arua,2020,348.320862,183.079224,213
UGA,Uganda,2020,43146.335938,17796.031250,18430
TZA,Tanzania,2020,108459.976562,47386.421875,61800
KEN,Kenya,2020,158556.921875,64531.050781,82460


In [5]:
data = ntl.merge(pop, how="outer", left_index=True, right_index=True).merge(gdp, how="outer", left_index=True, right_index=True)
geo["AREA"] = geo["geometry"].to_crs("epsg:3857").area
data["AREA"] = geo.set_index(["GID_0", "NAME"]).loc[data.index.droplevel("YEAR"), "AREA"].values
data["LVL"] = (~data.index.get_level_values("NAME").isin(("Kenya", "Mozambique", "Tanzania", "Uganda"))).astype(int)
data.to_parquet("data.parquet")
zanzibar_ntl = data.loc[("TZA", Zanzibar), ("NTL", "logNTL", "NTLc", "AREA")].groupby(level=2).sum()
for year in years:
    cols = ("NTL", "logNTL", "NTLc", "AREA")
    # Simiyu has no data, so add NTL data to Shinyanga
    data.loc[("TZA", "Shinyanga", year), cols] += data.loc[("TZA", "Simiyu", year), cols]
    # Remove Zanzibar's NTL because GDP/POP data not include
    data.loc[("TZA", "Tanzania", year), cols] -= zanzibar_ntl.loc[year, :].values
data = data[~data["POP"].isna()].sort_index()
data["POP"] = data["POP"].round().astype(int)
gb = data.groupby(level=["GID_0", "YEAR"]).max()
for col in "NTL	logNTL	NTLc	POP	GDP	AREA".split():
    data["COUNTRY_"+col] = gb[col].loc[data.index.droplevel("NAME")].values
data

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,NTL,logNTL,NTLc,POP,GDP,AREA,LVL,COUNTRY_NTL,COUNTRY_logNTL,COUNTRY_NTLc,COUNTRY_POP,COUNTRY_GDP,COUNTRY_AREA
GID_0,NAME,YEAR,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
KEN,Baringo,2013,104.649399,78.658463,299,537068,1.194562e+09,1.091575e+10,1,69484.320312,27487.429688,46146,39026542,1.609759e+11,5.908727e+11
KEN,Baringo,2014,114.508026,85.828583,341,556100,1.301381e+09,1.091575e+10,1,82171.367188,31197.585938,52532,40299190,1.697299e+11,5.908727e+11
KEN,Baringo,2015,159.978226,115.055786,391,576926,1.351257e+09,1.091575e+10,1,96989.476562,37019.664062,59162,41632570,1.782231e+11,5.908727e+11
KEN,Baringo,2016,161.592636,113.033165,434,598366,1.372291e+09,1.091575e+10,1,106868.015625,39489.902344,65015,43017424,1.854384e+11,5.908727e+11
KEN,Baringo,2017,369.986298,255.012360,605,620355,1.478240e+09,1.091575e+10,1,144498.890625,57079.613281,78647,44466512,1.929659e+11,5.908727e+11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
UGA,Zombo,2016,14.822044,11.515968,38,252900,,8.981807e+08,1,26065.292969,10785.073242,14487,36652700,8.281315e+10,2.433110e+11
UGA,Zombo,2017,21.887794,16.075193,33,260200,,8.981807e+08,1,30920.351562,13563.561523,16307,37838900,8.540636e+10,2.433110e+11
UGA,Zombo,2018,107.846771,34.405903,36,267800,,8.981807e+08,1,34749.324219,15180.665039,18100,39059000,9.079031e+10,2.433110e+11
UGA,Zombo,2019,16.802755,12.958255,32,275400,,8.981807e+08,1,39238.386719,16287.588867,19506,40308000,9.663607e+10,2.433110e+11


In [6]:
unbalanced_data = data.copy()

cols = ("NTL", "logNTL", "NTLc", "POP", "GDP", "AREA")
indexes = (
    ("TZA", "Songwe", 2013),
    ("TZA", "Songwe", 2014),
    ("TZA", "Katavi", 2013)
)
unbalanced_data.loc[("TZA", "Mbeya", 2013), cols] += data.loc[("TZA", "Songwe", 2013), cols].fillna(0)
unbalanced_data.loc[("TZA", "Mbeya", 2014), cols] += data.loc[("TZA", "Songwe", 2014), cols].fillna(0)
unbalanced_data.loc[("TZA", "Rukwa", 2013), cols] += data.loc[("TZA", "Katavi", 2013), cols].fillna(0)
unbalanced_data = unbalanced_data[~unbalanced_data.index.isin(indexes)]
gb = unbalanced_data.groupby(level=["GID_0", "YEAR"])["LVL"].sum()
unbalanced_data["REGIONS"] = gb.loc[unbalanced_data.index.droplevel("NAME")].values
unbalanced_data.to_parquet("unbalanced_data.parquet")

In [7]:
balanced_data = data.copy()
for year in years:
    cols = ("NTL", "logNTL", "NTLc", "POP", "GDP", "AREA")
    balanced_data.loc[("TZA", "Mbeya", year), cols] += data.loc[("TZA", "Songwe", year), cols].fillna(0)
    balanced_data.loc[("TZA", "Rukwa", year), cols] += data.loc[("TZA", "Katavi", year), cols].fillna(0)
balanced_data = balanced_data[~balanced_data.index.isin(["Songwe", "Katavi"], level="NAME")]
gb = balanced_data.groupby(level=["GID_0", "YEAR"])["LVL"].sum()
balanced_data["REGIONS"] = gb.loc[balanced_data.index.droplevel("NAME")].values
balanced_data.to_parquet("balanced_data.parquet")