## Imports and Setup

In [1]:
import os
import pandas as pd
import geopandas as gpd
import logging
import joblib
import torch

import sys
sys.path.insert(0, "../src")
import sat_download

sys.path.insert(0, "../utils/")
import data_utils
import config_utils
import pred_utils
import embed_utils
import model_utils

import matplotlib.pyplot as plt
from PIL import Image

%load_ext autoreload
%autoreload 2

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Load Config Files

In [2]:
iso_code = "MOZ"
cwd = os.path.dirname(os.getcwd())
data_config_file = os.path.join(cwd, "configs/data_config.yaml")
data_config = config_utils.load_config(data_config_file)

## Load Geoboundary

In [3]:
geoboundary = data_utils._get_geoboundaries(data_config, iso_code, adm_level="ADM2")
shapenames = geoboundary.shapeName.unique()
geoboundary = geoboundary[geoboundary.shapeName.isin(shapenames)]
print(f"Data dimensions: {geoboundary.shape}")
geoboundary.head(3)

Data dimensions: (159, 6)


Unnamed: 0,shapeName,shapeISO,shapeID,shapeGroup,shapeType,geometry
0,Ancuabe,,81135715B82033504020472,MOZ,ADM2,"POLYGON ((40.22659 -13.12288, 40.22642 -13.121..."
1,Balama,,81135715B71063384195178,MOZ,ADM2,"POLYGON ((38.55914 -14.14872, 38.55881 -14.148..."
2,Chiure,,81135715B78374489534152,MOZ,ADM2,"POLYGON ((39.39557 -13.20670, 39.39445 -13.206..."


## Load Predictions

In [None]:
model_config_file = os.path.join(cwd, f"configs/cnn_configs/convnext_base.yaml")
model_config = config_utils.load_config(model_config_file)
model_config["iso_codes"] = [iso_code]
out_dir = os.path.join(cwd, "output", iso_code, "results")

data = []
for shapename in shapenames:
    filename = f"{iso_code}_{shapename}_{model_config['model']}_cam.gpkg"
    subdata = gpd.read_file(os.path.join(out_dir, filename))
    data.append(subdata)

data = gpd.GeoDataFrame(pd.concat(data), geometry="geometry", crs="EPSG:3857")
data = data.to_crs("EPSG:4326").drop_duplicates("geometry")
data = data[data["sum"] > 5].reset_index()

out_dir = os.path.join(cwd, "output", iso_code)
data = data.rename(columns={'UID': 'PUID'})
data.to_file(os.path.join(out_dir, f"{iso_code}_results.geojson"), driver="GeoJSON")
print(f"Data dimensions: {data.shape}")
data.head(3)

## Load Master Dataset

In [7]:
name = "unicef_clean"
master_dir = os.path.join(cwd, data_config["vectors_dir"])
master = gpd.read_file(os.path.join(master_dir, f"school/{name}/{iso_code}_{name}.geojson"))
master = gpd.sjoin(master, geoboundary, predicate='within', how='left')
master = master.drop(columns=["index_right"])
master = master.rename(columns={'UID': 'MUID'})
print(f"Data dimensions: {master.shape}")
master.head(3)

Data dimensions: (13020, 15)


Unnamed: 0,MUID,source,iso,country,region,subregion,name,giga_id_school,unicef_clean,geometry,shapeName,shapeISO,shapeID,shapeGroup,shapeType
0,UNICEF-MOZ-SCHOOL-00000000,UNICEF,MOZ,Mozambique,Africa,Sub-Saharan Africa,Escola Primário do 1º Grau 4 de Outubro,0,0,POINT (33.23836 -19.21253),Vanduzi,,81135715B46655086955140,MOZ,ADM2
1,UNICEF-MOZ-SCHOOL-00000001,UNICEF,MOZ,Mozambique,Africa,Sub-Saharan Africa,Escola Primária de Naicuainha,1,0,POINT (35.20014 -13.47617),Chimbonila,,81135715B9835841983293,MOZ,ADM2
2,UNICEF-MOZ-SCHOOL-00000002,UNICEF,MOZ,Mozambique,Africa,Sub-Saharan Africa,Escola Primária de Lipapa,2,4,POINT (35.54036 -13.43500),Chimbonila,,81135715B9835841983293,MOZ,ADM2


## Get Intersection between Master and Predictions

In [15]:
intersect = gpd.sjoin(master, data, predicate="intersects", how="inner")
condition = (master["giga_id_school"].isin(intersect["giga_id_school"])) & (master["giga_code"] == 0)
master.loc[condition, "giga_code"] = 3
master = master.drop_duplicates("MUID")
print(master.giga_code.value_counts())
master.head(3)

giga_code
3    5907
0    3432
1     480
2     246
Name: count, dtype: int64


Unnamed: 0,MUID,source,iso,country,region,subregion,name,giga_id_school,clean,geometry,shapeName,shapeISO,shapeID,shapeGroup,shapeType,giga_code
0,UNICEF-SEN-SCHOOL-00000000,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE ALIEU SAMB,7614c6c6-9aca-36ff-978b-22bfca59916a,0,POINT (-17.51261 14.74680),Dakar,,32543966B20608950555277,SEN,ADM2,0
1,UNICEF-SEN-SCHOOL-00000001,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE EL HADJI BAYTIR SAMB,350fb172-bfef-331d-bd55-5c51139b9f48,0,POINT (-17.51196 14.74951),Dakar,,32543966B20608950555277,SEN,ADM2,0
2,UNICEF-SEN-SCHOOL-00000002,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE CASTORS ASECNA,b1a38b2d-0a27-33bf-8155-2ce84433d07a,0,POINT (-17.48871 14.73014),Dakar,,32543966B20608950555277,SEN,ADM2,3


In [16]:
temp = data_utils._convert_crs(master.copy(), target_crs="EPSG:3857")
temp["geometry"] = temp["geometry"].buffer(300, cap_style=3)
temp = temp.to_crs("EPSG:4326")
intersect = gpd.sjoin(temp, data, predicate="intersects")
condition = ((master["giga_id_school"].isin(intersect["giga_id_school"])) & (master["giga_code"] == 0))
master.loc[condition, "giga_code"] = 4
master = master.drop_duplicates("MUID")
print(master.giga_code.value_counts())
master.head(3)

giga_code
3    5907
4    2112
0    1320
1     480
2     246
Name: count, dtype: int64


Unnamed: 0,MUID,source,iso,country,region,subregion,name,giga_id_school,clean,geometry,shapeName,shapeISO,shapeID,shapeGroup,shapeType,giga_code
0,UNICEF-SEN-SCHOOL-00000000,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE ALIEU SAMB,7614c6c6-9aca-36ff-978b-22bfca59916a,0,POINT (-17.51261 14.74680),Dakar,,32543966B20608950555277,SEN,ADM2,0
1,UNICEF-SEN-SCHOOL-00000001,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE EL HADJI BAYTIR SAMB,350fb172-bfef-331d-bd55-5c51139b9f48,0,POINT (-17.51196 14.74951),Dakar,,32543966B20608950555277,SEN,ADM2,0
2,UNICEF-SEN-SCHOOL-00000002,UNICEF,SEN,Senegal,Africa,Sub-Saharan Africa,EE CASTORS ASECNA,b1a38b2d-0a27-33bf-8155-2ce84433d07a,0,POINT (-17.48871 14.73014),Dakar,,32543966B20608950555277,SEN,ADM2,3


In [12]:
temp1 = data_utils._convert_crs(master.copy(), target_crs="EPSG:3857")
temp2 = data.to_crs("EPSG:3857")
dist = temp1.geometry.apply(lambda x: temp2.distance(x).sort_values())
dist

Unnamed: 0,85,69,106,136,41,6,180,185,10,137,...,4405,4460,4446,4466,4461,4422,4436,4425,4412,4380
0,388.767756,1947.350012,2151.985126,2221.688564,2322.154187,2831.503424,3098.461614,3117.184090,3190.559605,3292.156218,...,713401.750325,713880.342262,715240.086927,715344.986538,716496.146182,716672.378234,719451.123291,721785.295753,731499.357551,732273.432138
1,659.691609,1874.379753,2065.330586,2365.612858,2217.109194,3015.114950,3259.052378,3226.126652,3327.646573,3499.996258,...,713423.764478,713925.719163,715276.269936,715378.821794,716525.724279,716701.860180,719487.681995,721830.933049,731544.109652,732318.512044
2,3370.260786,2100.952449,2203.298465,873.956069,2330.860331,549.458417,201.634630,221.104101,0.000000,810.612899,...,710298.680558,710707.592166,712092.789547,712204.599716,713368.712745,713545.243458,716302.730834,718611.854927,728328.239415,729101.447136
3,3946.312040,3283.146083,3416.941036,1796.231723,3562.511932,1063.529700,1012.418978,1483.476553,1171.019589,524.527694,...,710144.551516,710461.515149,711882.847314,712003.894748,713184.762610,713361.671221,716091.293248,718364.724422,728084.548673,728856.465975
4,7225.837126,4793.222588,4566.441222,5568.848952,4382.814347,5818.822565,5538.394935,4986.032021,5262.603736,6134.416074,...,707502.254993,708254.818280,709502.734056,709579.914565,710681.660903,710856.787948,713718.183680,716162.904822,725865.830047,726643.963991
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10060,458170.465877,456228.889402,456124.682908,455669.409006,456058.587448,455180.551863,454854.265308,454749.521527,454711.084709,454873.641331,...,273165.656555,261713.593021,267018.736221,268284.607947,271661.481438,271885.447962,270954.074001,269411.746938,279326.026265,279968.838275
10061,438109.957097,436196.357993,436096.318866,435609.918269,436034.028436,435110.960337,434787.771563,434693.431595,434649.085963,434795.361109,...,294826.489477,283286.226538,288714.275497,289992.971668,293374.572776,293598.416220,292643.657359,290966.589813,300884.401990,301519.294709
10062,108733.766876,106361.838529,106178.251158,106434.666020,106037.430814,106214.374713,105849.812897,105524.735836,105620.314681,106130.961963,...,608641.071330,611668.130002,611943.481454,611788.460134,612484.493007,612650.545242,616191.529334,619593.593196,629185.104488,629998.208267
10063,296484.011648,295687.421127,295723.501388,294419.024125,295780.656381,293708.895553,293528.629830,293751.295246,293543.095107,293192.484412,...,537318.638095,518394.167230,527188.470726,529227.427017,533836.217513,534080.076329,530798.472953,525563.733378,535369.272958,535829.695941


In [18]:
temp = data_utils._convert_crs(master.copy(), target_crs="EPSG:3857")
temp["geometry"] = temp["geometry"].buffer(100, cap_style=3)
temp = temp.to_crs("EPSG:4326")

intersect1 = gpd.sjoin(master, data, predicate="intersects", how="inner")
intersect2 = gpd.sjoin(temp, data, predicate="intersects", how="inner")
exclude = data[(data["PUID"].isin(intersect1["PUID"])) | (data["PUID"].isin(intersect2["PUID"]))]
result = data[~data["PUID"].isin(exclude["PUID"])]
result["geometry"] = result["geometry"].centroid
result


  result["geometry"] = result["geometry"].centroid


Unnamed: 0,index,prob,PUID,sum,group,geometry
7,7,0.983508,6746,114.0,160,POINT (-17.44199 14.75123)
9,9,0.983480,5099,117.0,52,POINT (-17.46393 14.71046)
26,26,0.979965,6716,52.0,157,POINT (-17.44199 14.71176)
42,42,0.975091,7137,60.0,178,POINT (-17.43660 14.73429)
43,43,0.975086,5585,128.0,77,POINT (-17.45643 14.68569)
...,...,...,...,...,...,...
12490,178,0.544261,42201,59.0,50,POINT (-16.28686 12.57904)
12492,180,0.534193,54393,21.0,132,POINT (-16.21448 12.56363)
12494,183,0.513865,48083,36.0,112,POINT (-16.25183 12.58730)
12495,184,0.509563,43514,17.0,63,POINT (-16.27878 12.52323)


In [19]:
result.to_file("SEN_results_centroid.geojson", driver="GeoJSON")

In [56]:
master.to_file("SEN_master.geojson", driver="GeoJSON")