In [1]:
import sys, os, importlib, json, multiprocessing
import rasterio, pycountry
import reverse_geocode

import geopandas as gpd
import pandas as pd

from urllib.request import urlopen
from shapely.geometry import Point
from shapely.ops import nearest_points
from shapely import wkt

#Import local functions
import ict_helper as ict

# Import GOST libraries; sys.path.append will be unnecessary if libraries are already installed
sys.path.append("../../../../gostrocks/src")
sys.path.append("../../../../GOST_Urban/src")

import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint
import GOST_Urban.UrbanRaster as urban

%matplotlib inline  
%load_ext autoreload
%autoreload 2

In [2]:
# Countries used for pilot runs, and the directory the per-country result tables are written to
sel_countries = ["AZE", "KAZ", "KHM", "SLB", "UKR"]
out_folder = "/home/wb411133/temp/ICT_distance"

In [3]:
# Load the masked Enterprise Survey coordinates from the Stata export
wbes_coords = "/home/wb411133/data/Projects/ICT_Distance/Data/Masked ES GPS April 15-2021_year.dta"
inD = pd.read_stata(wbes_coords)
print(inD.shape)
# Drop records with broken coordinates: both values must lie strictly inside
# the valid latitude / longitude ranges
lat_ok = inD['lat_mask'].gt(-90) & inD['lat_mask'].lt(90)
lon_ok = inD['lon_mask'].gt(-180) & inD['lon_mask'].lt(180)
inD = inD.loc[lat_ok & lon_ok]
print(inD.shape)
inD.head()

(121121, 9)
(121116, 9)


Unnamed: 0,idstd,year,fyear,d2_l1_year_perf_indicators,lat_mask,lon_mask,survey,WBCode,type
0,468901.0,2010.0,2009,2009,14.803293,49.687744,Yemen2010,YEM,Enterprise Survey
1,468902.0,2010.0,2009,2009,14.55662,49.129032,Yemen2010,YEM,Enterprise Survey
2,468903.0,2010.0,2009,2009,14.551517,49.141815,Yemen2010,YEM,Enterprise Survey
3,468904.0,2010.0,2009,2009,14.515405,49.125977,Yemen2010,YEM,Enterprise Survey
4,468905.0,2010.0,2009,2009,14.519185,49.119148,Yemen2010,YEM,Enterprise Survey


In [4]:
# Turn the survey table into a GeoDataFrame of WGS84 points.
# points_from_xy builds all geometries in one vectorised call instead of
# iterating over the rows one by one with iterrows().
geoms = gpd.points_from_xy(inD['lon_mask'], inD['lat_mask'])
inD = gpd.GeoDataFrame(inD, geometry=geoms, crs='epsg:4326')
# Store the performance-indicator year as an integer
inD['d2_l1_year_perf_indicators'] = inD['d2_l1_year_perf_indicators'].astype(int)

In [5]:
# Read the global admin-0 polygons and make sure they are in WGS84
global_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
inG = gpd.read_file(global_bounds).to_crs('epsg:4326')
# The boundaries file labels Kosovo 'KSV'; relabel it to 'XKX'
inG['ISO3'] = inG['ISO3'].replace({'KSV': 'XKX'})

# Open and load infrastructure data

In [6]:
# Locations of the infrastructure datasets, all under one shared root folder
infra_root = '/home/public/Data/GLOBAL/INFRA'
openCellID = f"{infra_root}/OPENCELLID/cell_towers_2020-04-15-T000000.csv"
ixp_file = f'{infra_root}/IXPS/ixps_global_geocoded.csv'
colocation_file = f'{infra_root}/DATA_CENTERS/Data_Centers.geojson'
submarine_cable_file = f'{infra_root}/SUBMARINE_CABLES/landing-point-geo.json'
# GSMA mobile coverage rasters, one GeoTIFF per network generation
cell_coverage_folder = f'{infra_root}/GSMA/2019/MCE/Data_MCE/Global'
cell_files = [f'MCE_Global{generation}_2020.tif' for generation in ('2G', '3G', '4G')]

In [7]:
# Load the OpenCellID tower table and convert it to WGS84 points.
# The CRS is given as an authority string (as for the survey points) rather than
# the deprecated {'init': ...} dict, and the geometries are built in one
# vectorised call instead of a Python-level loop over every tower.
inCell = pd.read_csv(openCellID)
inD_geom = gpd.points_from_xy(inCell['lon'], inCell['lat'])
inCell = gpd.GeoDataFrame(inCell, geometry=inD_geom, crs='epsg:4326')

  return _prepare_from_string(" ".join(pjargs))


In [8]:
# Build the spatial index over the cell towers once; it is queried later with
# country bounding boxes when cell distances are recomputed
cell_sindex = inCell.sindex

In [9]:
# Read the data-centre (colocation) GeoJSON; filtered per country when the jobs are built
inCol = gpd.read_file(colocation_file)

In [10]:
# Submarine cables: read the landing points and the cable list from the
# TeleGeography repository. The per-cable JSON documents (URL template below)
# supply the RFS attribute, so the final dataset is landing points with RFS dates.
telegeography_api = 'https://raw.githubusercontent.com/telegeography/www.submarinecablemap.com/master/web/public/api/v3'
landing_points = gpd.read_file(f'{telegeography_api}/landing-point/landing-point-geo.json')
cables = gpd.read_file(f'{telegeography_api}/cable/cable-geo.json')
cables_json_base = f'{telegeography_api}/cable/{{cable_id}}.json'

In [11]:
# Collect every cable's landing points, each tagged with the cable's RFS year.
# One JSON document is downloaded per cable id.
all_landing_points = []
for cur_id in cables['id']:
    # The context manager closes the HTTP response even if reading or decoding fails
    with urlopen(cables_json_base.format(cable_id=cur_id)) as response:
        json_data = response.read().decode()
    d = json.loads(json_data)
    l_points = d['landing_points']
    try:
        # The year is taken from the first four characters of the 'rfs' field
        rfs_year = int(d['rfs'][:4])
    except (KeyError, TypeError, ValueError):
        # 'rfs' is missing, null, or does not start with a year: the cable's
        # landing points are skipped (same outcome as before), but unrelated
        # errors and KeyboardInterrupt are no longer swallowed
        continue
    for item in l_points:
        item['RFS'] = rfs_year
        all_landing_points.append(item)

In [12]:
# Tabulate the RFS-tagged landing points, join them to the landing-point
# geometries on their shared id, and wrap the result as a WGS84 GeoDataFrame
landing_point_rfs = pd.DataFrame(all_landing_points)
lp_data = gpd.GeoDataFrame(
    landing_point_rfs.merge(landing_points, on="id"),
    geometry='geometry',
    crs='epsg:4326',
)

# Manual ISO3 codes, consulted only when the pycountry fuzzy search fails.
# Built once at definition time instead of on every failed lookup.
_COUNTRY_OVERRIDES = {
    'South Korea': 'KOR',
    'Virgin Islands (U.S.)': 'VIR',
    'Congo, Dem. Rep.': 'COD',
    'Congo, Rep.': 'COG',
    'Cape Verde': 'CPV',
    # British Virgin Islands are VGB; this entry previously duplicated the U.S. code VIR
    'Virgin Islands (U.K.)': 'VGB',
}

def try_country(x):
    """Return the ISO 3166-1 alpha-3 code for a country label, or '' if unknown.

    The label is first resolved with pycountry's fuzzy search (first hit wins).
    If that lookup fails for any reason, the manual override table is used.

    Parameters
    ----------
    x : str
        Country name as it appears in the landing-point data.

    Returns
    -------
    str
        Three-letter ISO3 code, or an empty string when neither the fuzzy
        search nor the override table recognises the label.
    """
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except Exception:
        # Best effort by design: any lookup failure falls through to the
        # override table. `Exception` (rather than a bare except) keeps
        # KeyboardInterrupt / SystemExit propagating.
        return _COUNTRY_OVERRIDES.get(x, '')

# Tag every landing point with an ISO3 code derived from its country label
lp_data['ISO3'] = lp_data['country'].apply(try_country)
# Keep the name / is_tbd columns that came from the landing-point layer (merge
# suffix _y) and drop the per-cable duplicates (suffix _x). Renaming by label
# instead of overwriting `columns` positionally cannot mislabel columns if the
# merge order changes, and errors='ignore' lets the cell be re-run safely.
lp_data = (
    lp_data
    .drop(columns=['name_x', 'is_tbd_x'], errors='ignore')
    .rename(columns={'name_y': 'name', 'is_tbd_y': 'is_tbd'})
)
inCables = lp_data

In [13]:
# Load the geocoded internet exchange points and convert them to WGS84 points.
# The CRS is given as an authority string rather than the deprecated
# {'init': ...} dict, and the geometries are built in one vectorised call.
inIXP = pd.read_csv(ixp_file, index_col=0)
inD_geom = gpd.points_from_xy(inIXP['Lon'], inIXP['Lat'])
inIXP = gpd.GeoDataFrame(inIXP, geometry=inD_geom, crs='epsg:4326')

In [14]:
# Open the three GSMA coverage rasters in the order they are listed in cell_files
gsma2g_R, gsma3g_R, gsma4g_R = (
    rasterio.open(os.path.join(cell_coverage_folder, tif_name))
    for tif_name in cell_files[:3]
)

# Calculate distances

In [19]:
# Build one argument list per country for ict.calculate_country.
broken_countries = ['CHL','RUS','CHN','IDN']
n_processes = 30
epsg = 'epsg:6933'
os.makedirs(out_folder, exist_ok=True)
args = []
# NOTE(review): in the recorded run ZIM, ROM and SLO produced no jobs (8 of the
# 11 countries were queued) - confirm these codes match the values in inD['WBCode'].
for cur_country in ['LAO','THA','VNM','MDA','VEN','ZIM','ROM','BEL','SLO','HUN','POL']: #inD['WBCode'].unique(): #sel_countries: #
    out_file = os.path.join(out_folder, "%s_ICT_distance.csv" % cur_country)
    tPrint(f"***** Processing {cur_country}")
    if cur_country in broken_countries:  # and os.path.exists(out_file):
        continue

    curD = inD.loc[inD['WBCode'] == cur_country]
    if curD.shape[0] == 0:
        # Previously skipped without any message
        tPrint(f"      no survey records with WBCode {cur_country}; skipping")
        continue

    curB = inG.loc[inG['ISO3'] == cur_country]
    if curB.shape[0] == 0:
        # Without a boundary the neighbour and infrastructure selections are meaningless
        tPrint(f"      no admin boundary with ISO3 {cur_country}; skipping")
        continue
    total_bound = curB.unary_union

    # Neighbours: every other country intersecting the slightly buffered boundary.
    # The 0.01 buffer is in degrees because inG is in epsg:4326.
    others = inG.loc[inG['ISO3'] != cur_country]
    curN = others.loc[others.intersects(curB.buffer(0.01).unary_union)]
    if curN.shape[0] < 5:
        # Fewer than 5 intersecting neighbours: use the 5 nearest countries instead.
        # The country itself is excluded by ISO3 rather than by dropping the first
        # sorted row: touching neighbours are also at distance 0, so the first row
        # is not guaranteed to be the country itself. Distances are kept on a
        # local copy so inG is not left with a stale 'dist' column.
        dist_to_country = others['geometry'].distance(total_bound)
        curN = (
            others
            .assign(dist=dist_to_country)
            .sort_values("dist", kind="mergesort")
            .head(5)
        )

    # Data centres and IXPs that fall inside the country boundary
    selCol = inCol.loc[inCol.intersects(total_bound)]
    selIXP = inIXP.loc[inIXP.intersects(total_bound)]
    args.append([cur_country, curD, curB, curN, out_file, selCol, selIXP, inCables, inCell, epsg])


10:55:11	***** Processing LAO



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:55:11	***** Processing THA



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:55:45	***** Processing VNM



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:56:15	***** Processing MDA



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:56:21	***** Processing VEN



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:56:46	***** Processing ZIM
10:56:46	***** Processing ROM
10:56:46	***** Processing BEL



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:56:49	***** Processing SLO
10:56:49	***** Processing HUN



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


10:56:50	***** Processing POL



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


In [18]:
# run individual country
# importlib.reload(ict)
# res = ict.calculate_country(*args[0])

10:37:23	Starting RWA


Unnamed: 0,idstd,year,fyear,d2_l1_year_perf_indicators,lat_mask,lon_mask,survey,WBCode,type,geometry,...,ngh4_cbl_dist,ngh4_cbl,ngh5,ngh5_dist,ngh5_cbl_dist,ngh5_cbl,cell_dist,gsma2g,gsma3g,gsma4g
16081,518601.0,2011.0,2010,2010,-1.959720,30.120523,Rwanda2011,RWA,Enterprise Survey,POINT (2906217.267 -249960.503),...,2.916907e+06,2009,KEN,2.916905e+06,2.916907e+06,2009,79.089302,1,1,1
16082,518602.0,2011.0,2010,2010,-1.968369,30.057611,Rwanda2011,RWA,Enterprise Survey,POINT (2900147.124 -251063.283),...,2.910954e+06,2009,KEN,2.910952e+06,2.910954e+06,2009,42.142827,1,1,1
16083,518603.0,2011.0,2010,2010,-1.965382,30.056484,Rwanda2011,RWA,Enterprise Survey,POINT (2900038.360 -250682.391),...,2.910813e+06,2009,KEN,2.910811e+06,2.910813e+06,2009,95.166281,1,1,1
16084,518604.0,2011.0,2010,2010,-1.970558,30.089775,Rwanda2011,RWA,Enterprise Survey,POINT (2903250.472 -251342.294),...,2.914070e+06,2009,KEN,2.914068e+06,2.914070e+06,2009,77.817486,1,1,1
16085,518605.0,2011.0,2010,2010,-1.975110,30.081638,Rwanda2011,RWA,Enterprise Survey,POINT (2902465.387 -251922.674),...,2.913338e+06,2009,KEN,2.913336e+06,2.913338e+06,2009,37.182041,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112999,682496.0,2019.0,2018/2019,2018,1.921794,30.025911,Rwanda2019,RWA,Enterprise Survey,POINT (2897088.495 245124.895),...,2.907402e+06,2009,KEN,2.907398e+06,2.907401e+06,2012,409504.081541,1,3,3
113000,682497.0,2019.0,2018/2019,2018,2.596256,29.728979,Rwanda2019,RWA,Enterprise Survey,POINT (2868438.610 331102.586),...,2.887447e+06,2009,KEN,2.887443e+06,2.887446e+06,2012,497799.693736,3,3,3
113001,682498.0,2019.0,2018/2019,2018,1.713534,29.267258,Rwanda2019,RWA,Enterprise Survey,POINT (2823888.828 218569.485),...,2.832296e+06,2009,KEN,2.832293e+06,2.832296e+06,2012,394293.382303,3,3,3
113002,682499.0,2019.0,2018/2019,2018,1.935338,30.039743,Rwanda2019,RWA,Enterprise Survey,POINT (2898423.103 246851.791),...,2.908877e+06,2009,KEN,2.908874e+06,2.908877e+06,2012,411134.961815,1,3,3


In [20]:
# One worker per queued country, capped at 20. The floor of 1 keeps
# multiprocessing.Pool from raising when no jobs were queued.
num_processes = max(1, min(len(args), 20))
num_processes

8

In [21]:
# Re-import the local helper module before dispatching the work
importlib.reload(ict)
# Run the per-country jobs in a process pool. starmap unpacks each entry of
# `args` into the positional arguments of ict.calculate_country and returns
# the results in the same order as `args`.
with multiprocessing.Pool(num_processes) as pool:
    res = pool.starmap(ict.calculate_country, args)

  projstring = _prepare_from_string(projparams)


12:36:41	Starting LAO


  projstring = _prepare_from_string(projparams)


12:44:09	Starting THA


  projstring = _prepare_from_string(projparams)


12:51:42	Starting VNM


  projstring = _prepare_from_string(projparams)


12:59:23	Starting MDA


  projstring = _prepare_from_string(projparams)


13:07:27	Starting VEN


  projstring = _prepare_from_string(projparams)


13:15:14	Starting BEL


  projstring = _prepare_from_string(projparams)


13:23:10	Starting HUN


  projstring = _prepare_from_string(projparams)


13:31:42	Starting POL


# Manually fixing cell distance
The `cell_dist` column comes back empty from the multiprocessing run, so it is recomputed here in a single process.

In [25]:
# List the per-country result tables. Sorting makes the order (and therefore
# in_files[0]) deterministic, and the suffix filter keeps stray files in the
# output folder out of the processing loop.
in_files = sorted(f for f in os.listdir(out_folder) if f.endswith("_ICT_distance.csv"))
sel_file = in_files[0] if in_files else None
sel_file

'AZE_ICT_distance.csv'

In [None]:
# Recompute cell_dist for the listed countries and overwrite their result files.
epsg = 'epsg:6933'
fix_countries = ['LAO','THA','VNM','MDA','VEN','ZIM','ROM','BEL','SLO','HUN','POL']
for sel_file in in_files:
    iso3 = sel_file.split("_")[0]
    if iso3 not in fix_countries:
        continue
    tPrint(iso3)
    # Use the parsed code rather than sel_file[:3] so both lookups always agree
    curB = inG.loc[inG['ISO3'] == iso3]
    if curB.shape[0] == 0:
        # No boundary means no valid bounding box to query the index with;
        # leave the file untouched instead of overwriting cell_dist
        tPrint(f"      no admin boundary with ISO3 {iso3}; skipping")
        continue

    # The stored geometry column is WKT text in the projected CRS
    distD = pd.read_csv(os.path.join(out_folder, sel_file), index_col=0)
    distD['geometry'] = distD['geometry'].apply(wkt.loads)
    distD = gpd.GeoDataFrame(distD, geometry='geometry', crs=epsg)

    # The spatial index returns row positions, so they must be used with iloc,
    # not with label-based loc. Coarse bounding-box hits are then refined with
    # an exact intersection test.
    potential_matches = inCell.iloc[list(cell_sindex.intersection(curB.total_bounds))]
    selCell = potential_matches.loc[potential_matches.intersects(curB.unary_union)]
    selCell = selCell.to_crs(epsg)
    # Distance from every survey point to the union of the selected towers
    distD['cell_dist'] = distD.distance(selCell.unary_union)
    pd.DataFrame(distD).to_csv(os.path.join(out_folder, sel_file))

    

13:11:57	VEN
13:17:57	POL


# DEBUGGING

In [None]:
# (leftover interactive help lookup for pd.read_csv removed; it produced no result used by the notebook)

In [None]:
# Check out how many countries don't have 5 neighbours
# NOTE(review): only files with exactly one neighbour column are reported below,
# which is narrower than "don't have 5 neighbours" - confirm which is intended.
for file in os.listdir(out_folder):
    name = file.split("_")[0]
    tempD = pd.read_csv(os.path.join(out_folder, file), index_col=0)
    # Neighbour code columns contain "ngh" and no underscore (ngh5, not ngh5_dist)
    neigh_cols = [col for col in tempD.columns if "ngh" in col and "_" not in col]
    n_neigh = len(neigh_cols)
    # Neighbour codes are read from the first row of the table
    first_row = tempD.index[0]
    neighbours = [tempD.loc[first_row, col] for col in neigh_cols]
    if n_neigh != 1:
        continue
    print(f'{name}: {n_neigh}')
    

In [None]:
neighbours

In [None]:
# Print the first row's value for each neighbour column found by the check above.
# NOTE(review): relies on neigh_cols left over from the previous loop, and inD as
# built in this notebook shows no ngh* columns - presumably inD was reassigned
# in a cell that is no longer present; confirm before re-running.
for n in neigh_cols:
    print(inD.loc[inD.index[0],n])

In [None]:
# Largest value in the ngh1_dist column.
# NOTE(review): inD as built in this notebook has no ngh1_dist column - presumably
# this expects a result table loaded into inD elsewhere; confirm.
inD['ngh1_dist'].max()