In [15]:
import sys, os, importlib, json, multiprocessing
import rasterio, pycountry
import reverse_geocode

import geopandas as gpd
import pandas as pd

from urllib.request import urlopen
from shapely.geometry import Point
from shapely.ops import nearest_points
from shapely import wkt

#Import local functions
import ict_helper as ict

# Import GOST libraries; sys.path.append will be unnecessary if libraries are already installed
sys.path.append("../../../../gostrocks/src")
sys.path.append("../../../../GOST_Urban/src")

import GOSTRocks.rasterMisc as rMisc
from GOSTRocks.misc import tPrint
import GOST_Urban.UrbanRaster as urban
%matplotlib inline  

In [16]:
# Folder that receives the per-country "<ISO3>_ICT_distance.csv" result files
out_folder = "/home/wb411133/temp/ICT_distance"


In [17]:
# Read the masked enterprise-survey coordinates (Stata file) and report the raw size
wbes_coords = "/home/wb411133/data/Projects/ICT_Distance/Data/Masked ES GPS April 15-2021_year.dta"
inD = pd.read_stata(wbes_coords)
print(inD.shape)
# Drop records with broken coordinates: keep rows strictly inside the valid
# latitude (-90, 90) and longitude (-180, 180) ranges
lat_ok = (inD['lat_mask'] > -90) & (inD['lat_mask'] < 90)
lon_ok = (inD['lon_mask'] > -180) & (inD['lon_mask'] < 180)
inD = inD.loc[lat_ok & lon_ok]
print(inD.shape)
inD.head()

(121121, 9)
(121116, 9)


Unnamed: 0,idstd,year,fyear,d2_l1_year_perf_indicators,lat_mask,lon_mask,survey,WBCode,type
0,468901.0,2010.0,2009,2009,14.803293,49.687744,Yemen2010,YEM,Enterprise Survey
1,468902.0,2010.0,2009,2009,14.55662,49.129032,Yemen2010,YEM,Enterprise Survey
2,468903.0,2010.0,2009,2009,14.551517,49.141815,Yemen2010,YEM,Enterprise Survey
3,468904.0,2010.0,2009,2009,14.515405,49.125977,Yemen2010,YEM,Enterprise Survey
4,468905.0,2010.0,2009,2009,14.519185,49.119148,Yemen2010,YEM,Enterprise Survey


In [18]:
# Print every country code in the survey data that has no results file yet.
# The path is built from out_folder (same pattern used when results are written)
# instead of repeating the folder as a hard-coded string.
countries = inD['WBCode'].unique()
for c in countries:
    if not os.path.exists(os.path.join(out_folder, f'{c}_ICT_distance.csv')):
        print(c)

CHL
RUS
CHN
IDN
THA


In [19]:
# Build one point per survey record directly from the coordinate columns;
# zipping the two columns avoids a slow row-by-row iterrows() pass.
geoms = [Point(xy) for xy in zip(inD['lon_mask'], inD['lat_mask'])]
inD = gpd.GeoDataFrame(inD, geometry=geoms, crs='epsg:4326')
# Store the performance-indicator year as an integer
inD['d2_l1_year_perf_indicators'] = inD['d2_l1_year_perf_indicators'].astype(int)

In [20]:
# Global admin-0 boundaries, reprojected to WGS84 (epsg:4326)
global_bounds = "/home/public/Data/GLOBAL/ADMIN/Admin0_Polys.shp"
inG = gpd.read_file(global_bounds).to_crs('epsg:4326')
# Relabel ISO3 'KSV' as 'XKX' (presumably so the boundary code matches the
# WBCode used in the survey data - confirm)
is_ksv = inG['ISO3'] == 'KSV'
inG.loc[is_ksv, 'ISO3'] = 'XKX'

# Open and load infrastructure data

In [21]:
# Vector infrastructure layers
openCellID = "/home/public/Data/GLOBAL/INFRA/OPENCELLID/cell_towers_2020-04-15-T000000.csv"
ixp_file = '/home/public/Data/GLOBAL/INFRA/IXPS/ixps_global_geocoded.csv'
colocation_file = '/home/public/Data/GLOBAL/INFRA/DATA_CENTERS/Data_Centers.geojson'
submarine_cable_file = '/home/public/Data/GLOBAL/INFRA/SUBMARINE_CABLES/landing-point-geo.json'

# GSMA mobile coverage rasters: one file each for 2G, 3G and 4G
cell_coverage_folder = '/home/public/Data/GLOBAL/INFRA/GSMA/2019/MCE/Data_MCE/Global'
cell_files = [
    'MCE_Global2G_2020.tif',
    'MCE_Global3G_2020.tif',
    'MCE_Global4G_2020.tif',
]

In [22]:
# Load the OpenCellID tower table and convert its lon/lat columns to point geometries
inCell = pd.read_csv(openCellID)
cell_geoms = [Point(xy) for xy in zip(inCell['lon'], inCell['lat'])]
# Pass the CRS as an authority string: the {'init': 'epsg:4326'} dict form is
# deprecated (it triggers a FutureWarning) and the other layers in this notebook
# already use the string form.
inCell = gpd.GeoDataFrame(inCell, geometry=cell_geoms, crs='epsg:4326')

  return _prepare_from_string(" ".join(pjargs))


In [23]:
# Spatial index over the cell towers; queried later with country bounding boxes
cell_sindex = inCell.sindex

In [24]:
# Load the colocation / data centre layer (GeoJSON) as a GeoDataFrame
inCol = gpd.read_file(colocation_file)

In [27]:
# Submarine cables: download the landing points and the cable listing.
#     RFS attributes are pulled from the per-cable JSON records in a later step,
#     because the final dataset needs landing points with RFS dates.
cables_json_base = 'https://www.submarinecablemap.com/api/v3/cable/{cable_id}.json'
landing_points_url = 'https://www.submarinecablemap.com/api/v3/landing-point/landing-point-geo.json'
cables_url = 'https://www.submarinecablemap.com/api/v3/cable/cable-geo.json'
landing_points = gpd.read_file(landing_points_url)
cables = gpd.read_file(cables_url)

In [28]:
# Collect the landing points of every cable, each tagged with the cable's RFS year.
# Cables whose 'rfs' value is missing or does not start with a 4-digit year are
# skipped on purpose (best effort); only those specific failures are ignored,
# so unrelated errors and keyboard interrupts are no longer swallowed.
all_landing_points = []
for cur_id in cables['id']:
    # the context manager closes the HTTP response once the payload is read
    with urlopen(cables_json_base.format(cable_id = cur_id)) as response:
        d = json.loads(response.read().decode())
    l_points = d['landing_points']
    # the RFS year is the same for every landing point of a cable: parse it once
    try:
        rfs_year = int(d['rfs'][:4])
    except (KeyError, TypeError, ValueError):
        continue
    for item in l_points:
        item['RFS'] = rfs_year
        all_landing_points.append(item)

In [29]:
# Join the RFS-tagged landing point records to the landing point geometries by id
landing_point_rfs = pd.DataFrame(all_landing_points)
lp_data = pd.merge(landing_point_rfs, landing_points, on="id")
lp_data = gpd.GeoDataFrame(lp_data, geometry='geometry', crs='epsg:4326')

def try_country(x):
    """Return the alpha_3 code of the first fuzzy pycountry match for ``x``.

    Lookup is best effort: an empty string is returned when the search fails
    for any ordinary reason (no match, unusable input). Only ``Exception`` is
    caught, so a keyboard interrupt still stops a long-running ``apply``.
    """
    try:
        return pycountry.countries.search_fuzzy(x)[0].alpha_3
    except Exception:
        return ''

lp_data['ISO3'] = lp_data['country'].apply(try_country)
# Drop the name / is_tbd columns that came from the cable records (merge suffix _x),
# keep the ones from the landing point layer (suffix _y), and pick the final
# columns by name so the result does not depend on the merge's column order.
lp_data = (
    lp_data
    .drop(columns=['name_x', 'is_tbd_x'])
    .rename(columns={'name_y': 'name', 'is_tbd_y': 'is_tbd'})
)
lp_data = lp_data[['id', 'country', 'RFS', 'name', 'is_tbd', 'geometry', 'ISO3']]
inCables = lp_data

In [30]:
# Load the geocoded IXP table and convert its Lon/Lat columns to point geometries
inIXP = pd.read_csv(ixp_file, index_col=0)
ixp_geoms = [Point(xy) for xy in zip(inIXP['Lon'], inIXP['Lat'])]
# Pass the CRS as an authority string: the {'init': 'epsg:4326'} dict form is
# deprecated and the other layers in this notebook already use the string form.
inIXP = gpd.GeoDataFrame(inIXP, geometry=ixp_geoms, crs='epsg:4326')

In [31]:
# Open the three coverage rasters in the order they are listed in cell_files
gsma2g_R, gsma3g_R, gsma4g_R = [
    rasterio.open(os.path.join(cell_coverage_folder, cur_file))
    for cur_file in cell_files[:3]
]

# Calculate distances

In [32]:
# Build one argument list per country for ict.calculate_country.
broken_countries = ['CHL','RUS','CHN','IDN']
n_processes = 30  # not referenced in this cell
epsg = 'epsg:6933'
# exist_ok replaces the check-then-create pair and is a no-op if the folder exists
os.makedirs(out_folder, exist_ok=True)
args = []
for cur_country in ['THA']: #inD['WBCode'].unique():
    out_file = os.path.join(out_folder, "%s_ICT_distance.csv" % cur_country)
    tPrint(f"***** Processing {cur_country}")
    # Skip countries flagged as broken and countries that already have results
    if cur_country in broken_countries or os.path.exists(out_file):
        continue
    curD = inD.loc[inD['WBCode'] == cur_country]
    if curD.shape[0] == 0:
        continue
    # Boundary rows of the current country and their merged outline
    curB = inG.loc[inG['ISO3'] == cur_country]
    total_bound = curB.unary_union
    # Neighbours: every other country touching the boundary buffered by 0.01
    # (degrees, since inG is in epsg:4326)
    curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]
    curN = curN.loc[curN['ISO3'] != cur_country]
    # Vectorised intersects (same call already used for the neighbours) instead
    # of a per-row Python lambda
    selCol = inCol.loc[inCol.intersects(total_bound)]
    selIXP = inIXP.loc[inIXP.intersects(total_bound)]
    args.append([cur_country, curD, curB, curN, out_file, selCol, selIXP, inCables, inCell, epsg])


15:55:47	***** Processing THA



  curN = inG.loc[inG.intersects(curB.buffer(0.01).unary_union)]


In [None]:
# Show the queued countries: the country code is the first item of each argument list
for country_args in args:
    print(country_args[0])

In [None]:
# Reload the helper module so the current version of ict_helper is used
importlib.reload(ict)
# One worker per queued country, capped at 20. multiprocessing.Pool rejects a
# size of 0, so the pool is only created when there is something to process.
num_processes = min(len(args), 20)
if num_processes > 0:
    with multiprocessing.Pool(num_processes) as pool:
        res = pool.starmap(ict.calculate_country, args)
else:
    res = []

In [33]:
# Single-process run of every queued country, skipping any code listed in `broken`.
broken = []
for arg in args:
    if arg[0] not in broken:
        # pass the current country's arguments (previously always args[0], which
        # re-ran the first country on every iteration)
        xx = ict.calculate_country(*arg, debug=False)

15:55:50	Starting THA
KHM
MYS
MMR


# Manually fixing cell distance
The `cell_dist` column comes back empty when countries are processed through the multiprocessing pool, so it is recomputed below in a single process.

In [None]:
# List the result files in out_folder. Only CSV files are kept (os.listdir also
# returns sub-folders and other entries) and the list is sorted because
# os.listdir returns names in arbitrary order.
in_files = sorted(f for f in os.listdir(out_folder) if f.endswith('.csv'))
sel_file = in_files[0]
sel_file

In [None]:
# Recompute the cell_dist column of every results file and write the file back.
epsg = 'epsg:6933'
for sel_file in in_files:
    tPrint(sel_file)
    # The first three characters of the file name are the country's ISO3 code
    curB = inG.loc[inG['ISO3'] == sel_file[:3]]
    if curB.empty:
        # not a country results file (or no boundary for this code): leave it untouched
        tPrint(f"Skipping {sel_file}: no boundary found for {sel_file[:3]}")
        continue
    distD = pd.read_csv(os.path.join(out_folder, sel_file), index_col=0)
    # geometry is stored as WKT text in the CSV
    distD['geometry'] = distD['geometry'].apply(wkt.loads)
    distD = gpd.GeoDataFrame(distD, geometry='geometry', crs=epsg)

    # Coarse filter by bounding box through the spatial index. The index returns
    # integer positions, so rows are selected with iloc rather than by label.
    potential_matches = inCell.iloc[list(cell_sindex.intersection(curB.total_bounds))]
    # Exact filter against the country outline
    selCell = potential_matches.loc[potential_matches.intersects(curB.unary_union)]
    # Project the towers to the CRS of the results before measuring distance
    selCell = selCell.to_crs(epsg)
    distD['cell_dist'] = distD.distance(selCell.unary_union)
    pd.DataFrame(distD).to_csv(os.path.join(out_folder, sel_file))

    

# DEBUGGING

In [None]:
# Loop through results, determine where neighbour is self
# Only CSV files are kept (os.listdir also returns sub-folders and other entries)
# and the list is sorted because os.listdir returns names in arbitrary order.
in_files = sorted(f for f in os.listdir(out_folder) if f.endswith('.csv'))
sel_file = in_files[0]
sel_file

In [None]:
for sel_file in in_files