# Fiber Data Validation

We are going to use three datasets in this notebook to validate the fiber node locations:
1. School entity data from Project Connect APIs
2. School to fiber node distances
3. Manually generated fiber locations

Let's first compare the two school datasets to one another.

In [None]:
import pandas as pd

# Load the Project Connect school entity table from the sample workspace.
schools = pd.read_csv('sample_workspace/rwanda/schools.csv')
# Bare expression on the last line so the notebook renders the frame.
schools

In [None]:
import pandas as pd

# Read the school-to-fiber-node distance table, keep only the id and distance
# columns, and rename the source id to `school_id` so it lines up with the
# `school_id` column of the schools table.
df = pd.read_csv('parameter_workspace/rwanda_fiber_distances.csv')
dists = (
    df.loc[:, ['giga_id_school', 'source_id_school', 'fiber_node_distance']]
    .rename(columns={'source_id_school': 'school_id'})
)
dists

In [None]:
# Unique, non-null school ids present in each data set.
dists_schools = set(dists['school_id'].dropna())
connect_schools = set(schools['school_id'].dropna())

# Report the size of each id set, how many ids are missing from the *other* set,
# and the overlap that can actually be used for validation.
print(f'Project connect schools {len(connect_schools)}, not in distance data set {len(connect_schools.difference(dists_schools))}')
# The second count is ids found in the distance data but absent from Project Connect,
# so the label names the Project Connect data set (it previously repeated the label above).
print(f'Fiber distance schools {len(dists_schools)}, not in Project Connect data set {len(dists_schools.difference(connect_schools))}')
print(f'Total schools in validation: {len(dists_schools.intersection(connect_schools))}')

## Distance Comparison

Let's take a look at the differences between the estimated fiber node distances and the true values.

In [None]:
# Join the schools to their reference fiber distances.

# Rows without a school_id cannot be matched, so drop them from both tables first.
schools = schools.dropna(subset=['school_id'])
dists = dists.dropna(subset=['school_id'])

# The inner join keeps only schools present in both tables; the joined distance is
# renamed so it can be told apart from the estimate computed later.
# NOTE(review): a later cell reads `giga_id_school_x`, so `giga_id_school` presumably
# exists in both frames and picks up pandas' default _x/_y suffixes here — confirm.
schools = (
    schools
    .merge(dists, how='inner', on='school_id')
    .rename(columns={'fiber_node_distance': 'true_fiber_node_distance'})
)
schools

In [None]:
import geopandas as gpd 
import fiona 
import numpy as np
import pandas as pd 


def to_lon(row):
    """Return the first x value of the coordinates of ``row['geometry']``.

    ``row`` is indexed by the key ``'geometry'``; the geometry must expose
    ``coords.xy`` as an ``(x_values, y_values)`` pair.
    """
    geometry = row['geometry']
    x_values = geometry.coords.xy[0]
    return x_values[0]
    
def to_lat(row):
    """Return the first y value of the coordinates of ``row['geometry']``.

    ``row`` is indexed by the key ``'geometry'``; the geometry must expose
    ``coords.xy`` as an ``(x_values, y_values)`` pair.
    """
    geometry = row['geometry']
    y_values = geometry.coords.xy[1]
    return y_values[0]


# KML support is disabled by default in fiona; enable both spellings of the driver name.
fiona.drvsupport.supported_drivers.update({'kml': 'rw', 'KML': 'rw'})

# Read the fiber node KML and use the `Name` column as the coordinate id.
fiber = gpd.read_file('parameter_workspace/rwanda.kml', driver='KML').rename(
    columns={'Name': 'coordinate_id'}
)

# Pull plain lat/lon columns out of each row's geometry.
fiber['lat'] = fiber.apply(to_lat, axis=1)
fiber['lon'] = fiber.apply(to_lon, axis=1)

# Convert to a plain DataFrame, drop the columns that are no longer needed, and write
# the table to the CSV that the next cell loads.
fiber = pd.DataFrame(fiber).drop(columns=['Description', 'geometry'])
fiber.to_csv('sample_workspace/rwanda/fiber.csv')

In [None]:
import pandas as pd

from giga.models.nodes.graph.greedy_distance_connector import GreedyDistanceConnector
from giga.schemas.school import GigaSchoolTable
from giga.schemas.geo import UniqueCoordinateTable

# Load the fiber node coordinates (the CSV written by the previous cell) and the school table.
fiber_coordinates = UniqueCoordinateTable.from_csv('sample_workspace/rwanda/fiber.csv')
school_coords = GigaSchoolTable.from_csv('sample_workspace/rwanda/schools.csv')

# Build the connector over the fiber coordinates, with dynamic connection turned off
# and a progress bar turned on. The variable name `conection_model` (sic) is what
# later cells reference.
conection_model = GreedyDistanceConnector(fiber_coordinates.coordinates, dynamic_connect=False, progress_bar=True)

In [None]:
# Run the connection model over the school coordinates.
# NOTE(review): this rebinds `dists`, which earlier cells use for the fiber-distance
# DataFrame. After this cell `dists` holds the model output instead, so the earlier
# filter/merge cells cannot be re-run without reloading the distance table first.
dists = conection_model.run(school_coords.to_coordinates())

In [None]:
# Index the model output by the first id of each pair. Distances are divided by 1000
# (presumably metres -> km, matching the km axis labels in the plots — confirm).
dist_lookup = dict((pair.pair_ids[0], pair.distance / 1000.0) for pair in dists)

# Look up every merged school by its giga id; an id missing from the lookup raises KeyError.
manual_distances = [dist_lookup[giga_id] for giga_id in schools['giga_id_school_x']]
schools['estimated_fiber_node_distance'] = manual_distances

In [None]:
import matplotlib.pyplot as plt

# Overlay the reference and estimated fiber-node distance distributions on one axes.
# The axes are created explicitly and handed to both `hist` calls: DataFrame.hist opens
# its own figure when `ax` is None, so a bare `plt.figure()` beforehand only leaves an
# empty extra figure in the cell output.
fig, ax = plt.subplots()
schools.hist('true_fiber_node_distance', bins=50, ax=ax, label="True")
schools.hist('estimated_fiber_node_distance', bins=50, ax=ax, alpha=0.75, color='#f5de07', label="Estimated")

# DataFrame.hist titles the axes with the column name; replace it with a figure-level title.
ax.set_xlabel("Distance (km)")
ax.set_title("Fiber Node Distance Comparison")
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Absolute difference between the reference distance and the estimate, one value per school.
delta = list(
    (schools['true_fiber_node_distance'] - schools['estimated_fiber_node_distance']).abs()
)
schools['fiber_distance_delta'] = delta

# Distribution of the absolute difference.
plt.hist(delta, bins=50)
plt.xlabel("Delta in Distance Estimate (km)")
plt.grid()
plt.show()

In [None]:
from ipywidgets import interactive
import matplotlib.pyplot as plt
import folium
from folium.plugins import Draw
import numpy as np
import branca.colormap as cm


# Base map with a fixed centre and zoom (presumably centred on Rwanda — confirm).
m = folium.Map(location=[-1.9, 30.1], zoom_start=8, tiles="cartodbpositron")

# Green -> yellow -> red colour scale spanning the observed range of distance deltas.
linear = cm.LinearColormap(
    ["green", "yellow", "red"],
    vmin=schools['fiber_distance_delta'].min(),
    vmax=schools['fiber_distance_delta'].max(),
)

# One marker per school, coloured by its distance delta.
for lat, lon, distance_delta in zip(schools['lat'], schools['lon'], schools['fiber_distance_delta']):
    folium.CircleMarker(
        location=[lat, lon],
        popup=f"Distance delta {distance_delta}",
        color=linear(distance_delta),
        fill=True,
        radius=2,
    ).add_to(m)

# One black marker per fiber node, labelled with its coordinate id.
for node in fiber_coordinates.coordinates:
    folium.CircleMarker(
        location=node.coordinate,
        popup=f"{node.coordinate_id}",
        color="black",
        fill=True,
        radius=2,
    ).add_to(m)

# Drawing toolbar with export enabled, using "fiber.geojson" as the export filename.
Draw(
    export=True,
    filename="fiber.geojson",
    position="topleft",
).add_to(m)
m


In [None]:
import numpy as np
import pandas as pd

# Per admin-1 region: the signed differences (true - estimated) for each school, and their mean.
by_admin = []
grouped = {}

for admin_name, admin_schools in schools.groupby('admin_1_name'):
    signed_delta = admin_schools['true_fiber_node_distance'] - admin_schools['estimated_fiber_node_distance']
    # Keep the raw per-school values for the interactive histogram further down.
    grouped[admin_name] = list(signed_delta)
    by_admin.append({"admin_1": admin_name, "distance_average": np.mean(signed_delta)})

pd.DataFrame(by_admin)

In [None]:
from ipywidgets import interactive, interact

def interactive_admin(grouped):
    """Build an interactive histogram selector over per-admin delta values.

    Parameters
    ----------
    grouped : mapping
        Maps an admin name to the sequence of values to plot for it.

    Returns
    -------
    The object returned by ``interactive``, offering one choice per key of
    ``grouped`` through the ``admin`` argument.
    """
    admin_choices = list(grouped)

    def render(admin):
        # Fix the x range before drawing so every admin is shown on the same scale.
        plt.xlim([-30, 30])
        plt.hist(grouped[admin], bins=50)
        plt.xlabel("Delta in Distance Estimate (km)")
        plt.title(admin)
        plt.grid()

    return interactive(render, admin=admin_choices)

In [None]:
# Display the per-admin histogram widget built from the `grouped` differences computed above.
interactive_admin(grouped)