### Import packages

In [None]:
import pandas as pd

# for creating extent from centroid
from utils import create_extent_from_centroid, produce_geojson

# for adding model prediction results to csv file
from utils import add_yolov5_conf_scores, add_efficientnet_conf_scores

# for nearest neighbour merge
import geopandas as gpd
import numpy as np

### Params

In [None]:
# --- input: government school list -----------------------------------------
csv_dir = "data/BASIC SCHOOL DATA + GPS.csv"

# coordinate column names and CRS of the original csv file
original_lat_col_name, original_lon_col_name = "lat", "lon"
original_crs = "EPSG:4326"


# --- grid size for downloading school tiles (validation with ML models) ----
tile_width = tile_height = 256
spatial_resolution = 0.6  # in meters


# --- image downloading -----------------------------------------------------


# --- adding model prediction results to DataFrame --------------------------
model_1_name, model_2_name = "efficientnet", "yolov5"  # classification / object detection

# folders holding each model's prediction files
model_1_pred_folder = "data/" + model_1_name + "_predictions"  # "data/efficientnet_predictions"
model_2_pred_folder = "data/" + model_2_name + "_predictions"  # "data/yolov5_predictions"

# DataFrame columns that receive each model's confidence score
model_1_conf_col = "conf_" + model_1_name  # "conf_efficientnet"
model_2_conf_col = "conf_" + model_2_name  # "conf_yolov5"


# --- nearest neighbour merge -----------------------------------------------
pred_gjson_dir = "data/sudan_predictions_centroid_validated_conf_5.geojson"
pred_data_src_crs = "EPSG:3857"  # crs of the location points in the model prediction file
max_distance = 300               # nearest neighbour search radius
drop_threshold = 0.5             # gov points with both model scores below this are dropped

# values written to the "location_type" column
loc_type_gov_dropped, loc_type_gov, loc_type_ml = 0, 1, 2

# values written to the "source" column
src_gov_id = 1  # every point coming from the government csv file
src_ml_id = 2   # every ML prediction point

In [None]:
# Load the government csv and record each row's index in the original file
# in a new "original_index" column.
df = pd.read_csv(csv_dir).assign(original_index=lambda frame: frame.index)

## 1. Drop NAN rows

In [None]:
"""
Assumes lat / lon columns contain NaN values
"""

In [None]:
# True for every row where lat or lon (or both) is missing
missing_coord_mask = df[[original_lat_col_name, original_lon_col_name]].isna().any(axis=1)

# rows with a missing coordinate are set aside in their own DataFrame ...
df_nan_latlon = df[missing_coord_mask].copy()

# ... and the working DataFrame keeps only rows with both lat and lon present
df = df[~missing_coord_mask]

In [None]:
"""  save the file (optional)  """

# df.to_csv("latlon_nan_rows_dropped.csv", index = False)
# df_nan_latlon.to_csv("nan_latlon_rows.csv", index = False)

## 2. Drop duplicates

In [None]:
"""
Assumes there are duplicated lat / lon values in csv file.
Drop duplicates keep only the first row.
Cause: 
    - ground team could not locate the exact location of school but is sure there is a school nearby
    - same location point could occur multiple times with different school attributes
"""

In [None]:
# flag every repeat of a (lat, lon) pair; the first occurrence is not flagged
coord_cols = [original_lat_col_name, original_lon_col_name]
is_repeated_location = df.duplicated(subset=coord_cols, keep="first")

# first occurrence of each location
df_without_duplicates = df[~is_repeated_location].copy()

print(f"After dropping duplicates: {len(df_without_duplicates)} rows")

# the rows that were removed as duplicates, kept separately for inspection
df_duplicated_rows = df[is_repeated_location].copy()

df = df_without_duplicates

In [None]:
"""  Save file (Optional)  """

# # cleaned csv file (optional)
# df.to_csv("cleaned.csv", index = False)

# # a separate csv file with only dropped duplicated rows (optional)
# df_duplicated_rows.to_csv("duplicated_rows.csv", index = False)

## 3. Create extent from centroid

In [None]:
# centroid coordinates: x = lon, y = lat
lon_col_values = df[original_lon_col_name]
lat_col_values = df[original_lat_col_name]

# extent of the grid around each centroid
extent_kwargs = {
    "src_crs": original_crs,
    "x": lon_col_values,
    "y": lat_col_values,
    "grid_width": tile_width,
    "grid_height": tile_height,
    "spatial_resolution": spatial_resolution,
}
top, left, bottom, right = create_extent_from_centroid(**extent_kwargs)

# store the four extent values as columns of the DataFrame
for corner_name, corner_values in zip(("top", "left", "bottom", "right"),
                                      (top, left, bottom, right)):
    df[corner_name] = corner_values

# 1-based image ids, used for downloading and validation by ML models
df['image_id'] = range(1, len(df) + 1)

In [None]:
"""  Save csv file (optional)  """

# df.to_csv("with_grid_extent.csv", index = False)

### Optional - generate geojson file

In [None]:
# produce_geojson(df = df, 
#                 crs = original_crs, 
#                 columns =  ["image_id", "MOEcode", "location", "lat", "lon", "top", "left", "bottom", "right"], 
#                 save_dir = f"school_locations_with_{tile_width}_grids_1.geojson")

## 4. Data downloading

In [None]:
"""
Done in a separate notebook.
"""

## 5. Model training & prediction

In [None]:
"""
Model training is done separately.
Currently, we use two models:
    1. EfficientNet (classification model)
    2. YOLOv5 (object detection model)
"""

## 6. Add model prediction probability scores to DataFrame
- model probability scores on images downloaded using government location data

In [None]:
# start both model confidence columns as NaN
for conf_col in (model_1_conf_col, model_2_conf_col):
    df[conf_col] = np.nan

#####  add model probability scores to DataFrame  #####

# model 1 (EfficientNet)
df = add_efficientnet_conf_scores(
    original_df = df,
    predictions_folder = model_1_pred_folder,
    school_class_id = 1,
    school_conf_col = model_1_conf_col,
    image_id_col = "image_id",
)

# model 2 (YOLOv5)
df = add_yolov5_conf_scores(
    original_df = df,
    predictions_folder = model_2_pred_folder,
    school_conf_col = model_2_conf_col,
    image_id_col = "image_id",
)

# scores still NaN after both passes become 0, then the index is renumbered
df = df.fillna({model_1_conf_col: 0, model_2_conf_col: 0}).reset_index(drop = True)

## 7. Nearest neighbour merge
Merge before dropping conf = 0 gov points so that we can keep the gov school data attributes

3 different location types:
  - 0: dropped points
  - 1: gov data
  - 2: ML prediction

In [None]:
# gov (ground truth) frame: same rows as df, image id column renamed
gt_df = df.rename(columns={'image_id': 'gt_image_id'})

# ML prediction frame (centroid data)
pred_df = gpd.read_file(pred_gjson_dir)

# convert crs to 4326 if source crs is 3857
# NOTE(review): to_crs uses the CRS recorded on the loaded frame — confirm the
# file actually declares pred_data_src_crs.
if pred_data_src_crs == "EPSG:3857":
    pred_df = pred_df.to_crs("EPSG:4326")

print("num gov points:", len(gt_df))
print("num pred points:", len(pred_df))

# rename the prediction image id, expose the point geometry as lat / lon
# columns, and keep only the columns needed for the merge
pred_df = pred_df.rename(columns={'image_id': 'pred_image_id'})
pred_df = pred_df.assign(lat_pred=pred_df.geometry.y,
                         lon_pred=pred_df.geometry.x)
pred_df = pred_df[["lat_pred", "lon_pred", "prob", "pred_image_id", "geometry"]]

### 7.1. Create a GeoDataFrame for gt csv data

In [None]:
# Build point geometries for the gov csv rows from the configured lon / lat
# columns (x = lon, y = lat), tagged with the csv's CRS.
# The column names come from the params cell rather than being hard-coded, so
# this cell follows the same configuration as the cleaning steps above.
geometry = gpd.points_from_xy(gt_df[original_lon_col_name],
                              gt_df[original_lat_col_name],
                              crs = original_crs) # 'EPSG:4326'

# wrap the gov DataFrame as a GeoDataFrame using those points
gt_df = gpd.GeoDataFrame(gt_df,
                         geometry = geometry)

In [None]:
"""  Save geojson files (Optional)  """

# gt_df.to_file("gt.geojson")
# pred_df.to_file("pred.geojson")

### 7.2. Merge - sjoin_nearest ( how = left )

In [None]:
# For every gov point, look up the nearest ML prediction point within
# max_distance. how='left' keeps all gov rows; gov rows without a match get
# NaN in the prediction columns.
# Both frames are reprojected to EPSG:3857 first so 'dist' is in projected units.
# NOTE(review): Web Mercator distances are scale-distorted away from the
# equator, so max_distance is only approximately in ground meters.
metric_crs = "EPSG:3857"
gov_points_metric = gt_df.to_crs(metric_crs)
ml_points_metric = pred_df.to_crs(metric_crs)

nearest_df = gpd.sjoin_nearest(
    gov_points_metric,
    ml_points_metric,
    how = 'left',
    distance_col = 'dist',
    max_distance = max_distance,
)

### 7.3. Drop duplicated ML points from nearest neighbour df

In [None]:
"""
- one ML point can be associated with multiple gt gov points
- drop duplciated ML points
"""

In [None]:
# gov points that were matched to an ML point within max_distance
nn_gov_points_df = nearest_df.loc[nearest_df['index_right'].notnull()]

# One ML point can be matched by several gov points; keep only the NEAREST gov
# point for each ML point. Sorting must be ascending so that keep="first"
# retains the smallest distance (a descending sort would keep the farthest
# match instead). A stable sort keeps ties in their original row order.
nn_gov_points_df = nn_gov_points_df.sort_values(by=['dist'],
                                                ascending=True,
                                                kind="mergesort")

# drop duplicates after sorting by distance
nn_gov_points_df = nn_gov_points_df.drop_duplicates(
    subset=["index_right", "lat_pred", "lon_pred", "prob", "pred_image_id"],
    keep="first")

# every other gov point that matched the same ML point loses its match:
# its prediction columns are blanked out, the gov attributes are kept
columns_to_nullify = [ "index_right", "lat_pred", "lon_pred", "prob", "pred_image_id", "dist" ]

# row, column indexer
nearest_df.loc[~nearest_df.index.isin(nn_gov_points_df.index), columns_to_nullify] = np.nan

# check if it actually dropped duplicates. e.g. 6632 is correct for sudan.
print("Total ML points near gov points:", len(nearest_df[nearest_df['index_right'].notna()]))

### Split the preds df into 2 parts
- one with points merged in gt csv file
- one with points that weren't merged in csv file

In [None]:
# pred_df index values referenced by the merged frame (NaN for unmatched gov rows)
pred_indexes_in_nn = nearest_df['index_right'].tolist()

# True for prediction rows that were merged onto a gov point
was_merged = pred_df.index.isin(pred_indexes_in_nn)

# points that are near gt_df points (i.e. merged points)
points_near_gt = pred_df[was_merged]

# points that are not near gt points
points_away_from_gt = pred_df[~was_merged].copy()

print("merged in gt df:", len(points_near_gt))
print("not merged points:", len(points_away_from_gt))

In [None]:
"""  Save the splitted dataframess (optional)  """
# ML points near gov points and points away from gov points
# points_near_gt.to_file('points_near_gt_df_dist_300.geojson')
# points_away_from_gt.to_file('points_away_gt_df_dist_300.geojson')

### Append the ML dataframe to gov dataframe
- appends ML points that have not been merged to gov dataframe
- these are the new discovery points by the ML model

In [None]:
# record each unmatched ML point's own pred_df index, mirroring the
# 'index_right' column that sjoin_nearest produced for matched rows
points_away_from_gt['index_right'] = list(points_away_from_gt.index)

# Stack gov rows and the unmatched ML rows.
# - DataFrame.append was removed in pandas 2.0, so pd.concat is used.
# - The geometry column is dropped from both parts BEFORE concatenating:
#   nearest_df was reprojected to EPSG:3857 for the join while the ML points
#   were not, and the geometry is discarded in this step anyway.
gov_part = pd.DataFrame(nearest_df.drop(columns=["geometry"]))
ml_part = pd.DataFrame(points_away_from_gt.drop(columns=["geometry"]))
merged_df = pd.concat([gov_part, ml_part])

# add a new column for location_type. e.g. gov, ML, dropped gov points
merged_df['location_type'] = np.nan

# rename index_right to pred_df_index, and mark the gov coordinates as such
merged_df = merged_df.rename(columns={'index_right': 'pred_df_index',
                                      'lat': 'lat_gov',
                                      'lon': 'lon_gov'})

## 8. Final data

In [None]:
"""
Add final_lat, final_lon columns in DataFrame (Pick from gov or ML)

1. if both gov and ML present, take ML
    - e.g. case: Assuming ML is more precise in location
    - idea to try: if gov conf is higher than pred conf, keep gov conf
        
2. only gov data present:
    - either YOLO == 0 or EfficientNet == 0
    - if both model conf for gov points < threshold --->  drop them
    - e.g. case: error points, like linear points, desert areas and water bodies
"""

In [None]:
# Add empty lat_final / lon_final columns and renumber the rows: the stacked
# gov + ML frame carries duplicate index labels.
merged_df = (
    merged_df
    .assign(lat_final=np.nan, lon_final=np.nan)
    .reset_index(drop = True)
)

### 8.1. if both gov and ML points are present, take ML points

In [None]:
# column names to be modified
lat_final_col = ['lat_final']
lon_final_col = ['lon_final']
loc_type_col = ['location_type']

# rows that carry an ML prediction point (copied so merged_df is not aliased)
ml_pred_points_df = merged_df[merged_df['pred_df_index'].notna()].copy()

# index locations and coordinates of those ML prediction points
pred_points_indexes = ml_pred_points_df.index
lat_pred = ml_pred_points_df['lat_pred']
lon_pred = ml_pred_points_df['lon_pred']

# for these rows the ML coordinates become the final coordinates and the
# location type is set to ML (2)
has_ml_point = merged_df.index.isin(pred_points_indexes)
for target_cols, new_values in ((lat_final_col, lat_pred),
                                (lon_final_col, lon_pred),
                                (loc_type_col, loc_type_ml)):
    merged_df.loc[has_ml_point, target_cols] = new_values

### 8.2. Create DataFrames with rows where only gov data is present:
Create 2 DataFrames:
- one with points that are still kept
- one with dropped points

In [None]:
# rows with no ML prediction point, i.e. gov points only
gov_only_mask = merged_df['pred_df_index'].isna()
non_ml_df = merged_df[gov_only_mask].copy()

# starts as the same set of rows; filtered down by the drop conditions later
kept_points_df = non_ml_df.copy()

### 8.3. Drop gov points based on conditions
Drop based on the following conditions:
- both the YOLO and EfficientNet confidence scores are 0: drop
- both model confidence scores are below the threshold: drop
- either YOLO == 0 or EfficientNet == 0 (**NOT** doing this)
    - doesn't always work
    - sometimes, EfficientNet conf is high and there's actually a school but YOLO conf is very low

In [None]:
""" 
Conditions:

# both YOLO and EfficientNet conf is 0
non_ml_df[ ((non_ml_df['conf_efficientnet'] == 0) & (non_ml_df['conf_yolov5'] == 0)) ]

# both model conf less than threshold
non_ml_df[ (non_ml_df['conf_efficientnet'] < drop_threshold) & (non_ml_df['conf_yolov5'] < drop_threshold) ]

# Either YOLO == 0 or EfficientNet == 0
non_ml_df[ (non_ml_df['conf_efficientnet'] == 0) | (non_ml_df['conf_yolov5'] == 0) ]

"""

In [None]:
# Drop gov-only points that neither model supports.
# The confidence columns are looked up through the configured column names
# (model_1_conf_col / model_2_conf_col) instead of hard-coded strings, so this
# filter stays in sync with the columns created when the scores were added.
efficientnet_conf = kept_points_df[model_1_conf_col]
yolov5_conf = kept_points_df[model_2_conf_col]

# condition 1: both YOLO and EfficientNet conf is 0
both_conf_zero = (efficientnet_conf == 0) & (yolov5_conf == 0)

# condition 2: both model conf less than threshold
both_conf_below_threshold = (efficientnet_conf < drop_threshold) & (yolov5_conf < drop_threshold)

# a row is kept only if it meets neither drop condition
kept_points_df = kept_points_df[~(both_conf_zero | both_conf_below_threshold)]

kept_points_df

In [None]:
# df for gov points (with no ML points nearby) that will be dropped after filtering with conditions
# dropped_points_df =  non_ml_df[ ~ non_ml_df.index.isin(kept_points_df.index)]
# dropped_points_df.to_csv("dropped_gov_points.csv")

### 8.4. Add gov points filtered with conditions (and with no ML points nearby) to final lat / lon column 

In [None]:
loc_type_col = ['location_type']

# index locations and gov coordinates of the gov points that survived filtering
kept_points_indexes = kept_points_df.index
gov_lat_final = kept_points_df['lat_gov']
gov_lon_final = kept_points_df['lon_gov']

# put gov lat / lon in the lat_final / lon_final columns and mark the rows
# as gov points (1)
is_kept_gov_point = merged_df.index.isin(kept_points_indexes)
for target_cols, new_values in ((lat_final_col, gov_lat_final),
                                (lon_final_col, gov_lon_final),
                                (loc_type_col, loc_type_gov)):
    merged_df.loc[is_kept_gov_point, target_cols] = new_values

In [None]:
# Optional: save before finalizing (can be used for debugging)
# merged_df.to_csv('final_data_debug_version.csv')

### 8.5.  Add also the dropped gov points to final DataFrame

In [None]:
# work on a copy so merged_df stays available for debugging
final_df = merged_df.copy()

loc_type_col = ['location_type']
lat_final_col = ['lat_final']
lon_final_col = ['lon_final']

# Rows that still have no final latitude are the gov points rejected by the
# confidence filter. They are not removed: they get their gov coordinates back
# and are flagged with location_type 0.
dropped_gov_points = final_df[final_df['lat_final'].isna()].copy()

dropped_indexes = dropped_gov_points.index
dropped_lat = dropped_gov_points['lat_gov']
dropped_lon = dropped_gov_points['lon_gov']

is_dropped_point = final_df.index.isin(dropped_indexes)
for target_cols, new_values in ((lat_final_col, dropped_lat),
                                (lon_final_col, dropped_lon),
                                (loc_type_col, loc_type_gov_dropped)):
    final_df.loc[is_dropped_point, target_cols] = new_values

### 8.6. Add "source" column to DataFrame
There are 2 values for **"source"** column:
- 1 = government location point
- 2 = ML prediction point

In [None]:
source_col = 'source'

# start with an empty "source" column
final_df[source_col] = np.nan

# location types that originate from the government file vs. from the ML model
gov_source_list = [loc_type_gov_dropped, loc_type_gov]
ml_source_list = [loc_type_ml]

##########  add values in "source" column  ##########
# gov rows (kept or dropped) get src_gov_id, ML rows get src_ml_id
for location_types, source_id in ((gov_source_list, src_gov_id),
                                  (ml_source_list, src_ml_id)):
    final_df.loc[final_df['location_type'].isin(location_types), source_col] = source_id

In [None]:
# Human-readable counterpart of the numeric "source" column: rows whose source
# equals the government id are labelled 'Govt', every other row 'ML'.
# The id is taken from src_gov_id rather than a literal 1 so the label stays
# correct if the id in the params cell changes.
final_df['Source'] = np.where(final_df['source'] == src_gov_id, 'Govt', 'ML')

### 8.7. Finalize

In [None]:
# helper columns that are not part of the final output
columns_to_drop = ["top", "left", "bottom", "right",
                   "pred_df_index", "lat_pred", "lon_pred", "prob", "pred_image_id", "dist"]

# renumber rows, remove the helper columns, and expose the final coordinates
# under the plain "lat" / "lon" names
final_df = (
    final_df
    .reset_index(drop = True)
    .drop(columns=columns_to_drop)
    .rename(columns={'lat_final': 'lat', 'lon_final': 'lon'})
)

# Save to csv file

In [None]:
# Write the finalized gov + ML table to "merged_data.csv" without the row index.
final_df.to_csv("merged_data.csv", index = False)

# Add giga ids and temp school_ids

In [41]:
# add giga ids
import uuid

# NOTE(review): `merged_data` was never defined anywhere in this notebook, so
# this cell raised NameError when run top to bottom. Loading the file written
# by the previous step is assumed to be the intent — confirm.
merged_data = pd.read_csv("merged_data.csv")

# One name-based (version 3) UUID per row, derived from the school name and
# the final lat / lon, so the same inputs always produce the same id.
list_unique_id = [
    str(uuid.uuid3(uuid.NAMESPACE_DNS, str(school_name) + str(lat) + str(lon)))
    for school_name, lat, lon in zip(merged_data['school_nam'],
                                     merged_data['lat'],
                                     merged_data['lon'])
]
merged_data['giga_id_school'] = list_unique_id


In [45]:
# add giga_school_id_seq

# 1-based running number per row, in current row order
merged_data['giga_school_id_seq'] = np.arange(len(merged_data)) + 1
merged_data['country_code'] = 'SD'

# Build ids of the form "ML_<country_code>_<6-digit zero-padded number>" in one
# vectorized pass. Each row uses its own country_code value, which avoids the
# label-based `[0]` lookup that fails when the index has no label 0.
# (The list name is historical; the values are not hexadecimal.)
list_hex_res = (
    "ML_"
    + merged_data['country_code']
    + "_"
    + merged_data['giga_school_id_seq'].astype(str).str.zfill(6)
).tolist()
merged_data['giga_school_id_seq'] = list_hex_res

In [55]:
# Use giga_school_id_seq as temp school ids for those with no school ids (the ML data points).
# Only missing MOEcode values are filled; each one takes the giga_school_id_seq
# value from its own row.
merged_data['MOEcode'] = merged_data['MOEcode'].fillna(merged_data['giga_school_id_seq'])

In [58]:
# Write the table with the added id columns to "merged_data_v2.csv" without the row index.
merged_data.to_csv("merged_data_v2.csv", index = False)