In [1]:
import geopandas as gpd
import pandas as pd
from sklearn.cluster import DBSCAN
import numpy as np

## Params

In [2]:
# --- input table --------------------------------------------------------
# path of the CSV file that is read with pd.read_csv (a file path, despite the "_dir" suffix)
csv_dir = "data/mongolia_gov_data_model_conf_scores_added.csv"
# CRS the point geometries are created in, and the columns holding the coordinates
csv_crs = "epsg:4326"
csv_lon_col_name = "longitude"
csv_lat_col_name = "latitude"

# --- clustering ---------------------------------------------------------
# CRS the points are reprojected to before clustering
crs_3857 = "epsg:3857"
# neighbourhood radius passed to DBSCAN as "eps" (in meters); also embedded in the output file name
nn_distance = 15

## Cluster nearby points using a distance threshold

In [3]:
# read csv file; "df" itself is left untouched so that cluster ids can later be
# merged back onto every original row
df = pd.read_csv(csv_dir)

# drop rows that are missing either coordinate. The column names are taken from
# the params cell so that changing them there is enough.
new_df = df.dropna(subset=[csv_lon_col_name, csv_lat_col_name]).copy()

# build a GeoDataFrame of point geometries (x = longitude, y = latitude) in the CSV's CRS
gdf = gpd.GeoDataFrame(new_df,
                       geometry=gpd.points_from_xy(new_df[csv_lon_col_name],
                                                   new_df[csv_lat_col_name]),
                       crs=csv_crs)

# convert CRS to the projected CRS configured in "crs_3857"
gdf = gdf.to_crs(crs_3857)

# keep only rows whose projected coordinates are finite: the CRS conversion can
# yield inf for coordinates it cannot project, and DBSCAN does not accept NaN/inf input.
# The index is reset so that it runs 0..n-1 over the remaining rows.
has_finite_xy = np.isfinite(gdf.geometry.x) & np.isfinite(gdf.geometry.y)
gdf = gdf[has_finite_xy].reset_index(drop=True)

In [4]:
# Cluster points that lie within "nn_distance" meters of each other.
#
# Distances are measured as great-circle (haversine) distances on lon/lat rather
# than as Euclidean distances on EPSG:3857 coordinates: Web Mercator stretches
# lengths by 1/cos(latitude), so a radius expressed in its units covers noticeably
# less than the same number of meters on the ground away from the equator.
#
# "gdf" is not modified here (no helper x/y columns are added), so this cell can be
# re-run in the same kernel without side effects.
EARTH_RADIUS_M = 6_371_008.8  # mean Earth radius, in meters

# geographic coordinates in degrees (x = longitude, y = latitude)
lonlat = gdf.geometry.to_crs("EPSG:4326")

# cluster - create a numpy array where each row is a coordinate pair,
# ordered [latitude, longitude] in radians as the haversine metric expects
coords = np.radians(np.column_stack([lonlat.y, lonlat.x]))

# eps is the max distance between neighbouring points, converted from meters to radians.
# min cluster size is 2 points; a point with no neighbour within eps gets label -1 (noise).
# You'll have to tweak the distance "nn_distance" variable.
db = DBSCAN(eps = nn_distance / EARTH_RADIUS_M,
            min_samples = 2,
            metric = "haversine",
            algorithm = "ball_tree").fit(coords)

# a series with all points' cluster ids; db.labels_ follows the row order of "coords"
# (i.e. of "gdf"), so the labels are attached to gdf's own index
cluster_labels = pd.Series(db.labels_, index=gdf.index, name="cluster")

# keep only the join key and the cluster label
cluster_df = gdf[["point_id"]].assign(cluster=cluster_labels)

## Add cluster ids to DataFrame

In [5]:
# left-join the cluster labels onto the original "df" by "point_id" and expose them
# as "cluster_id"; rows of "df" without a match in "cluster_df" get NaN there
final_df = (
    df.merge(cluster_df, how="left", on="point_id")
      .rename(columns={"cluster": "cluster_id"})
)

In [6]:
# write the final table to CSV without the index column; the distance threshold
# is part of the file name
output_csv_path = f"data/gov_data_nn_{nn_distance}_m.csv"
final_df.to_csv(output_csv_path, index=False)