In [None]:
import os
import numpy as np
import pandas as pd
import time
from collections import Counter
from sklearn.metrics.cluster import adjusted_rand_score, adjusted_mutual_info_score
import math
from LAF import DBSCAN
from LAF import DBSCANPP


# ---- Experiment settings ----------------------------------------------------
# Dataset identifier; it is substituted into the dataset and prediction file names.
exp_ds_name = "MS_50k"
# Only echoed in the banner below; no visible cell uses it to subsample the data.
num_sample = 50000

# Clustering thresholds: cosine-distance eps and the core-point neighbour count.
eps_cos_dist = 0.55
minPts = 5

# Multipliers applied to minPts to get the relaxed core threshold of each LAF variant.
alpha_for_laf_dbscan = 1.5
alpha_for_laf_dbscanpp = 1.0
# Added to the predicted-core-point ratio to form the DBSCAN++ sample fraction.
dbscanpp_p_delta = 0.2

# ---- Echo the settings so each run's log is self-describing -----------------
print("=============== Parameters ========================")
print("exp_ds_name: {}".format(exp_ds_name), flush=True)
print("num_sample: {}".format(num_sample), flush=True)
print(
    "eps_cos_dist: {}, minPts: {}".format(eps_cos_dist, minPts),
    flush=True,
)
print(
    "alpha_for_laf_dbscan: {}, alpha_for_laf_dbscan++: {}".format(
        alpha_for_laf_dbscan, alpha_for_laf_dbscanpp
    ),
    flush=True,
)
print("delta for DBSCAN++: {}".format(dbscanpp_p_delta), flush=True)
print("====================================================")

In [None]:
# Directory holding the estimator prediction files, and the dataset root.
workdir = "./prediction"
ds_root = "./ds"
test_ds_path = os.path.join(ds_root, "{}.test.npy".format(exp_ds_name))

# Load first and report afterwards, so the "Loaded ..." line is only printed
# once np.load has actually succeeded.
test_data_points = np.load(test_ds_path)
print("Loaded test data from {}".format(test_ds_path), flush=True)

# Normalise the dtype to float32; arrays that already match are left as they are.
if test_data_points.dtype != np.float32:
    test_data_points = test_data_points.astype(np.float32)

# Alias used by all clustering calls below.
points = test_data_points

In [None]:
def load_estimator_prediction():
    """Read the saved estimator output for the current experiment settings.

    The file name is built from the notebook-level ``workdir``,
    ``exp_ds_name``, ``eps_cos_dist`` (two decimals) and ``minPts`` values.

    Returns:
        The array stored in that file, exactly as returned by ``np.load``.
    """
    prediction_path = "{}/{}_eps_{:.2f}_tau_{}.output".format(
        workdir, exp_ds_name, eps_cos_dist, minPts
    )
    return np.load(prediction_path)

def cosDist2eucDist(cos_dist: float, scale_factor: float):
    """Convert a cosine-distance value into a Euclidean-distance value.

    Evaluates ``sqrt(2 * scale_factor**2 * cos_dist)``.

    Args:
        cos_dist: Cosine distance to convert.
        scale_factor: Factor that is squared inside the root
            (presumably the common norm of the vectors -- confirm).

    Returns:
        The corresponding Euclidean distance as a float.
    """
    squared_euc_dist = 2 * (scale_factor**2) * cos_dist
    return math.sqrt(squared_euc_dist)

# The DBSCAN++ codebase only supports the Euclidean metric, and DBSCAN, LAF-DBSCAN
# and LAF-DBSCAN++ are all implemented on top of that codebase, so every method
# is given the Euclidean equivalent of the cosine-distance eps.
# NOTE(review): scale_factor=1 presumably relies on unit-norm points -- confirm.
euc_eps_for_dbscanpp = cosDist2eucDist(eps_cos_dist, scale_factor=1)

# Column 0 of the estimator output is what the LAF variants receive as the
# predicted neighbour counts.
test_res_arr = load_estimator_prediction()
pred_num_neighbors = test_res_arr[:, 0]

def run_original_DBSCAN(points, euc_dist_eps, minPts):
    """Run plain DBSCAN on ``points`` and time the clustering call.

    Args:
        points: Data to cluster; forwarded unchanged to ``DBSCAN.fit_predict``.
        euc_dist_eps: Value passed as ``eps`` to the ``DBSCAN`` constructor.
        minPts: Value passed as ``minPts`` to the ``DBSCAN`` constructor.

    Returns:
        A tuple ``(clusters_gt, time_gt, dbscan_instance)``: the labels returned
        by ``fit_predict``, the elapsed time of that call in seconds, and the
        ``DBSCAN`` object that produced them.
    """
    dbscan_instance = DBSCAN(eps=euc_dist_eps, minPts=minPts)

    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    clusters_gt = dbscan_instance.fit_predict(points, cluster_outliers=False)
    time_gt = time.perf_counter() - start
    print("Elapsed time (DBSCAN): {} s".format(time_gt), flush=True)

    # Counts distinct label values.
    # NOTE(review): with cluster_outliers=False this may include a noise label -- confirm.
    print("#clusters:", len(Counter(clusters_gt)), flush=True)
    return clusters_gt, time_gt, dbscan_instance

def run_LAF_DBSCAN(dbscan_instance, points, pred_num_neighbors, clusters_gt, alpha=alpha_for_laf_dbscan):
    """Run LAF-DBSCAN, time it, and print its agreement with ``clusters_gt``.

    Args:
        dbscan_instance: Object exposing ``minPts`` and
            ``fit_predict_with_card_est_with_postproc``.
        points: Data to cluster; forwarded unchanged.
        pred_num_neighbors: Predicted neighbour counts; forwarded unchanged.
        clusters_gt: Reference labels used for the ARI/AMI printouts.
        alpha: Multiplier applied to ``dbscan_instance.minPts`` to obtain the
            threshold passed as ``pred_core_minPts``.

    Returns:
        A tuple ``(clusters_laf_dbscan, time_laf_dbscan, num_pred_core_pts,
        num_real_core_pts)``; the first, third and fourth items are the three
        values returned by the clustering call, the second is its elapsed time
        in seconds.
    """
    signature = "LAF-DBSCAN, alpha={:.2f}".format(alpha)
    relaxed_thresh = dbscan_instance.minPts * alpha

    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    clusters_laf_dbscan, num_pred_core_pts, num_real_core_pts = (
        dbscan_instance.fit_predict_with_card_est_with_postproc(
            points,
            pred_num_neighbors,
            pred_core_minPts=relaxed_thresh,
            cluster_outliers=False,
        )
    )
    time_laf_dbscan = time.perf_counter() - start

    print("Elapsed time ({}): {} s".format(signature, time_laf_dbscan), flush=True)
    # Counts distinct label values.
    print("#clusters:", len(Counter(clusters_laf_dbscan)), flush=True)
    print("[%s] ARI: %f " % (signature, adjusted_rand_score(clusters_laf_dbscan, clusters_gt)), flush=True)
    print("[%s] AMI: %f " % (signature, adjusted_mutual_info_score(clusters_laf_dbscan, clusters_gt)), flush=True)

    return clusters_laf_dbscan, time_laf_dbscan, num_pred_core_pts, num_real_core_pts

def run_DBSCANpp(points, euc_dist_eps, minPts, init_method, clusters_gt, **kwargs):
    """Run DBSCAN++ with a dynamically chosen sample fraction and time it.

    The sample fraction is ``num_pred_core_pts / len(points) + p_delta``,
    capped at 1.0.

    Args:
        points: Data to cluster; forwarded unchanged to ``DBSCANPP.fit_predict``.
        euc_dist_eps: Used for both ``eps_density`` and ``eps_clustering``.
        minPts: Value passed as ``minPts`` to the ``DBSCANPP`` constructor.
        init_method: Passed as ``init`` to ``fit_predict``.
        clusters_gt: Reference labels used for the ARI/AMI printouts.
        **kwargs: Must contain ``num_pred_core_pts`` (number of predicted core
            points, taken from the LAF-DBSCAN run) and ``p_delta`` (offset
            added to the ratio).

    Returns:
        A tuple ``(clusters_dbscanpp, time_dbscanpp, dbscanpp_instance)``: the
        labels, the elapsed time of ``fit_predict`` in seconds, and the
        ``DBSCANPP`` object.

    Raises:
        KeyError: If ``num_pred_core_pts`` or ``p_delta`` is not supplied.
    """
    # DBSCAN++ with dynamic sample fraction based on the #predicted_core_points of LAF-DBSCAN.
    sample_fraction = float(kwargs['num_pred_core_pts']) / len(points) + kwargs['p_delta']
    sample_fraction = min(sample_fraction, 1.0)
    signature = "DBSCAN++, dynamic p={:.4f}".format(sample_fraction)
    print("sample fraction (p): {} , init_method: {}".format(sample_fraction, init_method), flush=True)

    dbscanpp_instance = DBSCANPP(p=sample_fraction, eps_density=euc_dist_eps, eps_clustering=euc_dist_eps, minPts=minPts)

    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    clusters_dbscanpp = dbscanpp_instance.fit_predict(points, init=init_method, cluster_outliers=False)
    time_dbscanpp = time.perf_counter() - start
    print("Elapsed time ({}): {} s".format(signature, time_dbscanpp), flush=True)

    # Counts distinct label values.
    print("#clusters:", len(Counter(clusters_dbscanpp)), flush=True)
    print("[%s] ARI: %f ." % (signature, adjusted_rand_score(clusters_dbscanpp, clusters_gt)), flush=True)
    print("[%s] AMI: %f ." % (signature, adjusted_mutual_info_score(clusters_dbscanpp, clusters_gt)), flush=True)

    return clusters_dbscanpp, time_dbscanpp, dbscanpp_instance

def run_LAF_DBSCANpp(dbscanpp_instance, points, pred_num_neighbors, clusters_gt, init_method, alpha=alpha_for_laf_dbscanpp):
    """Run LAF-DBSCAN++, time it, and print its agreement with ``clusters_gt``.

    Args:
        dbscanpp_instance: Object exposing ``p``, ``minPts`` and
            ``fit_predict_with_card_est_with_postproc``.
        points: Data to cluster; forwarded unchanged.
        pred_num_neighbors: Predicted neighbour counts; forwarded unchanged.
        clusters_gt: Reference labels used for the ARI/AMI printouts.
        init_method: Passed as ``init`` to the clustering call.
        alpha: Multiplier applied to ``dbscanpp_instance.minPts`` to obtain the
            threshold passed as ``pred_core_minPts``.

    Returns:
        A tuple ``(clusters_laf_dbscanpp, time_laf_dbscanpp)``: the value
        returned by the clustering call and its elapsed time in seconds.
    """
    signature = "LAF-DBSCAN++, alpha={:.2f}, dynamic p={:.4f}".format(alpha, dbscanpp_instance.p)
    relaxed_thresh = dbscanpp_instance.minPts * alpha

    # perf_counter() is monotonic and high-resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    start = time.perf_counter()
    clusters_laf_dbscanpp = dbscanpp_instance.fit_predict_with_card_est_with_postproc(
        points,
        pred_num_neighbors,
        pred_core_minPts=relaxed_thresh,
        init=init_method,
        cluster_outliers=False,
    )
    time_laf_dbscanpp = time.perf_counter() - start
    print("Elapsed time ({}): {} s".format(signature, time_laf_dbscanpp), flush=True)

    # Counts distinct label values.
    print("#clusters:", len(Counter(clusters_laf_dbscanpp)), flush=True)
    print("[%s] ARI: %f" % (signature, adjusted_rand_score(clusters_laf_dbscanpp, clusters_gt)), flush=True)
    print("[%s] AMI: %f" % (signature, adjusted_mutual_info_score(clusters_laf_dbscanpp, clusters_gt)), flush=True)

    return clusters_laf_dbscanpp, time_laf_dbscanpp


### Run and evaluate 

In [None]:
# Plain DBSCAN run; its labels (clusters_gt) are the reference that the other
# methods are scored against in the cells below.
(clusters_gt, time_gt, dbscan_instance) = run_original_DBSCAN(
    points, euc_eps_for_dbscanpp, minPts
)

In [None]:
# LAF-DBSCAN reuses the DBSCAN instance from the baseline cell; num_pred_core_pts
# is needed by the DBSCAN++ cell to derive its sample fraction.
(
    clusters_laf_dbscan,
    time_laf_dbscan,
    num_pred_core_pts,
    num_real_core_pts,
) = run_LAF_DBSCAN(
    dbscan_instance, points, pred_num_neighbors, clusters_gt, alpha_for_laf_dbscan
)

In [None]:
# DBSCAN++ run; the same init_method is reused by the LAF-DBSCAN++ cell.
init_method = 'uniform'
(clusters_dbscanpp, time_dbscanpp, dbscanpp_instance) = run_DBSCANpp(
    points,
    euc_eps_for_dbscanpp,
    minPts,
    init_method,
    clusters_gt,
    num_pred_core_pts=num_pred_core_pts,
    p_delta=dbscanpp_p_delta,
)

In [None]:
# LAF-DBSCAN++ reuses the DBSCAN++ instance (and therefore its sample fraction p).
(clusters_laf_dbscanpp, time_laf_dbscanpp) = run_LAF_DBSCANpp(
    dbscanpp_instance,
    points,
    pred_num_neighbors,
    clusters_gt,
    init_method,
    alpha_for_laf_dbscanpp,
)

In [None]:
# Agreement of each method's labels with the plain-DBSCAN labels (clusters_gt).
ari_laf_dbscan = adjusted_rand_score(clusters_gt, clusters_laf_dbscan)
ami_laf_dbscan = adjusted_mutual_info_score(clusters_gt, clusters_laf_dbscan)
ari_dbscanpp = adjusted_rand_score(clusters_gt, clusters_dbscanpp)
ami_dbscanpp = adjusted_mutual_info_score(clusters_gt, clusters_dbscanpp)
ari_laf_dbscanpp = adjusted_rand_score(clusters_gt, clusters_laf_dbscanpp)
ami_laf_dbscanpp = adjusted_mutual_info_score(clusters_gt, clusters_laf_dbscanpp)

# One row per method: (name, ARI, AMI, elapsed seconds). The reference run has
# no scores of its own, hence the "-" placeholders.
scores_and_time = [
    ("groundtruth", "-", "-", time_gt),
    ("DBSCAN++", ari_dbscanpp, ami_dbscanpp, time_dbscanpp),
    ("LAF-DBSCAN", ari_laf_dbscan, ami_laf_dbscan, time_laf_dbscan),
    ("LAF-DBSCAN++", ari_laf_dbscanpp, ami_laf_dbscanpp, time_laf_dbscanpp),
]
# Banner typo fixed ("Evalution" -> "Evaluation"); one trailing "=" dropped so
# the header keeps the same width as the footer.
print("================= Evaluation scores =====================\n")
print(pd.DataFrame(scores_and_time, columns=["method", "ARI", "AMI", "time"]))
print("=========================================================")