In [1]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import seaborn as sns
import pickle
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
import dask
from dask.distributed import Client
from kneed import KneeLocator
import umap

In [2]:
def compute_elbow_threshold(df_label_counts, y_name="count", y_label="Number of samples"):
    """Locate the knee of a cluster-size curve with the Kneedle algorithm.

    Parameters
    ----------
    df_label_counts : pandas.DataFrame
        One row per cluster, with a "label" column and a `y_name` column.
        The row index is used as the x-axis and the detected knee is then used
        as a row *position* (``iloc``), so the frame must carry a default
        0..n-1 integer index.
    y_name : str
        Name of the column holding the values of the curve.
    y_label : str
        Unused; kept so that existing calls keep working.

    Returns
    -------
    tuple
        ``(label, value)`` of the row at the knee.

    Raises
    ------
    ValueError
        If no knee could be located on the curve.
    """
    x = df_label_counts.index
    y = df_label_counts[y_name]
    kn = KneeLocator(x=x, y=y, curve='convex', direction='increasing')
    knee = kn.knee

    # KneeLocator reports a missing knee as None; fail with a clear message
    # instead of passing None on to iloc.
    if knee is None:
        raise ValueError(
            f"No knee found in column '{y_name}' ({len(df_label_counts)} rows); "
            "pass an explicit threshold instead."
        )

    knee_row = df_label_counts.iloc[int(knee)]
    return knee_row["label"], knee_row[y_name]

In [3]:
def drop_clusters_with_few_samples(df, thresh=None, suffix=""):
    """Remove all rows that belong to clusters with fewer than `thresh` samples.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a "label" column (cluster id per row) and an "e0" column;
        the cluster size is the number of non-null "e0" entries per label.
    thresh : int or None
        Minimum number of samples a cluster needs in order to be kept.
        If None, the Kneedle algorithm (``compute_elbow_threshold``) is used
        to determine the threshold. An explicit ``0`` is honoured as a
        threshold and keeps every cluster.
    suffix : str
        Unused; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` whose cluster is large enough, with their original
        index. `df` itself is not modified.
    """
    # count number of samples in each cluster, smallest cluster first
    df_nums = (
        df.groupby("label")["e0"]
        .count()
        .rename("count")
        .reset_index()
        .sort_values("count")
        .reset_index(drop=True)
    )
    df_nums["label"] = df_nums["label"].astype(str)

    # compute threshold to cut off clusters; `is None` (not truthiness) so that
    # thresh=0 is treated as a real threshold
    if thresh is None:
        cluster_label, thresh = compute_elbow_threshold(df_nums, y_name="count", y_label="Log number of samples")

    # labels (as strings) of all clusters that are large enough
    labels_to_keep = list(df_nums.loc[df_nums["count"] >= thresh, "label"])

    # drop small clusters; boolean indexing returns a new frame
    keep_mask = df["label"].astype(str).isin(labels_to_keep)
    return df[keep_mask]

# Repeat UMAP-DBSCAN runs

In [4]:
#@dask.delayed
def compute_labels(df):
    """Run one UMAP embedding + DBSCAN clustering and return the retained labels.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Feature matrix that is embedded with UMAP (the caller in this notebook
        passes the min-max scaled data).

    Returns
    -------
    pandas.Series
        The DBSCAN label of every sample whose cluster survived
        ``drop_clusters_with_few_samples``. The index holds the row positions
        of the surviving samples in `df`; dropped samples are absent.
    """
    # compute embedding from the data that was passed in (not from a global)
    embedding = umap.UMAP(min_dist=0.0, n_components=3, n_neighbors=20).fit_transform(df)

    # compute clustering
    model = DBSCAN(eps=0.1, min_samples=3).fit(embedding)

    # attach the labels to the embedding and drop small clusters
    # (threshold chosen by the Kneedle algorithm)
    embedding = pd.DataFrame(embedding, columns=["e0", "e1", "e2"])
    embedding["label"] = model.labels_
    dropped = drop_clusters_with_few_samples(embedding, thresh=None)

    return dropped.label

In [5]:
# client = Client(n_workers=4)
# client

In [6]:
# load data
df_in = pd.read_csv("data/df_wide_knn.csv")

# remove geolocation so that only the measured variables are clustered
df = df_in.drop(columns=["LATITUDE", "LONGITUDE", "LEV_M"])

# scale every column to [0, 1]
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [7]:
# number of UMAP-DBSCAN runs performed by the loop below
num_iterations = 100

# Re-run the UMAP-DBSCAN pipeline several times

In [None]:
# one label Series per run, in run order
labels_list = []

for i in range(num_iterations):
    print(i)
    run_labels = compute_labels(df_scaled)
    labels_list.append(run_labels)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48


In [None]:
# res = dask.compute(*labels_list)
# the dask call is disabled, so the results are simply the list built by the loop
res = labels_list

In [None]:
# persist the labels of all runs so that the matching below can be re-run later
with open("res_withDrop.pickle", "wb") as out_file:
    pickle.dump(res, out_file)

# Cluster matching

In [None]:
# NOTE: pickle.load can execute arbitrary code - only load a file that was
# written by this notebook.
with open("res_withDrop.pickle", "rb") as fp:
    res = pickle.load(fp)
# the cells below use both names for the same list of per-run labels
labels_list = res

## Step 1: Find the maximally overlapping labels/clusters for each pair of clusterings

In [None]:
# compare outputs matching by #overlapping samples
#
# For every ordered pair (i, j) of clusterings and every cluster c of clustering i,
# find the cluster of clustering j that shares the most samples with c.
num_iterations = len(res)

# One label Series per run, built once instead of once per pair.
run_labels = [pd.DataFrame(labels, columns=["label"])["label"] for labels in labels_list]

res_matching = []
for i in range(num_iterations):
    a = run_labels[i]
    clusters_a = np.sort(a.unique())
    num_clusters_a = len(clusters_a)

    for j in range(num_iterations):
        if i == j:
            continue  # a clustering is never compared with itself
        print(i, j)

        b = run_labels[j]

        # compute number of clusters of both clusterings
        num_clusters_b = len(b.unique())
        print(f"Number of clusters difference: {num_clusters_a - num_clusters_b}")

        # Align both label sets on the sample index of df_in. Only the two label
        # columns are needed, so df_in itself is not copied. Samples without a
        # label in a run become NaN in that run's column.
        df_labels = pd.DataFrame({"label_a": a, "label_b": b}, index=df_in.index)

        # iterate over each cluster in a
        for c in clusters_a:
            # given label c (from clustering a), which labels of clustering b are at the same points?
            matches = df_labels.loc[df_labels["label_a"] == c, ["label_b"]]

            # value_counts ignores NaN and sorts by count (descending), so the first
            # row is the best match. The counts are named explicitly because the
            # default column name differs between pandas versions.
            match_counts = matches.value_counts().rename("count").reset_index()
            num_matches = len(match_counts)

            # if all samples in c got assigned the same cluster in b, num_matches will be 1
            if num_matches != 0:
                # the cluster in b that has most points in common with cluster c
                best_label = match_counts["label_b"].iloc[0]
                best_count = match_counts["count"].iloc[0]
                res_matching.append({"clustering_a": i, "clustering_b": j,
                                     "label_a": c,
                                     "label_b": best_label,
                                     "num_samples_a": len(matches),
                                     "num_samples_max_match": best_count,
                                     "difference": len(matches) - best_count})
            else:
                # if no matching cluster was found (no sample of c has a label in b)
                res_matching.append({"clustering_a": i, "clustering_b": j,
                                     "label_a": c,
                                     "label_b": np.nan,
                                     "num_samples_a": len(matches),
                                     "num_samples_max_match": np.nan,
                                     "difference": np.nan})

In [None]:
# one row per (clustering_a, clustering_b, label_a) with its best match in b
df_overlap = pd.DataFrame(res_matching)
# written without the index so that the file only holds the matching columns
df_overlap.to_csv("overlap_withDrop.csv", index=False)

In [None]:
# display the overlap table
df_overlap

## Step 2: Apply the label mapping and count the number of different labels

In [None]:
# reload the overlap table written in step 1 (same file name as the to_csv call above)
df_overlap = pd.read_csv("overlap_withDrop.csv")

In [None]:
# One boolean Series per ordered pair (i, j): True where a sample's label in
# clustering i differs from its (mapped) label in clustering j.
uncertainties = []

for i in range(num_iterations):
    for j in range(num_iterations):
        if i == j:
            continue  # ignore the diagonal

        temp = df_in.copy()

        # define mapping between clustering a and b
        pair_rows = (df_overlap.clustering_a == i) & (df_overlap.clustering_b == j)
        mapping = df_overlap.loc[pair_rows, ["label_a", "label_b"]]

        # Clusters of a without a counterpart in b have label_b = NaN in the overlap
        # table; NaN is not a meaningful key for the label mapping, so skip them.
        mapping = mapping.dropna(subset=["label_b"])

        # label in b -> label in a. If several clusters of a share the same best
        # match in b, the last one listed wins.
        mapping_dict = dict(zip(mapping["label_b"], mapping["label_a"]))

        # add labels to initial df
        temp["label_a"] = res[i]
        temp["label_b"] = res[j]
        temp["label_b_mapped"] = temp["label_b"].map(mapping_dict)  # this is the label mapping

        # uncertainty as number of different labels: if labels are unequal, this will
        # be 1 (or True), i.e. we count how many different labels each datapoint has
        # over the iterations. A missing label (NaN) never compares equal.
        uncertainties.append(temp.label_a != temp.label_b_mapped)

In [None]:
# summarize uncertainties (ignore the diagonal): percentage of ordered pairs
# of clusterings in which a sample received different labels
num_pairs = num_iterations*num_iterations-num_iterations
disagreement_counts = np.array(uncertainties).sum(axis=0)
df_in["uncertainty"] = disagreement_counts/num_pairs*100

In [None]:
# persist the input data together with the new "uncertainty" column
df_in.to_csv("uncertainty.csv", index=False)

# Plot uncertainties

In [None]:
# reload the saved table; this rebinds df_in to the contents of uncertainty.csv
df_in = pd.read_csv("uncertainty.csv")

In [None]:
# compute a 3D UMAP embedding of the scaled data
reducer = umap.UMAP(min_dist=0.0, n_components=3, n_neighbors=20)
embedding = reducer.fit_transform(df_scaled)

# visualize embedding as a 3D scatter plot
fig = plt.figure(figsize=(7, 6))
ax = fig.add_subplot(projection='3d')
ax.scatter(embedding[:, 0], embedding[:, 1], embedding[:, 2], alpha=0.08, s=2, marker=".")
ax.set_xlabel("Axis 0")
ax.set_ylabel("Axis 1")
ax.set_zlabel("Axis 2")
fig.tight_layout()
# plt.savefig("output/umap_space.png")
plt.show()

In [None]:
# store the three embedding axes next to the uncertainty values
for axis, column in enumerate(("e0", "e1", "e2")):
    df_in[column] = embedding[:, axis]

In [None]:
# distribution of the per-sample uncertainty - many uncertain points...
hist_ax = sns.histplot(df_in["uncertainty"])
hist_ax.set_xlabel("Uncertainty [%]")
plt.tight_layout()
plt.savefig("output_old/dbscan/uncertainty/uncertainty_withDrop_histplot.png")
plt.show()

In [None]:
# box plot of the per-sample uncertainty - many uncertain points...
box_ax = sns.boxplot(df_in["uncertainty"])
box_ax.set_ylabel("Uncertainty [%]")
plt.tight_layout()
plt.savefig("output_old/dbscan/uncertainty/uncertainty_withDrop_boxplot.png")
plt.show()

In [None]:
# define Basemap from the extent of the plotted data. df_in is used (rather than
# the loop variable `temp` of the uncertainty cell) so that this cell also runs
# when the notebook is resumed from uncertainty.csv.
mymap = Basemap(llcrnrlon=df_in["LONGITUDE"].min(), llcrnrlat=df_in["LATITUDE"].min(),
                urcrnrlon=df_in["LONGITUDE"].max(), urcrnrlat=df_in["LATITUDE"].max(), fix_aspect=False)

# plot uncertainty in geographic space (longitude, latitude, depth level)
figsize = (6, 6)
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(projection='3d')
sc_3d = ax.scatter(df_in["LONGITUDE"], df_in["LATITUDE"], df_in["LEV_M"], c=df_in["uncertainty"], s=0.5, alpha=1, zorder=4)  # df["predictions"]
ax.add_collection3d(mymap.drawcoastlines(linewidth=0.5))
# scale the box to the data ranges; the LEV_M range is shrunk by a factor of 50
ax.set_box_aspect((np.ptp(df_in["LONGITUDE"]), np.ptp(df_in["LATITUDE"]), np.ptp(df_in["LEV_M"])/50))
ax.invert_zaxis()  # flip the LEV_M axis
plt.colorbar(sc_3d, location="bottom", fraction=0.05, pad=0.01, label="Uncertainty [%]")
plt.tight_layout()
plt.savefig("output_old/dbscan/uncertainty/uncertainty_withDrop_geospace.png")
plt.show()

# plot uncertainty in UMAP space
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(projection='3d')
sc_umap = ax.scatter(df_in["e0"], df_in["e1"], df_in["e2"], c=df_in["uncertainty"], alpha=0.8, zorder=4, s=1)  # , s=s, alpha=1, zorder=4)
plt.colorbar(sc_umap, location="bottom", fraction=0.05, pad=0.05, label="Uncertainty [%]")
plt.tight_layout()
plt.savefig("output_old/dbscan/uncertainty/uncertainty_withDrop_umapspace.png")
plt.show()