In [None]:
#Imports
import numpy as np
import pandas as pd
import folium
from geopy.distance import great_circle
from sklearn import metrics
from sklearn.cluster import DBSCAN as dbscan
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from ipywidgets import Label, Text, Button, Output, VBox, HBox

In [52]:
# IPython widget

style = {'description_width': 'initial'}
normal_label = Label("Geographic Selector")
area_input = Text(value='S12000043', description='Area Code:', disabled=False)
minsamp = BoundedIntText(value=5, min=2, max=50, step=1, description='Minimum cars per cluster (min. samples):', disabled=False, style=style)
EPS = BoundedIntText(value=30, min=10, max=5000, step=5, description='Minimum distance between clusters (EPS):', disabled=False, style=style)

normal_button = Button(description="Implement")
normal_output = Output()

def Area_Lock_In(a):
    normal_output.clear_output()
    with normal_output:
        print(f" Selected Area: {area_input.value} with minimum samples: {minsamp.value} & EPS: {EPS.value} ")

normal_button.on_click(Area_Lock_In)

vbox_normal = VBox([normal_label, area_input, minsamp, EPS, normal_button, normal_output])
VBox([vbox_normal])

VBox(children=(VBox(children=(Label(value='Geographic Selector'), Text(value='S12000043', description='Area Co…

In [49]:
#Data selection
def LoadData():
    acc_05_07 = pd.read_csv(r"C:\Users\Thyll\Proj1\accidents_2005_to_2007.csv",low_memory=False)
    acc_09_11 = pd.read_csv(r"C:\Users\Thyll\Proj1\accidents_2009_to_2011.csv",low_memory=False)
    acc_12_14 = pd.read_csv(r"C:\Users\Thyll\Proj1\accidents_2012_to_2014.csv",low_memory=False)
    accidents = [acc_05_07, acc_09_11, acc_12_14]
    accidents_df = pd.concat(accidents)
    accidents_clean = accidents_df.drop_duplicates(
        subset=["Accident_Index", "Date", "LSOA_of_Accident_Location", "Time", "Longitude", "Latitude"], keep="first")
    final_areax = accidents_clean[accidents_clean["Local_Authority_(Highway)"] == area_input.value].copy()
    return final_areax


final_area = LoadData()

In [54]:
# I want to define my function as taking 3 variables, a and b and c.
# a = DF in question
# b = min_samples (default = 5)
# c = distance between accidents
def func_unsupervised_dbscan_all_years(a, b, c):
    acc_area_year = a
    
    def function_geopy_gc(x, y):
        lat1, long1 = x[0], x[1]
        lat2, long2 = y[0], y[1]
        distance = great_circle((lat1, long1), (lat2, long2)).meters
        return distance

    dist_between_accidents = c  # distance in meters to use with dbscan
    # min_samples = The number of samples (or total weight) in a neighborhood for a
    # point to be considered as a core point. This includes the point itself.
    acc_area_year_dbscan = acc_area_year
    location = acc_area_year_dbscan[["Latitude", "Longitude"]]
    dbs = dbscan(eps=dist_between_accidents, min_samples=b, metric=function_geopy_gc).fit(location)
    labels = dbs.labels_
    unique_labels = np.unique(dbs.labels_)
    acc_area_year_dbscan["Cluster"] = labels
   

    '''METRICS'''
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)
    X, labels_true = make_blobs(n_samples=len(acc_area_year_dbscan["Latitude"]), centers=location, cluster_std=0.4,
                                random_state=0)

    X = StandardScaler().fit_transform(X)
    print(str(f"Stats for nerds:"))
    print(str(f"Estimated number of clusters: {n_clusters_}"))
    print(str(f"Estimated number of noise points: {n_noise_}"))
    print(str(f"Homogeneity: {(metrics.homogeneity_score(labels_true, labels)):.3f}"))
    print(str(f"Completeness: {(metrics.completeness_score(labels_true, labels)):.3f}"))
    print(str(f"V-measure: {(metrics.v_measure_score(labels_true, labels)):.3f}"))
    print(str(f"Adjusted Rand Index: {(metrics.adjusted_rand_score(labels_true, labels)):.3f}"))
    print(str(f"Adjusted Mutual Information: {(metrics.adjusted_mutual_info_score(labels_true, labels)):.3f}"))
    print(str(f"Silhouette Coefficient: {(metrics.silhouette_score(X, labels)):.3f}"))

    '''STAGE 3: PLOTTING MY CLUSTERS ON A MAP USING FOLIUM.'''
    location = acc_area_year_dbscan["Latitude"].mean(), acc_area_year_dbscan["Longitude"].mean()
    cluster_map = folium.Map(location=location, zoom_start=13)
    folium.TileLayer("cartodbpositron").add_to(cluster_map)
    clust_colours = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
                     '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#7a7a6f', '#b15928']
    for i in range(0,
                   len(acc_area_year_dbscan)):
        colouridx = acc_area_year_dbscan["Cluster"].iloc[i]
        if colouridx == -1:  
            pass
        else: 
            col = clust_colours[colouridx % len(clust_colours)]
            folium.CircleMarker([acc_area_year_dbscan["Latitude"].iloc[i],
                                 acc_area_year_dbscan["Longitude"].iloc[i]],
                                radius=10, color=col, fill=col).add_to(cluster_map)

    display(cluster_map)



func_unsupervised_dbscan_all_years(final_area, minsamp.value, EPS.value)



Stats for nerds:
Estimated number of clusters: 22
Estimated number of noise points: 12779
Homogeneity: 0.032
Completeness: 1.000
V-measure: 0.063
Adjusted Rand Index: 0.000
Adjusted Mutual Information: 0.000
Silhouette Coefficient: -0.162


Time taken to complete task: 75.43 seconds.
