In [26]:
#Imports
import numpy as np
import pandas as pd
import folium
from geopy.distance import great_circle
from sklearn import metrics
from sklearn.cluster import DBSCAN as dbscan
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
import timeit
from ipywidgets import Label, Text, Button, Output, VBox, HBox

In [36]:
# Selector widgets: a caption, a text box holding the area code to analyse,
# a button that confirms the choice, and an output area for feedback.
normal_label = Label(value="Geographic Selector")
area_input = Text(
    value='S12000043',
    description='Area Code:',
    disabled=False,
)
normal_button = Button(description="Implement")
normal_output = Output()

def Area_Lock_In(a):
    """Button callback: echo the area code currently typed into ``area_input``.

    Clears whatever ``normal_output`` was showing and prints the current
    selection into it. ``a`` is the argument supplied by the click handler
    registration and is not used.
    """
    normal_output.clear_output()
    selection_message = f" Selected Area: {area_input.value} "
    with normal_output:
        print(selection_message)

# Hook the button up to its callback.
normal_button.on_click(Area_Lock_In)

# Stack the selector widgets vertically; the final expression is what the cell displays.
vbox_normal = VBox(children=[normal_label, area_input, normal_button, normal_output])
VBox(children=[vbox_normal])

VBox(children=(VBox(children=(Label(value='Geographic Selector'), Text(value='S12000043', description='Area Co…

In [37]:
#Data selection
def LoadData(area_code=None,
             data_dir=r"C:\Users\Thyll\Proj1",
             filenames=("accidents_2005_to_2007.csv",
                        "accidents_2009_to_2011.csv",
                        "accidents_2012_to_2014.csv")):
    """Load the accident CSVs, de-duplicate them and keep one local authority.

    Parameters
    ----------
    area_code : str, optional
        Value to match against the "Local_Authority_(Highway)" column.
        When omitted, the current value of the ``area_input`` widget is used,
        so ``LoadData()`` behaves as it always has. Pass it explicitly to make
        the result reproducible without touching the widget.
    data_dir : str or path-like, optional
        Directory containing the CSV files.
    filenames : iterable of str, optional
        CSV file names inside ``data_dir``; they are concatenated in this order.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows (empty if nothing matches).
        The index is whatever ``pd.concat`` produced, i.e. it is not reset.
    """
    from pathlib import Path  # local import keeps the block self-contained

    if area_code is None:
        # Fall back to the interactive selection.
        area_code = area_input.value

    base_dir = Path(data_dir)
    frames = [pd.read_csv(base_dir / name, low_memory=False) for name in filenames]
    accidents_df = pd.concat(frames)

    # The same accident can appear in more than one file; keep the first copy.
    accidents_clean = accidents_df.drop_duplicates(
        subset=["Accident_Index", "Date", "LSOA_of_Accident_Location", "Time", "Longitude", "Latitude"],
        keep="first")

    in_area = accidents_clean["Local_Authority_(Highway)"] == area_code
    # .copy() so later column assignments do not hit a view of accidents_clean.
    return accidents_clean[in_area].copy()


# Load the accident records once and keep only the rows for the area code
# currently held by the selector widget (LoadData reads area_input.value).
final_area = LoadData()

Unnamed: 0,Accident_Index,Location_Easting_OSGR,Location_Northing_OSGR,Longitude,Latitude,Police_Force,Accident_Severity,Number_of_Vehicles,Number_of_Casualties,Date,...,Pedestrian_Crossing-Physical_Facilities,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Carriageway_Hazards,Urban_or_Rural_Area,Did_Police_Officer_Attend_Scene_of_Accident,LSOA_of_Accident_Location,Year
192417,200597AB00101,259250.0,664760.0,-4.250145,55.855471,97,3,1,1,02/01/2005,...,No physical crossing within 50 meters,Daylight: Street light present,Raining without high winds,Wet/Damp,,,1,No,,2005
192418,200597AB00103,259710.0,665280.0,-4.243071,55.860273,97,3,1,1,01/03/2005,...,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,,2005
192419,200597AB00105,258840.0,665800.0,-4.257231,55.864688,97,3,1,2,03/05/2005,...,non-junction pedestrian crossing,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,,2005
192420,200597AB00111,259030.0,665760.0,-4.254177,55.864384,97,3,1,1,03/11/2005,...,Pedestrian phase at traffic signal junction,Daylight: Street light present,Raining without high winds,Wet/Damp,,,1,No,,2005
192421,200597AB00203,259260.0,664810.0,-4.250011,55.855922,97,3,1,1,01/03/2005,...,non-junction pedestrian crossing,Daylight: Street light present,Fine without high winds,Wet/Damp,,,1,Yes,,2005
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
462167,201497GE70410,258131.0,662999.0,-4.267086,55.839333,97,3,2,4,14/10/2014,...,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,1,Yes,,2014
462168,201497GE70510,259457.0,662424.0,-4.245631,55.834558,97,1,2,1,23/10/2014,...,No physical crossing within 50 meters,Darkness: Street lights present and lit,Raining without high winds,Wet/Damp,,,1,Yes,,2014
462875,201497NA00708,269371.0,664052.0,-4.088239,55.851963,97,3,2,2,01/08/2014,...,No physical crossing within 50 meters,Daylight: Street light present,Fine without high winds,Dry,,,2,Yes,,2014
463037,201497NC00611,264257.0,668119.0,-4.171904,55.887066,97,3,2,1,06/11/2014,...,No physical crossing within 50 meters,Daylight: Street light present,Raining without high winds,Wet/Damp,,,2,No,,2014


In [None]:
# Start of the wall-clock measurement; the elapsed time (toc - tic) is printed
# at the end of this cell, after the clustering call.
tic = timeit.default_timer()




def func_unsupervised_dbscan_all_years(a, b, c, output_name=None):
    """Cluster accident locations with DBSCAN, print a summary and save a folium map.

    Parameters
    ----------
    a : pandas.DataFrame
        Accident records with "Latitude" and "Longitude" columns in decimal degrees.
        Side effect (kept for compatibility): a "Cluster" column is written onto
        this frame. -1 marks noise points and rows that have no coordinates.
    b : int
        DBSCAN ``min_samples``: number of points (including the point itself)
        needed in a neighbourhood for a core point. Increase for fewer clusters.
    c : float
        Neighbourhood radius in metres (DBSCAN ``eps``). Decrease for fewer clusters.
    output_name : str, optional
        File name for the saved HTML map. Defaults to
        ``Glasgow_all_years_samples_{b}_distance_{c}.html``.

    Returns
    -------
    folium.Map
        The map that was saved, so a notebook cell can also display it.

    Raises
    ------
    ValueError
        If no row has both a latitude and a longitude.
    """
    # Mean Earth radius in metres, used to express the metre threshold as an angle.
    # NOTE(review): picked to match geopy's spherical great-circle model — confirm.
    earth_radius_m = 6371009.0

    coords_deg = a[["Latitude", "Longitude"]].to_numpy(dtype=float)
    has_coords = ~np.isnan(coords_deg).any(axis=1)
    if not has_coords.any():
        raise ValueError("No rows with both Latitude and Longitude to cluster.")
    n_skipped = int((~has_coords).sum())
    if n_skipped:
        # DBSCAN rejects NaN input, so rows without a position cannot take part.
        print(f"Skipped {n_skipped} rows without coordinates.")
    valid_deg = coords_deg[has_coords]
    valid_rad = np.radians(valid_deg)

    # The built-in haversine metric works on [lat, lon] in radians and returns an
    # angle, so eps is metres / radius. This is the same great-circle distance as
    # a per-pair Python callback, but computed inside a ball tree instead of
    # making one interpreter-level call per pair of accidents.
    dbs = dbscan(eps=c / earth_radius_m, min_samples=b,
                 metric="haversine", algorithm="ball_tree").fit(valid_rad)
    labels = dbs.labels_
    print(np.unique(labels))

    # Write labels back positionally so duplicated index labels cannot misalign them.
    cluster_column = np.full(len(a), -1, dtype=labels.dtype)
    cluster_column[has_coords] = labels
    a["Cluster"] = cluster_column

    '''METRICS'''
    # Only label-free metrics are reported. Homogeneity, completeness, V-measure,
    # ARI and AMI need true cluster labels, which do not exist for this data;
    # scoring against synthetically generated labels says nothing about the fit.
    label_groups = set(labels)
    n_clusters_ = len(label_groups) - (1 if -1 in label_groups else 0)
    n_noise_ = int(np.sum(labels == -1))
    print(f"Estimated number of clusters: {n_clusters_}")
    print(f"Estimated number of noise points: {n_noise_}")
    # silhouette_score is only defined for 2 <= number of groups <= n_samples - 1.
    # Noise (-1) is treated as one group, as in the plain labels-based call.
    if 2 <= len(label_groups) <= len(labels) - 1:
        silhouette = metrics.silhouette_score(valid_rad, labels, metric="haversine")
        print(f"Silhouette Coefficient: {silhouette:.3f}")
    else:
        print("Silhouette Coefficient: undefined (needs at least 2 label groups)")

    '''STAGE 3: PLOTTING MY CLUSTERS ON A MAP USING FOLIUM.'''
    centre_lat, centre_lon = valid_deg.mean(axis=0)
    cluster_map = folium.Map(location=[float(centre_lat), float(centre_lon)], zoom_start=13)
    folium.TileLayer("cartodbpositron").add_to(cluster_map)
    clust_colours = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99', '#e31a1c',
                     '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a', '#7a7a6f', '#b15928']
    in_cluster = labels != -1  # noise points are not drawn
    for (lat, lon), label in zip(valid_deg[in_cluster], labels[in_cluster]):
        # Colours repeat once there are more clusters than palette entries.
        col = clust_colours[int(label) % len(clust_colours)]
        folium.CircleMarker([float(lat), float(lon)],
                            radius=10, color=col, fill=True, fill_color=col).add_to(cluster_map)

    if output_name is None:
        output_name = f"Glasgow_all_years_samples_{b}_distance_{c}.html"
    cluster_map.save(output_name)
    return cluster_map



# a = DF in question
# b = min_samples (default = 5); increase to get fewer clusters
# c = distance between accidents in metres; decrease to get fewer clusters
# `final_area` is the frame returned by LoadData() for the selected area code;
# the name `glasgow` is not defined anywhere in this notebook.
func_unsupervised_dbscan_all_years(final_area, 10, 300)

toc = timeit.default_timer()

print(f"Time taken to complete task: {(toc - tic):.2f} seconds.")
