In [None]:
# Attach Google Drive to this Colab runtime; later cells read and write
# files below the mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


In [None]:
# Import essential Python libraries
# pandas is used for tabular data manipulation.
# numpy is used for numerical computations.
import pandas as pd
import numpy as np


In [None]:
# Load the cleaned accident records from Google Drive.
path = "/content/drive/MyDrive/msc-dessertation-traffic/data/accidents_clean.csv"
df = pd.read_csv(path)
print("Original size:", df.shape)

# SAMPLE for DBSCAN (to avoid RAM crash).
# The requested size is capped at the number of rows available, because
# DataFrame.sample (without replacement) raises ValueError when n is larger
# than the frame. The fixed random_state keeps the sample reproducible.
DBSCAN_SAMPLE_SIZE = 130000
df_sample = df.sample(n=min(DBSCAN_SAMPLE_SIZE, len(df)), random_state=42)
print("Sampled size:", df_sample.shape)

# Preview the sampled rows. Only the last expression of a cell is rendered,
# so a single preview is placed at the end of the cell.
df_sample.head()



In [None]:
# Display the column labels of the loaded frame.
df.columns


In [None]:
# --- Fix object columns before XGBoost ---

# Convert Time (HH:MM) to minutes since midnight.
# The conversion is skipped when Time is already numeric so that re-running
# this cell is harmless: minute counts do not match the "%H:%M" format, would
# be coerced to NaT, and the dropna below would then remove every row.
if not pd.api.types.is_numeric_dtype(df["Time"]):
    parsed_time = pd.to_datetime(df["Time"], format="%H:%M", errors="coerce")
    df["Time"] = parsed_time.dt.hour * 60 + parsed_time.dt.minute

# Drop rows where Time could not be parsed (or was missing to begin with)
df = df.dropna(subset=["Time"])

# Check data types
print(df.dtypes)


In [None]:
# Features are every column except the target and the identifier/date columns.
X = df.drop(["Accident_Severity", "Accident_Index", "Date"], axis=1)
y = df["Accident_Severity"]

# Ensure ALL features are numeric (values that cannot be parsed become NaN)
X = X.apply(pd.to_numeric, errors="coerce")

# A column without a single numeric value is entirely NaN after coercion.
# Leaving it in would make the row-wise dropna below discard every row, so
# such columns are removed first and reported.
empty_cols = X.columns[X.isna().all()]
if len(empty_cols) > 0:
    print("Dropping columns with no numeric values:", list(empty_cols))
    X = X.drop(columns=empty_cols)

# Remove the remaining rows that still contain a missing value
X = X.dropna()

# Align y with X after dropping rows
y = y.loc[X.index]

print(X.dtypes)
print(X.shape, y.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the split is stratified on the target
# and uses a fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

# XGBoost labels must start at 0, so every label is shifted down by one.
y_train_xgb, y_test_xgb = y_train - 1, y_test - 1



In [None]:
# Install the xgboost package into the runtime; -q keeps pip's output quiet.
!pip -q install xgboost



In [None]:
# Distinct target labels present in each split, in ascending order.
train_labels = sorted(y_train.unique())
test_labels = sorted(y_test.unique())
train_labels, test_labels



In [None]:
from xgboost import XGBClassifier

# Settings for the multi-class classifier, collected in one place so they are
# easy to read and adjust.
xgb_params = {
    "n_estimators": 300,
    "max_depth": 6,
    "learning_rate": 0.1,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "random_state": 42,
    "n_jobs": -1,
}
xgb = XGBClassifier(**xgb_params)

# Fit on the training split using the zero-based labels.
xgb.fit(X_train, y_train_xgb)
print("XGBoost training completed")


In [None]:
# Report accuracy, the per-class classification report and the confusion
# matrix on the test split. Labels here are the zero-based ones used for
# training (original label minus one).
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = xgb.predict(X_test)

test_accuracy = accuracy_score(y_test_xgb, y_pred)
class_report = classification_report(y_test_xgb, y_pred)
conf_matrix = confusion_matrix(y_test_xgb, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", class_report)
print("\nConfusion Matrix:\n", conf_matrix)


In [None]:
# Coordinates of the sampled rows; rows missing either value are discarded.
coord_columns = ["Longitude", "Latitude"]
coords = df_sample[coord_columns].dropna().copy()
print("Coords size:", coords.shape)
coords.head()


In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise only the two coordinate columns. Selecting them explicitly keeps
# this cell re-runnable: a later cell adds a "Hotspot_Cluster" column to
# `coords`, which would otherwise be scaled as a third feature on a re-run.
scaler = StandardScaler()
coords_scaled = scaler.fit_transform(coords[["Longitude", "Latitude"]])
print("Scaled shape:", coords_scaled.shape)


In [None]:
# Run DBSCAN on the standardised coordinates to identify spatial accident
# hotspots. Points labelled -1 are the ones DBSCAN treats as noise.
from sklearn.cluster import DBSCAN

dbscan = DBSCAN(min_samples=30, eps=0.25)
clusters = dbscan.fit_predict(coords_scaled)

# Store each point's label next to its coordinates and show the ten most
# frequent labels.
coords["Hotspot_Cluster"] = clusters
coords["Hotspot_Cluster"].value_counts().head(10)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN

# 1) Coordinates from the sampled dataframe (rows with a missing value removed)
coords = df_sample[["Latitude", "Longitude"]].dropna().copy()
print("Coords used:", coords.shape)

# 2) The haversine metric works on radians, in (latitude, longitude) order
lat_lon_degrees = coords[["Latitude", "Longitude"]].to_numpy()
coords_rad = np.radians(lat_lon_degrees)

# 3) DBSCAN parameters (km-based): the radius in kilometres is divided by
#    6371.0088 (Earth's mean radius in km) to express it in radians
kms = 0.5  # try 0.3, 0.5, 1.0 if needed
eps = kms / 6371.0088

dbscan = DBSCAN(eps=eps, min_samples=20, metric="haversine", algorithm="ball_tree")

clusters = dbscan.fit_predict(coords_rad)
coords["Hotspot_Cluster"] = clusters

# Summary: ten most frequent labels, then the number of clusters without noise (-1)
cluster_labels = coords["Hotspot_Cluster"]
print(cluster_labels.value_counts().head(10))
print("Clusters (excluding noise):", cluster_labels[cluster_labels != -1].nunique())


In [None]:
# Points that belong to a cluster (the noise label -1 is excluded).
is_clustered = coords["Hotspot_Cluster"] != -1
hotspots = coords[is_clustered]

# Scatter the clustered points, coloured by their cluster label.
plt.figure(figsize=(9,6))
plt.scatter(
    hotspots["Longitude"],
    hotspots["Latitude"],
    s=6,
    c=hotspots["Hotspot_Cluster"],
    cmap="tab10",
)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("DBSCAN Accident Hotspots (Sampled STATS19)")
plt.show()

print("Hotspot points:", hotspots.shape[0])


In [None]:
# Point counts for the five most frequent labels (the noise label -1 is counted too).
cluster_sizes = coords["Hotspot_Cluster"].value_counts()
cluster_sizes.head()


In [None]:
# Install the folium package into the runtime; -q keeps pip's output quiet.
!pip -q install folium

import folium
import pandas as pd
import numpy as np


In [None]:
# Keep only points assigned to a cluster (noise, label -1, is excluded)
in_cluster = coords["Hotspot_Cluster"] != -1
hotspots = coords.loc[in_cluster].copy()

print("Hotspot points:", hotspots.shape[0])
print("Clusters:", hotspots["Hotspot_Cluster"].nunique())

# Restrict to the five clusters with the most points (to keep the map fast)
cluster_counts = hotspots["Hotspot_Cluster"].value_counts()
top_clusters = cluster_counts.head(5).index
in_top = hotspots["Hotspot_Cluster"].isin(top_clusters)
hotspots_top = hotspots.loc[in_top].copy()

print("Top hotspot points:", hotspots_top.shape[0])
print("Top clusters:", hotspots_top["Hotspot_Cluster"].unique())


In [None]:
# Centre the map on the average position of the top-cluster points
center_lat = hotspots_top["Latitude"].mean()
center_lon = hotspots_top["Longitude"].mean()
map_centre = [center_lat, center_lon]

# Empty base map; markers are added to it afterwards
m = folium.Map(location=map_centre, tiles="cartodbpositron", zoom_start=6)
m


In [None]:
# Add one red CircleMarker per top-cluster point.
# The columns are zipped instead of using iterrows(): iterrows() returns each
# row as a Series with a single dtype, so the integer cluster label came back
# as a float and the popup read e.g. "Cluster: 3.0". Casting to int makes the
# popup show the plain integer label. Zipping columns also avoids building a
# Series per row.
# NOTE: re-running this cell adds the same markers to `m` again.
for lat, lon, cluster_id in zip(
    hotspots_top["Latitude"],
    hotspots_top["Longitude"],
    hotspots_top["Hotspot_Cluster"],
):
    folium.CircleMarker(
        location=[lat, lon],
        radius=3,
        popup=f"Cluster: {int(cluster_id)}",
        color="red",
        fill=True,
        fill_opacity=0.6
    ).add_to(m)

m


In [None]:
# Second map with one toggleable layer per top cluster.
m2 = folium.Map(location=[center_lat, center_lon], zoom_start=6, tiles="cartodbpositron")

for cl in top_clusters:
    layer = folium.FeatureGroup(name=f"Cluster {cl}")
    temp = hotspots_top[hotspots_top["Hotspot_Cluster"] == cl]

    # Every row in `temp` belongs to cluster `cl`, so the popup uses `cl`
    # directly. Reading the label back through iterrows() returned it as a
    # float ("Cluster: 3.0"), which did not match the layer name.
    for lat, lon in zip(temp["Latitude"], temp["Longitude"]):
        folium.CircleMarker(
            location=[lat, lon],
            radius=3,
            popup=f"Cluster: {int(cl)}",
            fill=True,
            fill_opacity=0.6
        ).add_to(layer)

    layer.add_to(m2)

# Layer switcher so individual clusters can be shown or hidden
folium.LayerControl().add_to(m2)
m2


In [None]:
import os

# Create the results folder on Drive; exist_ok=True makes this a no-op when
# the folder is already there.
results_dir = "/content/drive/MyDrive/msc-dessertation-traffic/results"
os.makedirs(results_dir, exist_ok=True)



In [None]:
# Write the layered map (m2) to Drive as an HTML file and report where it went.
out_path = "/content/drive/MyDrive/msc-dessertation-traffic/results/hotspot_map.html"
m2.save(out_path)
print("Saved:", out_path)

In [None]:
import matplotlib.pyplot as plt

# Static scatter of the top-cluster points, coloured by cluster label.
# The figure is written to Drive as a PNG before it is shown.
plt.figure(figsize=(10,6))
plt.scatter(
    hotspots_top["Longitude"],
    hotspots_top["Latitude"],
    s=5,
    c=hotspots_top["Hotspot_Cluster"],
)
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Accident Hotspots Identified using DBSCAN (Sampled Data)")

png_path = "/content/drive/MyDrive/msc-dessertation-traffic/results/hotspot_map.png"
plt.savefig(png_path, dpi=300, bbox_inches="tight")
plt.show()
