## UBER Pickups clustering with DBScan

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px 

As we did with the KMeans algorithm, let's build a dataset by cleaning up the data and adding the day of week and the hour. We keep only a 5% sample of the data because the DBSCAN algorithm is more time-consuming.

In [2]:
import os

filepath = './Sources'

# Select the 2014 raw-data files in sorted order: os.listdir() returns entries
# in arbitrary order, and the seeded sample below is only reproducible if the
# rows are always concatenated in the same order.
raw_filenames = sorted(
    filename for filename in os.listdir(filepath)
    if 'uber-raw-data' in filename and '14.csv' in filename
)
if not raw_filenames:
    raise FileNotFoundError(f"No 'uber-raw-data*14.csv' file found in {filepath!r}")

monthly_frames = []
for filename in raw_filenames:
    print(f'Loading {filename} in a dataset')
    monthly_frames.append(pd.read_csv(os.path.join(filepath, filename)))

print("Cleaning up data, calculating sample and needed columns...")

# Concatenate once instead of growing a DataFrame inside the loop (which
# re-copies every previously loaded row at each iteration). Each file keeps
# its own index, so index labels are not unique across files.
df = pd.concat(monthly_frames).drop(columns=['Base'])
del monthly_frames

# We take a sample of 5% of data
frac = 0.05
df_sample = df.sample(frac=frac, random_state=0)
del df  # free the full dataset, only the sample is used from here on

# Derive the day of week and the hour of day from the pickup timestamp,
# then drop the raw timestamp column
pickup_time = pd.to_datetime(df_sample['Date/Time'])
df_sample = df_sample.assign(
    day=pickup_time.dt.dayofweek,
    hour=pickup_time.dt.hour,
).drop(columns=['Date/Time'])

Loading uber-raw-data-apr14.csv in a dataset
Loading uber-raw-data-aug14.csv in a dataset
Loading uber-raw-data-jul14.csv in a dataset
Loading uber-raw-data-jun14.csv in a dataset
Loading uber-raw-data-may14.csv in a dataset
Loading uber-raw-data-sep14.csv in a dataset
Cleaning up data, calculating sample and needed columns...


In [3]:
# Size of the sample, followed by a preview of its first rows
sample_row_count = len(df_sample)
print("Sample row count:", sample_row_count)
df_sample.head()

Sample row count: 226716


Unnamed: 0,Lat,Lon,day,hour
202260,40.759,-73.9728,1,21
330654,40.722,-73.996,3,13
89280,40.7935,-73.9736,2,16
603330,40.7146,-73.9667,6,13
108908,40.6271,-73.9437,2,11


In [4]:
# Coordinate columns used as clustering features
fields = ["Lat", "Lon"]

Let's first compute and plot the clusters for all the data, regardless of day and hour!

In [5]:
from sklearn.cluster import DBSCAN

# Cluster every sampled pickup at once, all days and hours together
coordinates = df_sample[fields]
db = DBSCAN(metric="manhattan", eps=0.003, min_samples=500)
db.fit(coordinates)

In [6]:
# Distinct cluster labels and the number of points carrying each label
label_summary = np.unique(db.labels_, return_counts=True)
unique, counts = label_summary
print(unique, counts)

[-1  0  1  2  3  4  5  6  7  8  9] [ 48528 162883   3638   2232    992   2008   1606    780   2655    802
    592]


In [7]:
# DBSCAN labels noise points -1. Count them explicitly instead of reading
# counts[0]: when no point is flagged as noise, counts[0] is the size of the
# first cluster, not the number of outliers.
outlier_count = int(np.count_nonzero(db.labels_ == -1))
print('Percentage of outliers is : ', round(outlier_count*100/len(db.labels_)), '%')

Percentage of outliers is :  21 %


With DBSCAN(eps=0.003, metric='manhattan', min_samples=500) we get 10 clusters and a moderate share of outliers (about 21% of the points, labelled -1).

In [8]:
# Store the cluster label of each sampled row next to its coordinates
df_sample = df_sample.assign(dbs_cluster=db.labels_)

In [9]:
# Keep only the points assigned to a cluster (label -1 marks the outliers)
clustered_points = df_sample[df_sample["dbs_cluster"] != -1]

# Map of the clustered points, coloured by cluster label, with title and size
fig = px.scatter_mapbox(
    clustered_points,
    lat="Lat",
    lon="Lon",
    color="dbs_cluster",
    mapbox_style="carto-positron",
).update_layout(
    title="Static clusters with DBScan",
    width=600,
    height=500,
)

# Render
fig.show(renderer="iframe_connected")

The plot above shows the main hotspots.
Let's generalize it to days and hours!

In [10]:
# df_day_hour[day][hour] holds an independent copy of the sample rows
# belonging to that day of week and hour
df_day_hour = [
    [
        df_sample[(df_sample.day == day) & (df_sample.hour == hour)].copy()
        for hour in range(24)
    ]
    for day in range(7)
]

# Total number of sampled rows for each day of week
day_rows_counts = [
    sum(hour_frame.shape[0] for hour_frame in hour_frames)
    for hour_frames in df_day_hour
]

In [11]:
# Bar chart of the number of sampled rows per day of week,
# with axis labels and a fixed figure size
fig = px.bar(x=range(7), y=day_rows_counts).update_layout(
    xaxis_title="DataFrame days",
    yaxis_title="Row count",
    width=600,
    height=500,
)

# Render
fig.show(renderer="iframe_connected")

Even with sampling, the data is pretty well distributed among the days of week.

Let's compute the clusters hour by hour within each day of the week.
We change the DBSCAN parameters because each day/hour slice contains far fewer points than the full sample.

In [12]:
# Day-of-week number -> English day name (0 is Monday, 6 is Sunday)
day_names = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

In [13]:
# We keep the number of clusters calculated hour per hour
calculated_cluster_names = []
calculated_cluster_numbers = []
for day in range(7):
    for hour in range(24):
        hour_frame = df_day_hour[day][hour]
        if hour_frame.empty:
            # DBSCAN.fit() rejects an empty input: record a slot with no cluster
            labels = np.empty(0, dtype=int)
        else:
            db = DBSCAN(eps=0.01, min_samples=15, metric="manhattan")
            db.fit(hour_frame[fields])
            labels = db.labels_
        # hour_frame is the very object stored in df_day_hour, so the labels
        # are written into the per-slot frame itself
        hour_frame["dbs_cluster"] = labels
        # Slot name: first two letters of the day + zero-padded hour, e.g. "Mo07"
        calculated_cluster_names.append(day_names[day][:2]+str(hour).zfill(2))
        # Count real clusters only. -1 is the noise label and is not guaranteed
        # to be present, so "number of unique labels - 1" would under-count
        # whenever every point of the slot belongs to a cluster.
        calculated_cluster_numbers.append(np.unique(labels[labels != -1]).size)
    print(f"Day {day} fit")

Day 0 fit
Day 1 fit
Day 2 fit
Day 3 fit
Day 4 fit
Day 5 fit
Day 6 fit


In [14]:
# Bar chart with one bar per day/hour slot; bar height is the number of
# clusters found for that slot
fig = px.bar(x=calculated_cluster_names, y=calculated_cluster_numbers).update_layout(
    title='Calculated clusters count per day and hour with DBScan on a sample of 5% of data.',
    xaxis_title="Day/Hour",
    yaxis_title="Clusters count",
)

# Render
fig.show(renderer="iframe_connected")

In [15]:
# One DataFrame per day of week: the 24 hourly frames stacked in hour order
df_day = [pd.concat(hourly_frames) for hourly_frames in df_day_hour]

In [16]:
# One animated map per day of week, with one animation frame per hour.
# day_names is the mapping already defined in an earlier cell; it is not
# re-declared here so that there is a single copy to maintain.
for day in range(7):
    day_frame = df_day[day]
    fig = px.scatter_mapbox(
            day_frame[day_frame.dbs_cluster >= 0],  # skip noise points (label -1)
            lat="Lat",
            lon="Lon",
            color="dbs_cluster",
            mapbox_style="carto-positron",
            animation_frame="hour"
    )

    # Add title and figure size
    fig.update_layout(title=f"Clusters for {day_names[day]}",width=600, height=500)

    # Render
    fig.show(renderer="iframe_connected")

In [24]:
# Day of week and hour to inspect
day = 3
hour = 22

# Clustered points (noise excluded) for the selected day and hour
day_frame = df_day[day]
selected_points = day_frame[(day_frame.dbs_cluster >= 0) & (day_frame.hour == hour)]

# Map of the selected slot, with title and figure size
fig = px.scatter_mapbox(
    selected_points,
    lat="Lat",
    lon="Lon",
    color="dbs_cluster",
    mapbox_style="carto-positron",
    zoom=9,
).update_layout(
    title=f"Clusters for {day_names[day]} @{hour}",
    width=600,
    height=500,
)

# Render
fig.show(renderer="iframe_connected")