In [1]:
!pip install plotly -q

In [2]:
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import pandas as pd 
import numpy as np 
import datetime as datetime
import plotly.express as px
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_sample_image 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings; warnings.simplefilter('ignore')

In [3]:
apr = pd.read_csv('uber-raw-data-apr14.csv')
may = pd.read_csv('uber-raw-data-may14.csv')
jun = pd.read_csv('uber-raw-data-jun14.csv')
jul = pd.read_csv('uber-raw-data-jul14.csv')
aug = pd.read_csv('uber-raw-data-aug14.csv')
sep = pd.read_csv('uber-raw-data-sep14.csv')

In [4]:
apr.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [5]:
apr['Date/Time']

0           4/1/2014 0:11:00
1           4/1/2014 0:17:00
2           4/1/2014 0:21:00
3           4/1/2014 0:28:00
4           4/1/2014 0:33:00
                 ...        
564511    4/30/2014 23:22:00
564512    4/30/2014 23:26:00
564513    4/30/2014 23:31:00
564514    4/30/2014 23:32:00
564515    4/30/2014 23:48:00
Name: Date/Time, Length: 564516, dtype: object

In [6]:
# On merge tous les dataset en un seul mais nous travaillerons uniquement sur un seul 
# mois car les calculs seraient beaucoup trop longs sur plus de 4M lignes

uber_final = may.append([apr, jun, jul, aug, sep], ignore_index = True)
uber_final

Unnamed: 0,Date/Time,Lat,Lon,Base
0,5/1/2014 0:02:00,40.7521,-73.9914,B02512
1,5/1/2014 0:06:00,40.6965,-73.9715,B02512
2,5/1/2014 0:15:00,40.7464,-73.9838,B02512
3,5/1/2014 0:17:00,40.7463,-74.0011,B02512
4,5/1/2014 0:17:00,40.7594,-73.9734,B02512
...,...,...,...,...
4534322,9/30/2014 22:57:00,40.7668,-73.9845,B02764
4534323,9/30/2014 22:57:00,40.6911,-74.1773,B02764
4534324,9/30/2014 22:58:00,40.8519,-73.9319,B02764
4534325,9/30/2014 22:58:00,40.7081,-74.0066,B02764


In [7]:
# Create a second column conerning the date and the hour for all the months

uber_final['Week'] =pd.to_datetime(uber_final['Date/Time']).dt.isocalendar().week
uber_final['Year'] = pd.to_datetime(uber_final['Date/Time']).dt.isocalendar().year
uber_final['Month'] = pd.to_datetime(uber_final['Date/Time']).dt.month
uber_final['Week_day'] = pd.to_datetime(uber_final['Date/Time']).dt.weekday
# The new Date column can be used thanks to the datetime format compare to 
# previous one
uber_final['Date/Time'] = pd.to_datetime(uber_final['Date/Time'])
uber_final['Date/Time'].sort_values()

687971    2014-04-01 00:00:00
687972    2014-04-01 00:00:00
979235    2014-04-01 00:00:00
871234    2014-04-01 00:01:00
687973    2014-04-01 00:02:00
                  ...        
3781160   2014-09-30 22:59:00
3781159   2014-09-30 22:59:00
3781158   2014-09-30 22:59:00
4158853   2014-09-30 22:59:00
4355992   2014-09-30 22:59:00
Name: Date/Time, Length: 4534327, dtype: datetime64[ns]

In [8]:
uber_final[["Hours","Minutes"]] = pd.DataFrame([(x.hour, x.minute) for x in uber_final["Date/Time"]])

In [9]:
uber_final.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base', 'Week', 'Year', 'Month', 'Week_day',
       'Hours', 'Minutes'],
      dtype='object')

In [10]:
# On réorganise l'ordre des colonnes pour un df plus esthétique
uber_final = uber_final.iloc[:, [0, 7, 8, 9, 4, 6, 5, 1, 2, 3]]
uber_final

Unnamed: 0,Date/Time,Week_day,Hours,Minutes,Week,Month,Year,Lat,Lon,Base
0,2014-05-01 00:02:00,3,0,2,18,5,2014,40.7521,-73.9914,B02512
1,2014-05-01 00:06:00,3,0,6,18,5,2014,40.6965,-73.9715,B02512
2,2014-05-01 00:15:00,3,0,15,18,5,2014,40.7464,-73.9838,B02512
3,2014-05-01 00:17:00,3,0,17,18,5,2014,40.7463,-74.0011,B02512
4,2014-05-01 00:17:00,3,0,17,18,5,2014,40.7594,-73.9734,B02512
...,...,...,...,...,...,...,...,...,...,...
4534322,2014-09-30 22:57:00,1,22,57,40,9,2014,40.7668,-73.9845,B02764
4534323,2014-09-30 22:57:00,1,22,57,40,9,2014,40.6911,-74.1773,B02764
4534324,2014-09-30 22:58:00,1,22,58,40,9,2014,40.8519,-73.9319,B02764
4534325,2014-09-30 22:58:00,1,22,58,40,9,2014,40.7081,-74.0066,B02764


In [11]:
# Nous allons travailler sur un seul mois pour le KMeans

uber_april = uber_final[uber_final['Month']==4]
uber_july = uber_final[uber_final['Month']==7]

uber_april

Unnamed: 0,Date/Time,Week_day,Hours,Minutes,Week,Month,Year,Lat,Lon,Base
652435,2014-04-01 00:11:00,1,0,11,14,4,2014,40.7690,-73.9549,B02512
652436,2014-04-01 00:17:00,1,0,17,14,4,2014,40.7267,-74.0345,B02512
652437,2014-04-01 00:21:00,1,0,21,14,4,2014,40.7316,-73.9873,B02512
652438,2014-04-01 00:28:00,1,0,28,14,4,2014,40.7588,-73.9776,B02512
652439,2014-04-01 00:33:00,1,0,33,14,4,2014,40.7594,-73.9722,B02512
...,...,...,...,...,...,...,...,...,...,...
1216946,2014-04-30 23:22:00,2,23,22,18,4,2014,40.7640,-73.9744,B02764
1216947,2014-04-30 23:26:00,2,23,26,18,4,2014,40.7629,-73.9672,B02764
1216948,2014-04-30 23:31:00,2,23,31,18,4,2014,40.7443,-73.9889,B02764
1216949,2014-04-30 23:32:00,2,23,32,18,4,2014,40.6756,-73.9405,B02764


In [12]:
uber_april.columns

Index(['Date/Time', 'Week_day', 'Hours', 'Minutes', 'Week', 'Month', 'Year',
       'Lat', 'Lon', 'Base'],
      dtype='object')

In [13]:
from sklearn.preprocessing import StandardScaler

# Mask réalisé pour une map sur un créneau horaire défini : 
# D'un samedi 17h à dimanche 10h au mois d'avril
# Pour ce faire, nous allons faire une analyse par kmeans pour voir si
# la répartition des clusters est plus précise que celle de DBSCAN que l'on
# verra par la suite

mask1 = ((uber_april["Week_day"] == 5) & (uber_april["Hours"] >= 17))
mask2 = ((uber_april["Week_day"] == 6) & (uber_april["Hours"] <= 10))
mask_final = (mask1 | mask2)

geo_loc = ['Lat', "Lon"]

X = uber_april.loc[mask_final, geo_loc].sample(5000)

In [14]:
from sklearn.cluster import KMeans
wcss =  []
for i in range (1,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [15]:
uber_april['Week_day'].unique()

array([1, 2, 3, 4, 5, 6, 0])

In [16]:
X

Unnamed: 0,Lat,Lon
751055,40.7629,-73.9740
752653,40.8552,-74.0680
890008,40.7178,-74.0032
1018480,40.7706,-73.9920
890339,40.7279,-73.9860
...,...,...
711899,40.7549,-73.9647
1074484,40.7653,-73.9716
1122296,40.7184,-73.9504
752525,40.7494,-73.9868


In [17]:
# Create a DataFrame that will be fed to plotly 
wcss_frame = pd.DataFrame(wcss)

# Using Plotly to visualize elbow 
import plotly.express as px 

# Creating a line plot
fig = px.line(wcss_frame, x=wcss_frame.index, y=wcss_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [18]:
# Create a DataFrame that will be fed to plotly 
wcss_frame = pd.DataFrame(wcss)

import plotly.express as px 

# Creating a line plot
fig = px.bar(wcss_frame, x=wcss_frame.index, y=wcss_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [19]:
from sklearn.metrics import silhouette_score

# Computer mean silhouette score
sil = []

for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X)))
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=2 is 0.7800663266190513
Silhouette score for K=3 is 0.759429982452895
Silhouette score for K=4 is 0.3904707437743452
Silhouette score for K=5 is 0.4380231473213882
Silhouette score for K=6 is 0.44282593635512624
Silhouette score for K=7 is 0.4556436877267635
Silhouette score for K=8 is 0.4628302574084
Silhouette score for K=9 is 0.4786135815375559
Silhouette score for K=10 is 0.48754121485485513


In [20]:
# Create a data frame 
cluster_scores=pd.DataFrame(sil, index=range(2,11))

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=cluster_scores.index, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

# Render
fig.show(renderer="iframe")
# fig.show(renderer="iframe") if using workspace

In [21]:
# Applying KMeans to the Mall dataset
kmeans = KMeans(n_clusters= 8, init= "k-means++", max_iter=300, n_init = 10, random_state=0)
y_kmeans = kmeans.fit_predict(X)

In [22]:
result = kmeans.labels_
result

array([4, 4, 0, ..., 6, 4, 0], dtype=int32)

In [23]:
X["cluster"] = kmeans.labels_
X["Hours"] = uber_april['Hours']
X["Day"] = uber_april['Week_day']

X.head()

Unnamed: 0,Lat,Lon,cluster,Hours,Day
751055,40.7629,-73.974,4,23,5
752653,40.8552,-74.068,4,10,6
890008,40.7178,-74.0032,0,18,5
1018480,40.7706,-73.992,4,18,5
890339,40.7279,-73.986,0,20,5


In [24]:
X['Hours'].unique()

array([23, 10, 18, 20,  7,  2,  4, 21, 22, 17, 19,  5,  0,  1,  9,  8,  3,
        6])

In [25]:
fig = px.scatter_mapbox(X.sort_values(by=['Day', 'Hours']), 
                        lat="Lat", 
                        lon="Lon", 
                        animation_frame = 'Hours',
                        color="cluster", 
                        zoom=3.5, 
                        mapbox_style = "carto-positron",
                        
                    )
fig.show('iframe')

In [26]:
# Prenons une semaine plus active : celle de l'Independence Day(1st July week) => week27
mask_jul = (uber_july['Week'] == 27)

X1 = uber_july.loc[mask_jul, ['Lat', "Lon"]].sample(5000)

In [27]:
X1

Unnamed: 0,Lat,Lon
2489865,40.7571,-73.9876
2171411,40.7516,-73.9839
1935048,40.7394,-73.9941
2167581,40.7271,-73.9997
1936953,40.7508,-73.9699
...,...,...
1929791,40.7520,-73.9789
2172193,40.7594,-73.9859
2492485,40.7684,-73.9671
1918461,40.7405,-74.0044


In [28]:
wcss1 =  []
for i in range (1,15): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X1)
    wcss1.append(kmeans.inertia_)

In [29]:
wcss1_frame = pd.DataFrame(wcss1)

fig = px.line(wcss_frame, x=wcss1_frame.index, y=wcss1_frame.iloc[:, -1])

fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

fig.show(renderer="iframe")

In [30]:
wcss1_frame = pd.DataFrame(wcss1)

fig = px.bar(wcss1_frame, x=wcss1_frame.index, y=wcss1_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [31]:
sil1 = []

for i in range (2,15): 
    kmeans1 = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans1.fit(X1)
    sil1.append(silhouette_score(X1, kmeans1.predict(X1)))
    print("Silhouette score for K={} is {}".format(i, sil1[-1]))

Silhouette score for K=2 is 0.7126450534152022
Silhouette score for K=3 is 0.357199015538787
Silhouette score for K=4 is 0.38649952249028857
Silhouette score for K=5 is 0.38967138713929833
Silhouette score for K=6 is 0.405666157830119
Silhouette score for K=7 is 0.43118013162356966
Silhouette score for K=8 is 0.4381112377511442
Silhouette score for K=9 is 0.4387143067623334
Silhouette score for K=10 is 0.44652820694264517
Silhouette score for K=11 is 0.46389256115900296
Silhouette score for K=12 is 0.4516847041636177
Silhouette score for K=13 is 0.4280563538845472
Silhouette score for K=14 is 0.42915190830423305


In [32]:
cluster_scores1 = pd.DataFrame(sil1, index=range(2,15))

fig = px.bar(data_frame=cluster_scores1,  
             x=cluster_scores1.index, 
             y=cluster_scores1.iloc[:, -1]
            )

fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

fig.show(renderer="iframe")

In [33]:
kmeans1 = KMeans(n_clusters= 12, init= "k-means++", max_iter=300, n_init = 10, random_state=0)
y_kmeans1 = kmeans1.fit_predict(X1)

In [34]:
result1 = kmeans1.labels_
result1

array([0, 0, 6, ..., 0, 6, 4], dtype=int32)

In [35]:
X1["cluster"] = kmeans1.labels_
X1["Hours"] = uber_july['Hours']
X1["Day"] = uber_july['Week_day']
X1.head()

Unnamed: 0,Lat,Lon,cluster,Hours,Day
2489865,40.7571,-73.9876,0,20,3
2171411,40.7516,-73.9839,0,14,2
1935048,40.7394,-73.9941,6,14,3
2167581,40.7271,-73.9997,6,21,1
1936953,40.7508,-73.9699,0,18,3


In [36]:
fig = px.scatter_mapbox(X1.sort_values(by=['Day', 'Hours']), 
                        lat="Lat", 
                        lon="Lon", 
                        color="cluster",                         
                        animation_frame = 'Day',
                        zoom=3.5, 
                        mapbox_style = "carto-positron",
                        
                    )
fig.show('iframe')

In [37]:
# import DBSCAN from sklearn and numpy
from sklearn.cluster import DBSCAN
import numpy as np
mask_db = (uber_july['Week'] == 27) & (uber_july['Hours'] >=17)
geo_loc_db = ['Lat', 'Lon']
X_db = uber_july.loc[mask_db, geo_loc_db]#.sample(n=10000)
# Instanciate DBSCAN

In [38]:
db = DBSCAN(eps = 0.0012, min_samples = 120, metric = 'manhattan', algorithm = 'brute')
# Fit on data
## No need to normalize data, it already is!
db.fit(X_db)

DBSCAN(algorithm='brute', eps=0.0012, metric='manhattan', min_samples=120)

In [39]:
X_db.value_counts()

Lat      Lon     
40.6448  -73.7820    34
         -73.7818    30
         -73.7819    29
         -73.7821    28
40.7741  -73.8726    26
                     ..
40.7528  -73.9797     1
         -73.9795     1
         -73.9793     1
         -73.9792     1
39.9419  -74.0734     1
Length: 36147, dtype: int64

In [40]:
X_db['cluster'] = db.labels_
X_db['Day'] = uber_july['Week_day']
X_db['Hours'] = uber_july['Hours']
labels = db.labels_
unique_lab = set(X_db['cluster'])
nb_cluster = len(unique_lab)
n_noise_ = list(labels).count(-1)


print(f'Nombres de clusters:{nb_cluster}')
print((f'Nombres de bruits:{n_noise_}'))

Nombres de clusters:22
Nombres de bruits:37860


In [41]:
fig = px.scatter_mapbox(X_db.sort_values(by=['Day', 'Hours'])[X_db.cluster != -1],
                        lat='Lat',
                        lon='Lon',
                        color='cluster',
                        animation_frame = 'Day',
                        zoom=9,
                        mapbox_style = 'carto-positron')

fig.show(renderer='iframe')

In [42]:
fig = px.scatter_mapbox(X_db.sort_values(by=['Day', 'Hours'])[X_db.cluster != -1],
                        lat='Lat',
                        lon='Lon',
                        color='cluster',
                        animation_frame = 'Hours',
                        zoom=9,
                        mapbox_style = 'carto-positron')

fig.show(renderer='iframe')