![title](uber.jpeg)

In [1]:
!pip install plotly -q

In [2]:
import plotly.graph_objects as go
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import seaborn as sns
import pandas as pd 
import numpy as np 
import datetime as datetime
import plotly.express as px
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_sample_image 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import warnings; warnings.simplefilter('ignore')

In [3]:
apr = pd.read_csv('uber-raw-data-apr14.csv')
may = pd.read_csv('uber-raw-data-may14.csv')
jun = pd.read_csv('uber-raw-data-jun14.csv')
jul = pd.read_csv('uber-raw-data-jul14.csv')
aug = pd.read_csv('uber-raw-data-aug14.csv')
sep = pd.read_csv('uber-raw-data-sep14.csv')

In [4]:
apr.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


In [5]:
apr['Date/Time']

0           4/1/2014 0:11:00
1           4/1/2014 0:17:00
2           4/1/2014 0:21:00
3           4/1/2014 0:28:00
4           4/1/2014 0:33:00
                 ...        
564511    4/30/2014 23:22:00
564512    4/30/2014 23:26:00
564513    4/30/2014 23:31:00
564514    4/30/2014 23:32:00
564515    4/30/2014 23:48:00
Name: Date/Time, Length: 564516, dtype: object

In [6]:
# On merge tous les dataset en un seul mais nous travaillerons uniquement sur un seul 
# mois car les calculs seraient beaucoup trop longs sur plus de 4M lignes

uber_final = may.append([apr, jun, jul, aug, sep], ignore_index = True)
uber_final

# Nous allons travailler sur un seul mois pour le KMeans

uber = apr

In [7]:
# Create a second column conerning the date and the hour for all the months

uber_final['Week'] =pd.to_datetime(uber_final['Date/Time']).dt.isocalendar().week
uber_final['Year'] = pd.to_datetime(uber_final['Date/Time']).dt.isocalendar().year
uber_final['Month'] = pd.to_datetime(uber_final['Date/Time']).dt.month
uber_final['Week_day'] = pd.to_datetime(uber_final['Date/Time']).dt.weekday
# The new Date column can be used thanks to the datetime format compare to 
# previous one
uber_final['Date/Time'] = pd.to_datetime(uber_final['Date/Time'])
uber_final['Date/Time'].sort_values()

687971    2014-04-01 00:00:00
687972    2014-04-01 00:00:00
979235    2014-04-01 00:00:00
871234    2014-04-01 00:01:00
687973    2014-04-01 00:02:00
                  ...        
3781160   2014-09-30 22:59:00
3781159   2014-09-30 22:59:00
3781158   2014-09-30 22:59:00
4158853   2014-09-30 22:59:00
4355992   2014-09-30 22:59:00
Name: Date/Time, Length: 4534327, dtype: datetime64[ns]

In [8]:
uber_final[["Hours","Minutes"]] = pd.DataFrame([(x.hour, x.minute) for x in uber_final["Date/Time"]])

In [9]:
uber_final.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base', 'Week', 'Year', 'Month', 'Week_day',
       'Hours', 'Minutes'],
      dtype='object')

In [10]:
# On réorganise l'ordre des colonnes pour un df plus esthétique
uber_final = uber_final.iloc[:, [0, 7, 8, 9, 4, 6, 5, 1, 2, 3]]
uber_final

Unnamed: 0,Date/Time,Week_day,Hours,Minutes,Week,Month,Year,Lat,Lon,Base
0,2014-05-01 00:02:00,3,0,2,18,5,2014,40.7521,-73.9914,B02512
1,2014-05-01 00:06:00,3,0,6,18,5,2014,40.6965,-73.9715,B02512
2,2014-05-01 00:15:00,3,0,15,18,5,2014,40.7464,-73.9838,B02512
3,2014-05-01 00:17:00,3,0,17,18,5,2014,40.7463,-74.0011,B02512
4,2014-05-01 00:17:00,3,0,17,18,5,2014,40.7594,-73.9734,B02512
...,...,...,...,...,...,...,...,...,...,...
4534322,2014-09-30 22:57:00,1,22,57,40,9,2014,40.7668,-73.9845,B02764
4534323,2014-09-30 22:57:00,1,22,57,40,9,2014,40.6911,-74.1773,B02764
4534324,2014-09-30 22:58:00,1,22,58,40,9,2014,40.8519,-73.9319,B02764
4534325,2014-09-30 22:58:00,1,22,58,40,9,2014,40.7081,-74.0066,B02764


In [11]:
# Create a second column conerning the date and the hour only for April

uber['Week'] =pd.to_datetime(uber['Date/Time']).dt.isocalendar().week
uber['Year'] = pd.to_datetime(uber['Date/Time']).dt.isocalendar().year
uber['Month'] = pd.to_datetime(uber['Date/Time']).dt.month
uber['Week_day'] = pd.to_datetime(uber['Date/Time']).dt.weekday
# The new Date column can be used thanks to the datetime format compare to 
# previous one
uber['Date/Time'] = pd.to_datetime(uber['Date/Time'])
uber['Date/Time'].sort_values()

326800   2014-04-01 00:00:00
35537    2014-04-01 00:00:00
35536    2014-04-01 00:00:00
218799   2014-04-01 00:01:00
35538    2014-04-01 00:02:00
                 ...        
35535    2014-04-30 23:59:00
554607   2014-04-30 23:59:00
554606   2014-04-30 23:59:00
554605   2014-04-30 23:59:00
554604   2014-04-30 23:59:00
Name: Date/Time, Length: 564516, dtype: datetime64[ns]

In [12]:
uber[["Hours","Minutes"]] = pd.DataFrame([(x.hour, x.minute) for x in uber["Date/Time"]])

In [13]:
uber.columns

Index(['Date/Time', 'Lat', 'Lon', 'Base', 'Week', 'Year', 'Month', 'Week_day',
       'Hours', 'Minutes'],
      dtype='object')

In [14]:
# On réorganise l'ordre des colonnes pour un df plus esthétique
uber = uber.iloc[:, [0, 7, 8, 9, 4, 6, 5, 1, 2, 3]]
uber

Unnamed: 0,Date/Time,Week_day,Hours,Minutes,Week,Month,Year,Lat,Lon,Base
0,2014-04-01 00:11:00,1,0,11,14,4,2014,40.7690,-73.9549,B02512
1,2014-04-01 00:17:00,1,0,17,14,4,2014,40.7267,-74.0345,B02512
2,2014-04-01 00:21:00,1,0,21,14,4,2014,40.7316,-73.9873,B02512
3,2014-04-01 00:28:00,1,0,28,14,4,2014,40.7588,-73.9776,B02512
4,2014-04-01 00:33:00,1,0,33,14,4,2014,40.7594,-73.9722,B02512
...,...,...,...,...,...,...,...,...,...,...
564511,2014-04-30 23:22:00,2,23,22,18,4,2014,40.7640,-73.9744,B02764
564512,2014-04-30 23:26:00,2,23,26,18,4,2014,40.7629,-73.9672,B02764
564513,2014-04-30 23:31:00,2,23,31,18,4,2014,40.7443,-73.9889,B02764
564514,2014-04-30 23:32:00,2,23,32,18,4,2014,40.6756,-73.9405,B02764


In [15]:
from sklearn.preprocessing import StandardScaler

# Mask réalisé pour une map sur un créneau horaire défini : 
# D'un samedi 17h à dimanche 10h au mois d'avril
# Pour ce faire, nous allons faire une analyse par kmeans pour voir si
# la répartition des clusters est plus précise que celle de DBSCAN que l'on
# verra par la suite

mask1 = ((uber["Week_day"] == 5) & (uber["Hours"] >= 17))
mask2 = ((uber["Week_day"] == 6) & (uber["Hours"] <= 10))
mask_final = (mask1 | mask2)

geo_loc = ['Lat', "Lon"]

X = uber.loc[mask_final, geo_loc].sample(5000)

In [16]:
from sklearn.cluster import KMeans
wcss =  []
for i in range (1,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [17]:
uber['Week_day'].unique()

array([1, 2, 3, 4, 5, 6, 0])

In [18]:
X

Unnamed: 0,Lat,Lon
99249,40.7239,-73.9851
134836,40.7314,-73.9888
60137,40.7786,-73.9821
308955,40.7422,-73.9972
421393,40.7275,-73.9907
...,...,...
183114,40.6681,-73.9840
181856,40.6927,-73.9921
22332,40.7271,-73.9785
520008,40.7514,-73.9876


In [19]:
# Create a DataFrame that will be fed to plotly 
wcss_frame = pd.DataFrame(wcss)

# Using Plotly to visualize elbow 
import plotly.express as px 

# Creating a line plot
fig = px.line(wcss_frame, x=wcss_frame.index, y=wcss_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [20]:
# Create a DataFrame that will be fed to plotly 
wcss_frame = pd.DataFrame(wcss)

import plotly.express as px 

# Creating a line plot
fig = px.bar(wcss_frame, x=wcss_frame.index, y=wcss_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [21]:
from sklearn.metrics import silhouette_score

# Computer mean silhouette score
sil = []

for i in range (2,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X)
    sil.append(silhouette_score(X, kmeans.predict(X)))
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Silhouette score for K=2 is 0.7380785153058287
Silhouette score for K=3 is 0.39217040046961205
Silhouette score for K=4 is 0.4530312900199571
Silhouette score for K=5 is 0.4643910220202683
Silhouette score for K=6 is 0.4666350754357009
Silhouette score for K=7 is 0.4812711068624159
Silhouette score for K=8 is 0.4870057393088072
Silhouette score for K=9 is 0.4121113708605726
Silhouette score for K=10 is 0.4128329398940197


In [22]:
# Create a data frame 
cluster_scores=pd.DataFrame(sil, index=range(2,11))

# Create figure
fig = px.bar(data_frame=cluster_scores,  
             x=cluster_scores.index, 
             y=cluster_scores.iloc[:, -1]
            )

# Add title and axis labels
fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

# Render
fig.show(renderer="iframe")
# fig.show(renderer="iframe") if using workspace

In [23]:
# Applying KMeans to the Mall dataset
kmeans = KMeans(n_clusters= 5, init= "k-means++", max_iter=300, n_init = 10, random_state=0)
y_kmeans = kmeans.fit_predict(X)

In [24]:
result = kmeans.labels_
result

array([4, 4, 2, ..., 4, 4, 1], dtype=int32)

In [25]:
X.columns

Index(['Lat', 'Lon'], dtype='object')

In [26]:
X["cluster"] = kmeans.labels_
X["Hours"] = uber['Hours']
X["Day"] = uber['Week_day']

X.head()

Unnamed: 0,Lat,Lon,cluster,Hours,Day
99249,40.7239,-73.9851,4,0,6
134836,40.7314,-73.9888,4,18,5
60137,40.7786,-73.9821,2,20,5
308955,40.7422,-73.9972,4,18,5
421393,40.7275,-73.9907,4,21,5


In [27]:
X['Hours'].unique()

array([ 0, 18, 20, 21,  7, 19, 23, 10,  3, 17,  5,  1, 22,  8,  6,  2,  9,
        4])

In [28]:
fig = px.scatter_mapbox(X.sort_values(by=['Day', 'Hours']), 
                        lat="Lat", 
                        lon="Lon", 
                        animation_frame = 'Hours',
                        color="cluster", 
                        zoom=3.5, 
                        mapbox_style = "carto-positron",
                        
                    )
fig.show('iframe')

In [29]:
# Prenons une semaine plus active : celle de l'Independence Day(1st July week) => week27
mask_jul = (uber_final['Week'] == 27)

X1 = uber_final.loc[mask_jul, ['Lat', "Lon"]].sample(5000)

In [30]:
X1

Unnamed: 0,Lat,Lon
1945806,40.6435,-73.7898
1871603,40.6737,-73.9418
2178953,40.7664,-73.9683
1933723,40.7639,-73.9733
1486115,40.7473,-73.9856
...,...,...
2477889,40.7356,-73.9796
2187876,40.7282,-73.9820
2178651,40.7810,-73.9821
1248906,40.7727,-73.9459


In [31]:
wcss1 =  []
for i in range (1,11): 
    kmeans = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans.fit(X1)
    wcss1.append(kmeans.inertia_)

In [32]:
wcss1_frame = pd.DataFrame(wcss1)

fig = px.line(wcss_frame, x=wcss1_frame.index, y=wcss1_frame.iloc[:, -1])

fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

fig.show(renderer="iframe")

In [33]:
wcss1_frame = pd.DataFrame(wcss1)

fig = px.bar(wcss1_frame, x=wcss1_frame.index, y=wcss1_frame.iloc[:, -1])

# Creating layouts 
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia"
)

# Render in notebook
fig.show(renderer="iframe")

In [34]:
sil1 = []

for i in range (2,11): 
    kmeans1 = KMeans(n_clusters= i, init = "k-means++", random_state = 0)
    kmeans1.fit(X1)
    sil1.append(silhouette_score(X1, kmeans1.predict(X1)))
    print("Silhouette score for K={} is {}".format(i, sil1[-1]))

Silhouette score for K=2 is 0.7273333284266379
Silhouette score for K=3 is 0.36265038028405927
Silhouette score for K=4 is 0.3926378765177171
Silhouette score for K=5 is 0.3978390127432826
Silhouette score for K=6 is 0.41581452647446787
Silhouette score for K=7 is 0.43847478139044377
Silhouette score for K=8 is 0.4419114716417411
Silhouette score for K=9 is 0.45066633806093526
Silhouette score for K=10 is 0.4679796017282338


In [35]:
cluster_scores1 = pd.DataFrame(sil1, index=range(2,11))

fig = px.bar(data_frame=cluster_scores1,  
             x=cluster_scores1.index, 
             y=cluster_scores1.iloc[:, -1]
            )

fig.update_layout(
    yaxis_title="Silhouette Score",
    xaxis_title="# Clusters",
    title="Silhouette Score per cluster"
)

fig.show(renderer="iframe")

In [36]:
kmeans1 = KMeans(n_clusters= 5, init= "k-means++", max_iter=300, n_init = 10, random_state=0)
y_kmeans1 = kmeans1.fit_predict(X1)

In [37]:
result1 = kmeans1.labels_
result1

array([2, 0, 1, ..., 1, 1, 0], dtype=int32)

In [38]:
X1.columns

Index(['Lat', 'Lon'], dtype='object')

In [39]:
X1

Unnamed: 0,Lat,Lon
1945806,40.6435,-73.7898
1871603,40.6737,-73.9418
2178953,40.7664,-73.9683
1933723,40.7639,-73.9733
1486115,40.7473,-73.9856
...,...,...
2477889,40.7356,-73.9796
2187876,40.7282,-73.9820
2178651,40.7810,-73.9821
1248906,40.7727,-73.9459


In [40]:
X1["cluster"] = kmeans1.labels_
X1["Hours"] = uber_final['Hours']
X1["Day"] = uber_final['Week_day']
X1.head()

Unnamed: 0,Lat,Lon,cluster,Hours,Day
1945806,40.6435,-73.7898,2,17,5
1871603,40.6737,-73.9418,0,22,0
2178953,40.7664,-73.9683,1,11,3
1933723,40.7639,-73.9733,1,10,3
1486115,40.7473,-73.9856,0,0,0


In [41]:
fig = px.scatter_mapbox(X1.sort_values(by=['Day', 'Hours']), 
                        lat="Lat", 
                        lon="Lon", 
                        color="cluster",                         
                        animation_frame = 'Day',
                        zoom=3.5, 
                        mapbox_style = "carto-positron",
                        
                    )
fig.show('iframe')

In [42]:
# import DBSCAN from sklearn and numpy
from sklearn.cluster import DBSCAN
import numpy as np
mask_db = (uber_final['Week'] == 27) & (uber_final['Hours'] >=17)
geo_loc_db = ['Lat', 'Lon']
X_db = uber_final.loc[mask_db, geo_loc_db]#.sample(n=10000)
# Instanciate DBSCAN

In [43]:
db = DBSCAN(eps = 0.0012, min_samples = 145, metric = 'manhattan', algorithm = 'brute')
# Fit on data
## No need to normalize data, it already is!
db.fit(X_db)


DBSCAN(algorithm='brute', eps=0.0012, metric='manhattan', min_samples=145)

In [44]:
X_db['cluster'] = db.labels_
X_db['Day'] = uber_final['Week_day']
X_db['Hours'] = uber_final['Hours']
labels = db.labels_
unique_lab = set(X_db['cluster'])
nb_cluster = len(unique_lab)
n_noise_ = list(labels).count(-1)

print(f'Nombres de clusters:{nb_cluster}')
print((f'Nombres de bruits:{n_noise_}'))

Nombres de clusters:19
Nombres de bruits:43691


In [45]:
fig = px.scatter_mapbox(X_db.sort_values(by=['Day', 'Hours'])[X_db.cluster != -1],
                        lat='Lat',
                        lon='Lon',
                        color='cluster',
                        animation_frame = 'Day',
                        zoom=9,
                        mapbox_style = 'carto-positron')

fig.show(renderer='iframe')

In [46]:
fig = px.scatter_mapbox(X_db.sort_values(by=['Day', 'Hours'])[X_db.cluster != -1],
                        lat='Lat',
                        lon='Lon',
                        color='cluster',
                        animation_frame = 'Hours',
                        zoom=9,
                        mapbox_style = 'carto-positron')

fig.show(renderer='iframe')