# IMPORTATIONS

In [46]:
# Import les librairies :
import pandas as pd
import numpy as np

from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import  silhouette_score

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

In [47]:
# Load the six monthly Uber pickup files (April to September 2014) and
# stack them into one DataFrame with a fresh 0..n-1 index.
fichiers = [
    'uber-raw-data-apr14.csv',
    'uber-raw-data-may14.csv',
    'uber-raw-data-jun14.csv',
    'uber-raw-data-jul14.csv',
    'uber-raw-data-aug14.csv',
    'uber-raw-data-sep14.csv',
]
datasets = list(map(pd.read_csv, fichiers))
dataset = pd.concat(datasets, ignore_index=True)
dataset.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


# BASIC STATS

In [48]:
# Quick profile of the raw data: dimensions, preview, summary statistics
# and the percentage of missing values per column.
n_lignes, n_colonnes = dataset.shape

print(f"Nombre de lignes : {n_lignes}")
print()

print(f"Nombre de colonnes : {n_colonnes}")
print()

print("Aperçu du dataset : ")
display(dataset.head())
print()

print("Statistiques basiques : ")
# include='all' so the non-numeric columns are summarised as well.
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Pourcentage de valeurs manquantes : ")
display(100 * dataset.isnull().sum() / n_lignes)

Nombre de lignes : 4534327

Nombre de colonnes : 4

Aperçu du dataset : 


Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512



Statistiques basiques : 


Unnamed: 0,Date/Time,Lat,Lon,Base
count,4534327,4534327.0,4534327.0,4534327
unique,260093,,,5
top,4/7/2014 20:21:00,,,B02617
freq,97,,,1458853
mean,,40.73926,-73.97302,
std,,0.03994991,0.0572667,
min,,39.6569,-74.929,
25%,,40.7211,-73.9965,
50%,,40.7422,-73.9834,
75%,,40.761,-73.9653,



Pourcentage de valeurs manquantes : 


Date/Time    0.0
Lat          0.0
Lon          0.0
Base         0.0
dtype: float64

# DATA CLEANING

In [49]:
# Parse the 'Date/Time' strings (month/day/year hour:minute:second) into datetimes.
dataset['Date/Time'] = pd.to_datetime(
    dataset['Date/Time'],
    format='%m/%d/%Y %H:%M:%S',
)

# Derive calendar features from the parsed timestamp.
horodatage = dataset['Date/Time'].dt
dataset['Year'] = horodatage.year
dataset['Month'] = horodatage.month
dataset['Day'] = horodatage.day
dataset['Day_of_week'] = horodatage.dayofweek
dataset['Day_of_year'] = horodatage.dayofyear
# NOTE: 'Day_of_month' holds the same values as 'Day' (both come from .dt.day).
dataset['Day_of_month'] = horodatage.day
dataset['Week_of_year'] = horodatage.isocalendar().week
dataset['hour'] = horodatage.hour

dataset.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour
0,2014-04-01 00:11:00,40.769,-73.9549,B02512,2014,4,1,1,91,1,14,0
1,2014-04-01 00:17:00,40.7267,-74.0345,B02512,2014,4,1,1,91,1,14,0
2,2014-04-01 00:21:00,40.7316,-73.9873,B02512,2014,4,1,1,91,1,14,0
3,2014-04-01 00:28:00,40.7588,-73.9776,B02512,2014,4,1,1,91,1,14,0
4,2014-04-01 00:33:00,40.7594,-73.9722,B02512,2014,4,1,1,91,1,14,0


In [50]:
# The full dataset has several million rows, so the rest of the analysis
# works on a random sample of 10,000 pickups.
# The seed is fixed so that the sample, and therefore every cluster label
# discussed in the commentary, is the same on each Restart & Run All.
dataset_sample = dataset.sample(n=10000, random_state=0)
dataset_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour
2478428,2014-07-02 08:07:00,40.7592,-73.9946,B02682,2014,7,2,2,183,2,27,8
142063,2014-04-21 12:05:00,40.7759,-73.9614,B02598,2014,4,21,0,111,21,17,12
3884120,2014-09-08 22:56:00,40.7104,-74.0115,B02617,2014,9,8,0,251,8,37,22
2302346,2014-07-17 15:40:00,40.7579,-73.9895,B02617,2014,7,17,3,198,17,29,15
3899476,2014-09-10 07:40:00,40.778,-73.9524,B02617,2014,9,10,2,253,10,37,7


# VISUALIZATION

In [51]:
# Map of the sampled pickups, coloured by the 'Day_of_week' feature
# (taken from .dt.dayofweek, so Monday=0 ... Sunday=6).
fig = px.scatter_mapbox(dataset_sample, lat="Lat", lon="Lon", color="Day_of_week")

# Use the OpenStreetMap tiles as base map.
fig.update_layout(mapbox={"style": "open-street-map"})

fig.show()

# KMeans

In [52]:
# First KMeans experiment: cluster on the two location features only.
# Latitude and longitude are standardised before clustering.
sc = StandardScaler()
X_1 = sc.fit_transform(dataset_sample[['Lat', 'Lon']])


In [53]:
# Elbow method: record the within-cluster sum of squares (inertia) for
# k = 2..10 on the standardised (Lat, Lon) features.
# n_init is set explicitly: the library default differs between
# scikit-learn versions, and leaving it implicit makes the curve
# version-dependent.
wcss = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_1)
    wcss.append(kmeans.inertia_)





















In [54]:
# Elbow curve: inertia as a function of the number of clusters.
# Title and axis labels are set so the figure can be read on its own.
px.line(
    x=list(range(2, 11)),
    y=wcss,
    labels={"x": "Nombre de clusters (k)", "y": "WCSS (inertie)"},
    title="Méthode Elbow : inertie en fonction de k (Lat, Lon)",
)

In [55]:
# Silhouette method: mean silhouette score for k = 2..10 on the
# standardised (Lat, Lon) features (higher is better).
# n_init is set explicitly so the scores do not depend on the
# scikit-learn version's default.
silhouette_scores = []

for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_1)
    silhouette_scores.append(silhouette_score(X_1, kmeans.predict(X_1)))

print(silhouette_scores)





















[0.7147729697992128, 0.401493929244503, 0.4188597230797429, 0.42086825138179895, 0.42948219945974103, 0.445168203481843, 0.4612554830796267, 0.4635466226000763, 0.42305635525636853]


In [56]:
# Bar chart of the silhouette score for each candidate number of clusters.
# Title and axis labels are set so the figure can be read on its own.
px.bar(
    x=list(range(2, 11)),
    y=silhouette_scores,
    labels={"x": "Nombre de clusters (k)", "y": "Score silhouette"},
    title="Score silhouette en fonction de k (Lat, Lon)",
)

In [57]:
# Fit the final KMeans model on the (Lat, Lon) features with K = 7 clusters.
# n_init is set explicitly so the fitted model does not depend on the
# scikit-learn version's default.
kmeans = KMeans(n_clusters=7, n_init=10, random_state=0)
kmeans.fit(X_1)





In [58]:
# Store the KMeans (Lat, Lon) cluster assignment of every sampled pickup
# in a new 'Cluster_KMeans_1' column.
labels_kmeans_1 = kmeans.predict(X_1)
dataset_sample['Cluster_KMeans_1'] = labels_kmeans_1
dataset_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour,Cluster_KMeans_1
2478428,2014-07-02 08:07:00,40.7592,-73.9946,B02682,2014,7,2,2,183,2,27,8,0
142063,2014-04-21 12:05:00,40.7759,-73.9614,B02598,2014,4,21,0,111,21,17,12,0
3884120,2014-09-08 22:56:00,40.7104,-74.0115,B02617,2014,9,8,0,251,8,37,22,3
2302346,2014-07-17 15:40:00,40.7579,-73.9895,B02617,2014,7,17,3,198,17,29,15,0
3899476,2014-09-10 07:40:00,40.778,-73.9524,B02617,2014,9,10,2,253,10,37,7,0


In [59]:
# Map of the KMeans (Lat, Lon) clusters.
# Cluster ids are nominal labels, not quantities: they are cast to strings
# on a temporary copy so Plotly draws one discrete colour per cluster
# instead of a continuous colour bar. dataset_sample itself is not modified.
points_kmeans_1 = dataset_sample.assign(
    Cluster_KMeans_1=dataset_sample['Cluster_KMeans_1'].astype(str)
)
fig = px.scatter_mapbox(
    points_kmeans_1,
    lat='Lat',
    lon='Lon',
    color='Cluster_KMeans_1',
    # Keep the legend in numeric cluster order rather than order of appearance.
    category_orders={
        'Cluster_KMeans_1': sorted(points_kmeans_1['Cluster_KMeans_1'].unique(), key=int)
    },
    title='Hotspots pour les retraits Uber',
    mapbox_style='carto-positron',
)
fig.show()

Les zones les plus fréquentées se situent autour de Manhattan (clusters 0 et 3) et dans une partie de Brooklyn (cluster 2). Les autres points sont dispersés sur le reste des zones.

In [60]:
# Second KMeans experiment: location plus day of week.
# The three features are standardised before clustering.
sc = StandardScaler()
X_2 = sc.fit_transform(dataset_sample[['Lat', 'Lon', 'Day_of_week']])


In [61]:
# Elbow method: record the within-cluster sum of squares (inertia) for
# k = 2..10 on the standardised (Lat, Lon, Day_of_week) features.
# n_init is set explicitly: the library default differs between
# scikit-learn versions, and leaving it implicit makes the curve
# version-dependent.
wcss = []
for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_2)
    wcss.append(kmeans.inertia_)





















In [62]:
# Elbow curve: inertia as a function of the number of clusters.
# Title and axis labels are set so the figure can be read on its own.
px.line(
    x=list(range(2, 11)),
    y=wcss,
    labels={"x": "Nombre de clusters (k)", "y": "WCSS (inertie)"},
    title="Méthode Elbow : inertie en fonction de k (Lat, Lon, Day_of_week)",
)

In [63]:
# Silhouette method: mean silhouette score for k = 2..10 on the
# standardised (Lat, Lon, Day_of_week) features (higher is better).
# n_init is set explicitly so the scores do not depend on the
# scikit-learn version's default.
silhouette_scores = []

for i in range(2, 11):
    kmeans = KMeans(n_clusters=i, n_init=10, random_state=0)
    kmeans.fit(X_2)
    silhouette_scores.append(silhouette_score(X_2, kmeans.predict(X_2)))

print(silhouette_scores)





















[0.32584884287819016, 0.35818513007508357, 0.31633211472981027, 0.34703109270385474, 0.26396967175286107, 0.27851595229731996, 0.29272398202342736, 0.292226887107758, 0.2824623716608582]


In [64]:
# Bar chart of the silhouette score for each candidate number of clusters.
# Title and axis labels are set so the figure can be read on its own.
px.bar(
    x=list(range(2, 11)),
    y=silhouette_scores,
    labels={"x": "Nombre de clusters (k)", "y": "Score silhouette"},
    title="Score silhouette en fonction de k (Lat, Lon, Day_of_week)",
)

In [65]:
# Fit the final KMeans model on the (Lat, Lon, Day_of_week) features with
# K = 3 clusters.
# n_init is set explicitly so the fitted model does not depend on the
# scikit-learn version's default.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0)
kmeans.fit(X_2)





In [66]:
# Store the KMeans (Lat, Lon, Day_of_week) cluster assignment of every
# sampled pickup in a new 'Cluster_KMeans_2' column.
labels_kmeans_2 = kmeans.predict(X_2)
dataset_sample['Cluster_KMeans_2'] = labels_kmeans_2
dataset_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour,Cluster_KMeans_1,Cluster_KMeans_2
2478428,2014-07-02 08:07:00,40.7592,-73.9946,B02682,2014,7,2,2,183,2,27,8,0,0
142063,2014-04-21 12:05:00,40.7759,-73.9614,B02598,2014,4,21,0,111,21,17,12,0,0
3884120,2014-09-08 22:56:00,40.7104,-74.0115,B02617,2014,9,8,0,251,8,37,22,3,0
2302346,2014-07-17 15:40:00,40.7579,-73.9895,B02617,2014,7,17,3,198,17,29,15,0,1
3899476,2014-09-10 07:40:00,40.778,-73.9524,B02617,2014,9,10,2,253,10,37,7,0,0


In [67]:
# Animated map of the KMeans (Lat, Lon, Day_of_week) clusters:
# one animation frame per value of 'Day_of_week'.
fig = px.scatter_mapbox(
    dataset_sample,
    lon='Lon',
    lat='Lat',
    color='Cluster_KMeans_2',
    animation_frame='Day_of_week',
    mapbox_style='carto-positron',
    title="Hotspots pour les retraits Uber selon le jour de la semaine",
)
fig.show()

En ajoutant la feature 'Day_of_week', KMeans donne 3 clusters. Les zones les plus fréquentées restent les mêmes, réparties entre Manhattan et Brooklyn.

# DBSCAN 

In [68]:
# First DBSCAN experiment: cluster on the two location features only.
# Latitude and longitude are standardised before clustering.
sc = StandardScaler()
X_3 = sc.fit_transform(dataset_sample[['Lat', 'Lon']])

In [69]:
# DBSCAN on the standardised (Lat, Lon) features, using the Manhattan
# distance. eps and min_samples are expressed in the scaled feature space.
db = DBSCAN(
    metric="manhattan",
    eps=0.2,
    min_samples=100,
    algorithm='auto',
)
db.fit(X_3)

In [70]:
# Distinct labels assigned by DBSCAN (the -1 label is filtered out as
# outliers when the clusters are mapped).
labels_uniques_1 = np.unique(db.labels_)
labels_uniques_1

array([-1,  0,  1,  2,  3,  4], dtype=int64)

In [71]:
# Store the DBSCAN (Lat, Lon) label of every sampled pickup in a new
# 'db_cluster_1' column.
labels_db_1 = db.labels_
dataset_sample["db_cluster_1"] = labels_db_1
dataset_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour,Cluster_KMeans_1,Cluster_KMeans_2,db_cluster_1
2478428,2014-07-02 08:07:00,40.7592,-73.9946,B02682,2014,7,2,2,183,2,27,8,0,0,0
142063,2014-04-21 12:05:00,40.7759,-73.9614,B02598,2014,4,21,0,111,21,17,12,0,0,0
3884120,2014-09-08 22:56:00,40.7104,-74.0115,B02617,2014,9,8,0,251,8,37,22,3,0,0
2302346,2014-07-17 15:40:00,40.7579,-73.9895,B02617,2014,7,17,3,198,17,29,15,0,1,0
3899476,2014-09-10 07:40:00,40.778,-73.9524,B02617,2014,9,10,2,253,10,37,7,0,0,0


In [72]:
# Number of distinct DBSCAN labels in the sample (the -1 label counts as one).
dataset_sample.db_cluster_1.nunique()

6

In [73]:
# Number of sampled pickups per DBSCAN label, largest group first.
dataset_sample.db_cluster_1.value_counts()

db_cluster_1
 0    7381
-1    1458
 2     377
 4     333
 1     229
 3     222
Name: count, dtype: int64

In [74]:
# Map of the DBSCAN (Lat, Lon) clusters, leaving out the points labelled -1
# (outliers).
# Cluster ids are nominal labels, not quantities: they are cast to strings
# on a temporary copy so Plotly draws one discrete colour per cluster
# instead of a continuous colour bar. dataset_sample itself is not modified.
points_db_1 = dataset_sample[dataset_sample.db_cluster_1 != -1]
points_db_1 = points_db_1.assign(db_cluster_1=points_db_1["db_cluster_1"].astype(str))

fig = px.scatter_mapbox(
    points_db_1,
    lat="Lat",
    lon="Lon",
    color="db_cluster_1",
    # Keep the legend in numeric cluster order rather than order of appearance.
    category_orders={
        "db_cluster_1": sorted(points_db_1["db_cluster_1"].unique(), key=int)
    },
    mapbox_style="carto-positron",
    title='Hotspots pour les retraits Uber',
)

fig.show()

DBSCAN repère Manhattan comme cluster n° 0 : c'est là où il y a le plus de monde. Il repère 4 autres clusters, moins fréquentés (n° 1 à l'aéroport John F. Kennedy, n° 2 et n° 4 au nord-ouest de Brooklyn, n° 3 à l'aéroport LaGuardia).

In [75]:
# On essaye DBSCAN sur 3 features : Lat, Lon et Day_of_Week :
# Standardise the location and day-of-week features for the second DBSCAN run.
sc = StandardScaler()
X_4 = sc.fit_transform(dataset_sample[['Lat', 'Lon', 'Day_of_week']])

In [76]:
# DBSCAN on the standardised (Lat, Lon, Day_of_week) features, using the
# Manhattan distance. eps and min_samples are expressed in the scaled
# feature space.
db = DBSCAN(
    metric="manhattan",
    eps=0.9,
    min_samples=100,
    algorithm='auto',
)
db.fit(X_4)

In [77]:
# Distinct labels assigned by DBSCAN (the -1 label is filtered out as
# outliers when the clusters are mapped).
labels_uniques_2 = np.unique(db.labels_)
labels_uniques_2

array([-1,  0,  1], dtype=int64)

In [78]:
# Store the DBSCAN (Lat, Lon, Day_of_week) label of every sampled pickup
# in a new 'db_cluster_2' column.
labels_db_2 = db.labels_
dataset_sample["db_cluster_2"] = labels_db_2
dataset_sample.head()

Unnamed: 0,Date/Time,Lat,Lon,Base,Year,Month,Day,Day_of_week,Day_of_year,Day_of_month,Week_of_year,hour,Cluster_KMeans_1,Cluster_KMeans_2,db_cluster_1,db_cluster_2
2478428,2014-07-02 08:07:00,40.7592,-73.9946,B02682,2014,7,2,2,183,2,27,8,0,0,0,0
142063,2014-04-21 12:05:00,40.7759,-73.9614,B02598,2014,4,21,0,111,21,17,12,0,0,0,0
3884120,2014-09-08 22:56:00,40.7104,-74.0115,B02617,2014,9,8,0,251,8,37,22,3,0,0,0
2302346,2014-07-17 15:40:00,40.7579,-73.9895,B02617,2014,7,17,3,198,17,29,15,0,1,0,0
3899476,2014-09-10 07:40:00,40.778,-73.9524,B02617,2014,9,10,2,253,10,37,7,0,0,0,0


In [79]:
# Number of distinct DBSCAN labels in the sample (the -1 label counts as one).
dataset_sample.db_cluster_2.nunique()

3

In [80]:
# Number of sampled pickups per DBSCAN label, largest group first.
dataset_sample.db_cluster_2.value_counts()

db_cluster_2
 0    9293
-1     589
 1     118
Name: count, dtype: int64

In [81]:
# Animated map of the DBSCAN (Lat, Lon, Day_of_week) clusters, leaving out
# the points labelled -1 (outliers); one animation frame per 'Day_of_week'.
sans_outliers_2 = dataset_sample[dataset_sample.db_cluster_2 != -1]

fig = px.scatter_mapbox(
    sans_outliers_2,
    lon="Lon",
    lat="Lat",
    color="db_cluster_2",
    animation_frame='Day_of_week',
    title="Hotspots pour les retraits Uber selon le jour de la semaine",
    mapbox_style="carto-positron",
)

fig.show()

DBSCAN avec une 3ème feature 'Day_of_week' repère plus ou moins les mêmes zones que le DBSCAN précédent. Manhattan est la zone la plus fréquentée, suivie de Brooklyn.
La seule différence est qu'il y a du monde au niveau de l'aéroport durant les 3 jours de weekend, ce qui est justifiable (les gens voyagent plus les weekends).

On conclut alors que, peu importe le jour de la semaine, il faudra mettre plus de chauffeurs Uber au niveau de Manhattan et de Brooklyn, ainsi qu'au niveau des aéroports durant les weekends.