## Part 0: Import libraries and load the dataset

In [10]:
import pandas as pd
import plotly.express as px
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans
import plotly.express as px
from sklearn.metrics import silhouette_score

In [None]:
# Load the raw pickup records. The path is relative, so the file has to be
# reachable from the kernel's working directory.
raw_csv_path = "uber-raw-data-sep14.csv"
df = pd.read_csv(raw_csv_path)

Because the dataset is quite large, we will work with a random sample of it.

In [None]:
# Draw a fixed-seed random subset of 20,000 rows so the sample is the same on
# every run, then preview its first rows.
df_sample = df.sample(n=20000, random_state=0)
df_sample.head(5)

In [None]:
# Share of missing values per column, expressed as a percentage of the sample.
missing_percentage = 100 * df_sample.isnull().sum() / len(df_sample)

# Each overview section is a heading followed by the table it introduces.
overview_sections = [
    ("Display of dataset:\n", df_sample.head()),
    ('Basics statistics:\n', df_sample.describe(include='all')),
    ('Percentage of missing values:\n', missing_percentage),
]
for heading, table in overview_sections:
    print(heading)
    display(table)

In [None]:
# Derive calendar features (day of week, day of month, hour) from the pickup
# timestamp and remove the raw "Date/Time" column from the sample.
#
# The timestamps are read from the raw frame `df`, aligned on the sample's
# index, instead of from `df_sample` itself. This keeps the cell idempotent:
# reading `df_sample['Date/Time']` fails with a KeyError when the cell is run
# a second time, because the column has already been dropped.
pickup_time = pd.to_datetime(df.loc[df_sample.index, "Date/Time"])

df_sample = (
    df_sample
    .assign(
        dayofweek=pickup_time.dt.dayofweek,  # pandas convention: Monday=0 ... Sunday=6
        day=pickup_time.dt.day,
        hour=pickup_time.dt.hour,
    )
    # errors="ignore": on a re-run the column is already gone
    .drop(columns="Date/Time", errors="ignore")
)

# Show a preview only rather than rendering the whole sample.
df_sample.head()

In [None]:
# Plot every sampled pickup on a map, shaded by its day of week.
fig = px.scatter_mapbox(
    data_frame=df_sample,
    lat="Lat",
    lon="Lon",
    color="dayofweek",
    color_continuous_scale="reds",
    mapbox_style="carto-positron",
)
fig.show()

In [8]:
# Preview the first rows of the sample before modelling.
df_sample.head(n=5)

Unnamed: 0,Lat,Lon,Base,dayofweek,day,hour
310176,40.6165,-73.951,B02617,3,4,8
243214,40.7303,-73.9895,B02598,4,26,18
486503,40.8194,-73.9445,B02617,2,17,10
695475,40.7131,-73.9595,B02682,6,7,13
885696,40.7623,-73.9785,B02764,1,9,15


Let's train a K-Means model.

In [12]:
# Columns handed to the clustering model. Only these are listed in the
# ColumnTransformer below, so "Base" is not part of the feature matrix.
numeric_features = ["Lat", "Lon", "dayofweek", "day", "hour"]

# Standardise each selected column with StandardScaler.
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

print("Preprocessing sur le train set...")
print(df_sample.head())

# Fit the scaler on the sample and build the feature matrix used by K-Means.
X = preprocessor.fit_transform(df_sample)
X[:5]

Preprocessing sur le train set...
            Lat      Lon    Base  dayofweek  day  hour
310176  40.6165 -73.9510  B02617          3    4     8
243214  40.7303 -73.9895  B02598          4   26    18
486503  40.8194 -73.9445  B02617          2   17    10
695475  40.7131 -73.9595  B02682          6    7    13
885696  40.7623 -73.9785  B02764          1    9    15


array([[-3.05398228,  0.38627992,  0.03377581, -1.36567662, -1.00708253],
       [-0.23364476, -0.30924941,  0.55180978,  1.23358678,  0.6614467 ],
       [ 1.97454568,  0.50370695, -0.48425815,  0.17025175, -0.67337668],
       [-0.65991722,  0.23272149,  1.58787772, -1.01123161, -0.17281792],
       [ 0.55942027, -0.11052675, -1.00229212, -0.77493494,  0.16088793]])

In [13]:
# Elbow method: fit K-Means for k = 2..9 and record each model's inertia
# (within-cluster sum of squares).
k = list(range(2, 10))
wcss = []
for i in k:
    kmeans = KMeans(n_clusters=i, n_init="auto", random_state=0)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
print(wcss)

[85382.03398957744, 75051.13074381382, 68061.11575909096, 58382.60299408916, 53148.20289985447, 49949.501195910845, 45230.98010543822, 42755.86169645453]


In [14]:
# Wrap the sweep results in pandas objects so Plotly Express can plot them.
wcss_frame = pd.DataFrame(wcss)
k_frame = pd.Series(k)

# Elbow plot: inertia (y) against the number of clusters tried (x).
inertia_values = wcss_frame.iloc[:, -1]
fig = px.line(wcss_frame, x=k_frame, y=inertia_values)

# Title and axis labels
fig.update_layout(
    title="Inertia per cluster",
    xaxis_title="# Clusters",
    yaxis_title="Inertia",
)

fig.show()

In [15]:
# Silhouette sweep: fit K-Means for k = 2..9 and score each partition of X.
k = list(range(2, 10))
sil = []
for i in k:
    kmeans = KMeans(n_clusters=i, n_init="auto", random_state=0)
    kmeans.fit(X)
    assigned_clusters = kmeans.predict(X)
    sil.append(silhouette_score(X, assigned_clusters))
print(sil)

[0.17912098210384234, 0.1710322064482769, 0.1709816247436951, 0.1863344999919541, 0.18923944177657778, 0.19080913370529098, 0.1998239679068977, 0.1971932582002332]


In [16]:
# Wrap the silhouette scores for plotting; `k` holds the matching cluster counts.
cluster_scores = pd.DataFrame(sil)
k_frame = pd.Series(k)

# Bar chart: one bar per number of clusters tried, height = silhouette score.
silhouette_values = cluster_scores.iloc[:, -1]
fig = px.bar(cluster_scores, x=k, y=silhouette_values)

# Title and axis labels
fig.update_layout(
    title="Silhouette Score per cluster",
    xaxis_title="# Clusters",
    yaxis_title="Silhouette Score",
)

fig.show()

In [17]:
# Final model with 8 clusters (the highest silhouette score in the k = 2..9
# sweep); store each pickup's cluster id alongside its features.
kmeans = KMeans(n_clusters=8, n_init="auto", random_state=0)
kmeans.fit(X)
df_sample["Cluster_KMeans"] = kmeans.labels_
df_sample.head()

Unnamed: 0,Lat,Lon,Base,dayofweek,day,hour,Cluster_KMeans
310176,40.6165,-73.951,B02617,3,4,8,0
243214,40.7303,-73.9895,B02598,4,26,18,6
486503,40.8194,-73.9445,B02617,2,17,10,4
695475,40.7131,-73.9595,B02682,6,7,13,3
885696,40.7623,-73.9785,B02764,1,9,15,2


In [19]:
# Map the sampled pickups coloured by their K-Means cluster.
#
# Cluster ids are categories, not quantities. Plotly Express maps a numeric
# `color` column to a continuous colour scale (with a colour bar), which
# suggests an ordering between clusters that does not exist. Casting the ids
# to strings gives one discrete colour and one legend entry per cluster.
# The cast is applied to a plotting copy, so `df_sample` keeps integer labels.
cluster_ids = df_sample["Cluster_KMeans"]
cluster_order = [str(cluster_id) for cluster_id in sorted(cluster_ids.unique())]

fig = px.scatter_mapbox(
    df_sample.assign(Cluster_KMeans=cluster_ids.astype(str)),
    lat="Lat",
    lon="Lon",
    color="Cluster_KMeans",
    # List legend entries in numeric cluster order instead of order of appearance.
    category_orders={"Cluster_KMeans": cluster_order},
    mapbox_style="carto-positron",
)

fig.show()