## DATA ANALYSIS AND CLEANING

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

We are going to build the full dataset from the Uber data, using only the `*14.csv` files. The other files do not contain GPS coordinates.

In [2]:
import os

# Directory holding the raw Uber CSV exports.
filepath = './Sources'

# Read every 2014 monthly pickup file (the only ones carrying GPS coordinates).
# Each file goes into its own frame and all frames are concatenated ONCE at the end:
# calling pd.concat inside the loop re-copies the rows accumulated so far on every
# iteration. The listing is sorted because os.listdir returns entries in arbitrary order.
monthly_frames = []
for filename in sorted(os.listdir(filepath)):
    if 'uber-raw-data' in filename and '14.csv' in filename:
        print(f'Loading {filename} in a dataset')
        monthly_frames.append(pd.read_csv(os.path.join(filepath, filename)))

# Row labels still restart at 0 for each file, exactly as before.
# pd.concat raises on an empty list, so fall back to an empty frame when nothing matched.
df = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

Loading uber-raw-data-apr14.csv in a dataset
Loading uber-raw-data-aug14.csv in a dataset
Loading uber-raw-data-jul14.csv in a dataset
Loading uber-raw-data-jun14.csv in a dataset
Loading uber-raw-data-may14.csv in a dataset
Loading uber-raw-data-sep14.csv in a dataset


In [3]:
# Preview the first rows of the concatenated frame.
df.head()

Unnamed: 0,Date/Time,Lat,Lon,Base
0,4/1/2014 0:11:00,40.769,-73.9549,B02512
1,4/1/2014 0:17:00,40.7267,-74.0345,B02512
2,4/1/2014 0:21:00,40.7316,-73.9873,B02512
3,4/1/2014 0:28:00,40.7588,-73.9776,B02512
4,4/1/2014 0:33:00,40.7594,-73.9722,B02512


We are going to remove the Base column because it is not useful here: we cannot find any mapping for this ID in the other files, and it does not add any useful information.

In [4]:
# Leave out the 'Base' identifier and keep working with the remaining columns.
df = df.drop(columns='Base')
df.head()

Unnamed: 0,Date/Time,Lat,Lon
0,4/1/2014 0:11:00,40.769,-73.9549
1,4/1/2014 0:17:00,40.7267,-74.0345
2,4/1/2014 0:21:00,40.7316,-73.9873
3,4/1/2014 0:28:00,40.7588,-73.9776
4,4/1/2014 0:33:00,40.7594,-73.9722


In [5]:
# (number of rows, number of columns) of the combined data set.
df.shape

(4534327, 3)

In [6]:
# Column types: 'Date/Time' is still stored as plain strings (object) at this point.
df.dtypes

Date/Time     object
Lat          float64
Lon          float64
dtype: object

In [7]:
# Summary statistics for every column, including the non-numeric 'Date/Time' column.
df.describe(include='all')

Unnamed: 0,Date/Time,Lat,Lon
count,4534327,4534327.0,4534327.0
unique,260093,,
top,4/7/2014 20:21:00,,
freq,97,,
mean,,40.73926,-73.97302
std,,0.03994991,0.0572667
min,,39.6569,-74.929
25%,,40.7211,-73.9965
50%,,40.7422,-73.9834
75%,,40.761,-73.9653


There is no missing data. 
Let's change Date/Time column into a real DateTime column.

In [8]:
# Parse the raw strings (e.g. '4/1/2014 0:11:00') into real timestamps.
# The format is given explicitly (month/day/year hour:minute:second) so that the
# month/day order is never guessed and no per-value format inference is needed.
df['datetime'] = pd.to_datetime(df['Date/Time'], format='%m/%d/%Y %H:%M:%S')
df.dtypes

Date/Time            object
Lat                 float64
Lon                 float64
datetime     datetime64[ns]
dtype: object

In [9]:
# True if at least one value of the new 'datetime' column is null.
df['datetime'].isnull().values.any()

False

There are no null values in the new datetime column, so we can remove the original one.

In [10]:
# The string version of the timestamp is now redundant with 'datetime'.
df = df.drop(columns='Date/Time')

In [11]:
# Preview the frame after the 'Date/Time' column has been replaced by 'datetime'.
df.head()

Unnamed: 0,Lat,Lon,datetime
0,40.769,-73.9549,2014-04-01 00:11:00
1,40.7267,-74.0345,2014-04-01 00:17:00
2,40.7316,-73.9873,2014-04-01 00:21:00
3,40.7588,-73.9776,2014-04-01 00:28:00
4,40.7594,-73.9722,2014-04-01 00:33:00


Let's extract the day of week and the hour from it, since we want to analyse the data by day of week and by hour.

In [12]:
# Derive the weekday (0 = Monday ... 6 = Sunday) and the hour of day from each timestamp.
df = df.assign(
    day=df['datetime'].dt.dayofweek,
    hour=df['datetime'].dt.hour,
)

# X keeps every column except the timestamp itself; the full frame is then released.
X = df.drop(columns=['datetime'])
del df

In [13]:
# Preview the feature frame: coordinates plus the derived 'day' and 'hour' columns.
X.head()

Unnamed: 0,Lat,Lon,day,hour
0,40.769,-73.9549,1,0
1,40.7267,-74.0345,1,0
2,40.7316,-73.9873,1,0
3,40.7588,-73.9776,1,0
4,40.7594,-73.9722,1,0


## Clustering with KMeans

We run the Elbow method to help choose the number of clusters.

In [14]:
# Columns used as clustering features: clusters are computed on the pickup
# coordinates only, not on 'day' or 'hour'.
fields= ['Lat', 'Lon']

In [15]:
# Elbow method: fit K-Means for K = 1..10 on the coordinates and record each inertia.
from sklearn.cluster import KMeans

elbow_features = X[fields]
wcss = []
for i in range(1, 11):
    kmeans = KMeans(
        n_clusters=i,
        init="k-means++",
        random_state=0,
        n_init='auto',
    ).fit(elbow_features)
    wcss.append(kmeans.inertia_)
    print(f"Kmeans fit for {i} cluster(s)")

Kmeans fit for 1 cluster(s)
Kmeans fit for 2 cluster(s)
Kmeans fit for 3 cluster(s)
Kmeans fit for 4 cluster(s)
Kmeans fit for 5 cluster(s)
Kmeans fit for 6 cluster(s)
Kmeans fit for 7 cluster(s)
Kmeans fit for 8 cluster(s)
Kmeans fit for 9 cluster(s)
Kmeans fit for 10 cluster(s)


In [16]:
# Tabulate the inertias against the number of clusters they were computed for.
# The elbow loop fits K = 1, 2, ..., len(wcss), so the index has to start at 1:
# with the default 0-based index every point would be drawn one cluster too far
# to the left on the "# clusters" axis.
wcss_frame = pd.DataFrame(wcss, index=range(1, len(wcss) + 1))

# Using Plotly to visualize elbow
import plotly.express as px

# Line plot of inertia (single column of the frame) versus K (the index)
fig = px.line(wcss_frame, x=wcss_frame.index, y=wcss_frame.iloc[:, -1])

# Title, axis labels and figure size
fig.update_layout(
    title="Inertia per clusters",
    xaxis_title="# clusters",
    yaxis_title="Inertia",
    width=600, height=500
)

# Render
fig.show(renderer="iframe_connected")

Ideal cluster number seems to be between 4 and 6.

Then we use the silhouette to fine tune cluster numbers. We need to calculate it on a sample of data (1%) as it takes a long time to compute silhouette score for all data.

In [17]:
from sklearn.metrics import silhouette_score

# Work on a reproducible 1% sample of the rows for the silhouette computation.
X_sample = X.sample(frac=0.01, random_state=0)
print(f"Sample contains {X_sample.shape[0]} rows.")

sample_points = X_sample[fields]

# Mean silhouette score for each candidate K.
# Silhouette needs at least 2 labels; this scan covers K = 3..7.
sil = []
for i in range(3, 8):
    kmeans = KMeans(n_clusters=i, init="k-means++", random_state=0, n_init='auto')
    kmeans.fit(sample_points)
    print(f"Kmeans fit for {i} clusters.")
    sample_labels = kmeans.predict(sample_points)
    sil.append(silhouette_score(sample_points, sample_labels))
    print("Silhouette score for K={} is {}".format(i, sil[-1]))

Sample contains 45343 rows.
Kmeans fit for 3 clusters.
Silhouette score for K=3 is 0.37547728029806743
Kmeans fit for 4 clusters.
Silhouette score for K=4 is 0.38451281944818094
Kmeans fit for 5 clusters.
Silhouette score for K=5 is 0.38790899527953243
Kmeans fit for 6 clusters.
Silhouette score for K=6 is 0.43087945270491873
Kmeans fit for 7 clusters.
Silhouette score for K=7 is 0.4505846744019787


In [18]:
# Index the silhouette scores by the K values that were scanned (3 to 7).
cluster_scores = pd.DataFrame(sil, index=range(3, 8))

# Bar chart: one bar per K, height = silhouette score (single column of the frame).
k_values = cluster_scores.index
score_column = cluster_scores.iloc[:, -1]
fig = px.bar(data_frame=cluster_scores, x=k_values, y=score_column)

# Title, axis labels and figure size.
layout_options = dict(
    title="Silhouette Score per cluster",
    xaxis_title="# Clusters",
    yaxis_title="Silhouette Score",
    width=600,
    height=500,
)
fig.update_layout(**layout_options)

# Render
fig.show(renderer="iframe_connected")

We will choose 6 clusters to continue.

In [19]:
# Number of clusters retained after the elbow and silhouette analysis.
n_clusters = 6

## DataFrame splitting by days and hours

We are going to split the data into 7 days × 24 hours: one DataFrame per hour of each day of the week, so that the 6 clusters can be computed separately for each hour of each day.

In [26]:
# df_day_hour[day][hour] is an independent copy of the rows of X for that weekday and hour.
df_day_hour = [
    [X[(X.day == day) & (X.hour == hour)].copy() for hour in range(24)]
    for day in range(7)
]

# Number of rows per weekday, summed over its 24 hourly frames.
day_rows_counts = [
    sum(hourly_frame.shape[0] for hourly_frame in frames_of_day)
    for frames_of_day in df_day_hour
]

In [27]:
# Bar chart of the row count of each weekday (x = day index 0..6),
# with axis labels and figure size set in the same expression.
fig = px.bar(x=range(7), y=day_rows_counts).update_layout(
    xaxis_title="DataFrame days",
    yaxis_title="Row count",
    width=600,
    height=500,
)

# Render
fig.show(renderer="iframe_connected")

Then we compute the KMeans clusters for each hour of each day, independently.

In [28]:
# Fit a separate K-Means model on the coordinates of every (day, hour) slice and
# store each row's cluster label in a new 'kmean_cluster' column of that slice.
# The cluster count comes from `n_clusters` rather than a second hard-coded literal,
# so the value chosen earlier in the notebook is the one actually used here.
# Each slice gets its own model, so a given label number in one hour is unrelated
# to the same label number in another hour.
for day in range(7):
    for hour in range(24):
        kmeans = KMeans(n_clusters=n_clusters, init="k-means++", random_state=0, n_init='auto')
        kmeans.fit(df_day_hour[day][hour][fields])
        df_day_hour[day][hour]["kmean_cluster"] = kmeans.labels_
    print(f"Day {day} fit")

Day 0 fit
Day 1 fit
Day 2 fit
Day 3 fit
Day 4 fit
Day 5 fit
Day 6 fit


In [29]:
# Inspect the last slice (day 6 = Sunday, hour 23) with explicit indices instead of
# relying on the `day` / `hour` variables left over from the previous cell's loop.
df_day_hour[6][23]

Unnamed: 0,Lat,Lon,day,hour,kmean_cluster
7774,40.6880,-74.1810,6,23,3
7775,40.7145,-73.9906,6,23,0
7776,40.7415,-73.9875,6,23,0
7777,40.6449,-73.7820,6,23,1
7778,40.6950,-74.1782,6,23,3
...,...,...,...,...,...
1014138,40.6447,-73.7821,6,23,1
1014139,40.7513,-73.9941,6,23,0
1014140,40.6875,-74.1824,6,23,3
1014141,40.6482,-73.7823,6,23,1


Then we concatenate the 24 hourly DataFrames of each day into a single DataFrame per day, so that we can plot each day and animate it by hour.

In [30]:
# One frame per weekday: stack that day's hourly frames in hour order.
df_day = [pd.concat(hourly_frames) for hourly_frames in df_day_hour]

In [32]:
# Weekday names keyed by day index (0 = Monday ... 6 = Sunday).
day_names = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

# One animated map per weekday: points coloured by cluster label, one frame per hour.
for day, day_frame in enumerate(df_day):
    fig = px.scatter_mapbox(
        day_frame,
        lat="Lat",
        lon="Lon",
        color="kmean_cluster",
        animation_frame="hour",
        mapbox_style="carto-positron",
    )

    # Title and figure size
    fig.update_layout(title=f"Clusters for {day_names[day]}", width=600, height=500)

    # Render
    fig.show(renderer="iframe_connected")