## Clustering

In [11]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D

# also import these "new" libraries 
# Note: you may have to download and add them to your environment (using e.g. 'conda install -c conda-forge folium')
# !important! Install this version of plotly=5.10.0 or else some maps and animations may not render correctly
import plotly.express as px
import folium
from folium import plugins
from folium.plugins import HeatMap
from branca.element import Figure
import haversine as hs

# import the necessary libraries for the machine learning models
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

In [18]:
# Load the prepared taxi data from its parquet file into the dataframe used for clustering
taxi_df_clustering = pd.read_parquet('data/prepared/taxi_data_prepared.gzip')

#### General Data Prep
Before we can start with the clustering, we add an additional feature to our clustering dataframe.

In [19]:
# Display the loaded dataframe (the notebook renders a truncated preview of its first and last rows)
taxi_df_clustering

Unnamed: 0,trip_start_timestamp,trip_end_timestamp,trip_seconds,trip_miles,pickup_census_tract,dropoff_census_tract,trip_total,pickup_centroid_location,dropoff_centroid_location
0,2016-01-01 00:00:00,2016-01-01 00:15:00,900,2.2,17031281900,17031081402,11.65,POINT (-87.642648998 41.8792550844),POINT (-87.6129454143 41.8919715078)
1,2016-01-01 00:00:00,2016-01-01 00:00:00,480,0.0,17031081700,17031839100,10.05,POINT (-87.6318639497 41.8920421365),POINT (-87.6327464887 41.8809944707)
2,2016-01-01 00:00:00,2016-01-01 00:15:00,900,3.0,17031841000,17031320100,12.05,POINT (-87.6241352979 41.84924675450001),POINT (-87.6209929134 41.8849871918)
3,2016-01-01 00:00:00,2016-01-01 00:15:00,600,1.8,17031320100,17031833100,8.45,POINT (-87.6209929134 41.8849871918),POINT (-87.657005027 41.8790669938)
4,2016-01-01 00:00:00,2016-01-01 00:00:00,600,0.0,17031838300,17031320100,8.25,POINT (-87.6384040116 41.9015669095),POINT (-87.6209929134 41.8849871918)
...,...,...,...,...,...,...,...,...,...
20356204,2016-12-31 23:45:00,2017-01-01 00:00:00,780,0.0,17031839100,17031080202,9.50,POINT (-87.6327464887 41.8809944707),POINT (-87.6308650266 41.9058577688)
20356205,2016-12-31 23:45:00,2017-01-01 00:00:00,780,2.4,17031841000,17031320100,11.75,POINT (-87.6241352979 41.84924675450001),POINT (-87.6209929134 41.8849871918)
20356206,2016-12-31 23:45:00,2017-01-01 00:00:00,1020,4.2,17031081500,17031070200,18.30,POINT (-87.6262149064 41.8925077809),POINT (-87.6462934762 41.9290776551)
20356207,2016-12-31 23:45:00,2017-01-01 00:00:00,720,0.7,17031839100,17031320400,12.50,POINT (-87.6327464887 41.8809944707),POINT (-87.6219716519 41.8774061234)


#### Functions to use for Clustering
Next we create a number of functions to help with frequent Clustering Steps:

In [14]:
# This function standardizes every column of a dataframe and returns the result as a new dataframe
def scalingData(dataframe):
    """Return a standardized copy of ``dataframe``.

    Every column is transformed with ``StandardScaler`` (fit on the given data).
    The input dataframe is not modified.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Numeric feature columns to standardize.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same index and column labels as ``dataframe``.
    """
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(dataframe)
    # Re-attach the original index and column labels explicitly. Wrapping the
    # scaled array in a bare DataFrame would give it a fresh 0..n-1 index, and
    # assigning that back aligns on index labels, turning every row of a
    # filtered/sampled frame (non-default index) into NaN.
    return pd.DataFrame(scaled_values, index=dataframe.index, columns=dataframe.columns)

In [15]:
# This function calculates the KMeans loss for 1..clusterAmount clusters and plots loss against cluster count
def calcAndPlotLoss(clusterAmount, dataframe, random_state=None):
    """Fit KMeans for each cluster count from 1 to ``clusterAmount`` and plot the loss.

    The loss of a fit is the model's ``inertia_`` attribute. The resulting curve is
    shown with ``plt.show()``.

    Parameters
    ----------
    clusterAmount : int
        Largest number of clusters to evaluate (counts 1..clusterAmount are fitted).
    dataframe : pandas.DataFrame or array-like
        Feature data passed unchanged to ``KMeans.fit``.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous unseeded
        behaviour; pass an integer to make the curve reproducible across runs.

    Returns
    -------
    None
    """
    clusters = list(range(1, clusterAmount + 1))
    losses = []

    for k in clusters:
        model = KMeans(n_clusters=k, n_init=10, random_state=random_state)
        model.fit(dataframe)
        losses.append(model.inertia_)

    _, ax = plt.subplots(figsize=(12, 7))
    ax.plot(clusters, losses)
    ax.set_ylabel("Loss")
    ax.set_xlabel("Number of clusters")
    # Keep the usual 0-10 window, but widen it when more than 10 cluster counts
    # were evaluated so that no computed point is cut off the plot.
    ax.set_xlim([0, max(10, clusterAmount)])
    ax.grid(True)
    plt.show()

In [16]:
# This function runs KMeans with the defined amount of clusters and stores the labels in dataframe['Cluster']
def calcKMeans(numClusters, dataframe, random_state=None):
    """Cluster ``dataframe`` with KMeans and write the labels into its ``'Cluster'`` column.

    The dataframe is modified in place: a ``'Cluster'`` column is added or overwritten.

    Parameters
    ----------
    numClusters : int
        Number of clusters to fit.
    dataframe : pandas.DataFrame
        Feature columns to cluster. An already existing ``'Cluster'`` column is
        treated as an earlier result and is not used as a feature.
    random_state : int or None, optional
        Forwarded to ``KMeans``. The default ``None`` keeps the previous unseeded
        behaviour; pass an integer for reproducible labels.

    Returns
    -------
    None
    """
    # Leave out labels written by a previous call. Otherwise re-running this
    # function on the same frame (e.g. with another numClusters) would feed the
    # old cluster ids into the fit as if they were a real feature.
    if 'Cluster' in dataframe.columns:
        features = dataframe.drop(columns='Cluster')
    else:
        features = dataframe

    model = KMeans(n_clusters=numClusters, n_init=10, random_state=random_state)
    dataframe['Cluster'] = model.fit_predict(features)

In [17]:
# This function shows describe() statistics of the original dataframe, one table per KMeans cluster
def describeData(originalDataframe, scaledDataframe, numClusters):
    """Display summary statistics of the original data for each cluster.

    For every cluster id from 0 to ``numClusters - 1`` the rows of
    ``originalDataframe`` whose ``scaledDataframe['Cluster']`` value equals that id
    are selected, and their ``describe()`` table is passed to ``display``.

    Parameters
    ----------
    originalDataframe : pandas.DataFrame
        Data to summarize.
    scaledDataframe : pandas.DataFrame
        Frame holding the ``'Cluster'`` label column used to select the rows.
    numClusters : int
        Number of cluster ids to iterate over.
    """
    for cluster_id in range(numClusters):
        in_cluster = scaledDataframe['Cluster'].eq(cluster_id)
        summary = originalDataframe[in_cluster].describe()
        display(summary)

### Clustering Trip/Customer Types