## K-Means Station Clustering

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

from deepar_model_utils import prep_station_data
from deepar_model_utils import get_station_data
from deepar_model_utils import deepar_station_data
from deepar_model_utils import write_dicts_to_file

%matplotlib inline

Install h5py to use hdf5 features: http://docs.h5py.org/
  warn(h5py_msg)


In [None]:
# Name of the S3 bucket holding the trip extracts. It is only referenced by the
# commented-out "s3://" path variants in the load cells, so it can stay empty
# when reading the local CSV copies.
s3_bucket = ""

## Trip Start Station

In [2]:
# Alternative source: read the same extract from S3 (requires s3_bucket to be set).
#start_file = "s3://{}/model_trips_start_station_20208029_20220831.csv".format(s3_bucket)

# Local copy of the start-station trip extract, resolved relative to the working directory.
# NOTE(review): "20208029" in the file name looks like a transposed 20200829
# (min_date is 2020-08-29); the name has to match the file on disk, so it is left as is.
start_file = "../model_trips_start_station_20208029_20220831.csv"

# NOTE(review): without index_col, parse_dates = True only asks pandas to try
# parsing the index, so the timestamp column is presumably still text here and
# converted later (e.g. inside prep_station_data) — confirm.
trips_start = pd.read_csv(start_file, parse_dates = True)
trips_start.shape

(6059710, 2)

In [3]:
# Aggregate the raw start-station trips with the project helper. The result is
# indexed by column name and carries a "size" column that is summed here and
# read per station further down.
trips_start_all_group = prep_station_data(trips_start, "start station id", "starttime")

# Sanity check: the printed total is expected to equal the raw row count shown
# by trips_start.shape. Series.sum() is vectorised, whereas the builtin sum()
# walks the column one element at a time in Python.
print(trips_start_all_group["size"].sum())

6059710


In [4]:
# Shared settings passed to get_station_data for every station series.
freq = "D" # group and sum trips by a set increment (daily here)
min_date = "2020-08-29 00:00:00" # make sure all series start at the same time
max_date = "2022-08-31 23:45:00" # make sure all series end at the same time

**Exploratory Data Analysis (EDA)**

In [None]:
# Start-station id groups (most / semi / least popular), copied from the
# manual_station_clustering notebook. Only least_popular_start is plotted in the
# next cell; point that loop at one of the other lists to inspect those stations.
most_popular_start = [67, 68, 74, 60, 178, 46, 107, 179, 9, 53]
semi_popular_start = [157, 471, 40, 39, 437, 177, 190, 22, 386, 33]
least_popular_start = [546, 556, 543, 388, 548, 317, 571, 308, 570, 572]

In [None]:
# Ten stacked panels sharing the date axis, one per station in least_popular_start.
fig, axs = plt.subplots(nrows = 10, ncols = 1, figsize = (20, 20), sharex = True)
axx = axs.ravel()
for i, ax in enumerate(axx):
    temp_station = least_popular_start[i]
    # Series for this station, resampled at `freq` and bounded by min_date / max_date.
    station_series = get_station_data(
        trips_start_all_group,
        "start station id",
        "starttime",
        temp_station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_series.plot(ax = ax)
    ax.set_xlabel("date")
    ax.set_ylabel("trip count")
    ax.set_title(str(temp_station))
    ax.grid(which = "minor", axis = "x")

**Time Series K-Means Clustering**

In [5]:
# Build one list of "size" values per start station, in the order returned by
# .unique(); the cluster labels are later matched to stations by this order.
start_station_ids = trips_start_all_group["start station id"].unique()
start_list = []
for station in tqdm(start_station_ids):
    start_station_data = get_station_data(
        trips_start_all_group,
        "start station id",
        "starttime",
        station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_counts = start_station_data["size"].tolist()
    start_list.append(station_counts)

100%|██████████| 451/451 [00:13<00:00, 32.42it/s]


In [6]:
# Number of station series built, followed by the grand total over all of them.
# zip(*...) walks the series position by position and stops at the shortest one,
# so the total only matches the raw row count when every series has equal length.
print(len(start_list))
per_step_totals = (sum(step_counts) for step_counts in zip(*start_list))
print(sum(per_step_totals))

451
6059710


In [7]:
# Convert the per-station lists into the tslearn dataset array that
# TimeSeriesKMeans is fitted on; the displayed shape reads
# (number of series, time steps, 1).
formatted_start_list = to_time_series_dataset(start_list)
formatted_start_list.shape

(451, 733, 1)

In [None]:
%%time
start_km = TimeSeriesKMeans(n_clusters = 21, metric = "dtw", max_iter = 10)
start_km_labels = start_km.fit_predict(formatted_start_list)

In [None]:
# Cluster index predicted for each start-station series, in the same order as
# the series in formatted_start_list.
start_km_labels

In [None]:
# The same labels, displayed as a plain Python list.
list(start_km_labels)

In [None]:
# Station ids in the same .unique() order used to build the series, paired
# position by position with the predicted cluster labels.
start_series_names = list(trips_start_all_group["start station id"].unique())
start_km_labels_list = list(start_km_labels)

start_clusters = pd.DataFrame(
    zip(start_series_names, start_km_labels_list),
    columns = ["station id", "cluster"],
)
# Order rows by cluster so stations in the same cluster are listed together.
start_clusters = start_clusters.sort_values(by = "cluster")
start_clusters

In [None]:
# Write the station -> cluster mapping to the working directory, without the index column.
start_clusters.to_csv("start_clusters.csv", index = False)

## Trip End Station

In [None]:
# Alternative source: read the same extract from S3 (requires s3_bucket to be set).
#stop_file = "s3://{}/model_trips_stop_station_20208029_20220831.csv".format(s3_bucket)

# Local copy of the end-station trip extract, resolved relative to the working directory.
# NOTE(review): "20208029" in the file name looks like a transposed 20200829;
# the name has to match the file on disk, so it is left as is.
stop_file = "../model_trips_stop_station_20208029_20220831.csv"

# NOTE(review): without index_col, parse_dates = True only asks pandas to try
# parsing the index, so the timestamp column is presumably still text here and
# converted later (e.g. inside prep_station_data) — confirm.
trips_stop = pd.read_csv(stop_file, parse_dates = True)
trips_stop.shape

In [None]:
# Aggregate the raw end-station trips with the project helper. The result is
# indexed by column name and carries a "size" column that is summed here and
# read per station further down.
trips_stop_all_group = prep_station_data(trips_stop, "end station id", "stoptime")

# Sanity check: the printed total is expected to equal the raw row count shown
# by trips_stop.shape. Series.sum() is vectorised, whereas the builtin sum()
# walks the column one element at a time in Python.
print(trips_stop_all_group["size"].sum())

In [None]:
# Re-declares the same values used in the start-station section; they are passed
# to get_station_data for every end-station series.
freq = "D" # group and sum trips by a set increment (daily here)
min_date = "2020-08-29 00:00:00" # make sure all series start at the same time
max_date = "2022-08-31 23:45:00" # make sure all series end at the same time

**Exploratory Data Analysis (EDA)**

In [None]:
# End-station id groups (most / semi / least popular), copied from the
# manual_station_clustering notebook. Only least_popular_stop is plotted in the
# next cell; point that loop at one of the other lists to inspect those stations.
most_popular_stop = [67, 68, 74, 60, 107, 46, 178, 179, 53, 9]
semi_popular_stop = [189, 471, 39, 40, 437, 190, 370, 22, 33, 459]
least_popular_stop = [556, 543, 388, 548, 317, 308, 570, 572, 571, 438]

In [None]:
# Ten stacked panels sharing the date axis, one per station in least_popular_stop.
fig, axs = plt.subplots(nrows = 10, ncols = 1, figsize = (20, 20), sharex = True)
axx = axs.ravel()
for i, ax in enumerate(axx):
    temp_station = least_popular_stop[i]
    # Series for this station, resampled at `freq` and bounded by min_date / max_date.
    station_series = get_station_data(
        trips_stop_all_group,
        "end station id",
        "stoptime",
        temp_station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_series.plot(ax = ax)
    ax.set_xlabel("date")
    ax.set_ylabel("trip count")
    ax.set_title(str(temp_station))
    ax.grid(which = "minor", axis = "x")

**Time Series K-Means Clustering**

In [None]:
# Build one list of "size" values per end station, in the order returned by
# .unique(); the cluster labels are later matched to stations by this order.
stop_station_ids = trips_stop_all_group["end station id"].unique()
stop_list = []
for station in tqdm(stop_station_ids):
    stop_station_data = get_station_data(
        trips_stop_all_group,
        "end station id",
        "stoptime",
        station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_counts = stop_station_data["size"].tolist()
    stop_list.append(station_counts)

In [None]:
# Number of station series built, followed by the grand total over all of them.
# zip(*...) walks the series position by position and stops at the shortest one,
# so the total only matches the raw row count when every series has equal length.
print(len(stop_list))
per_step_totals = (sum(step_counts) for step_counts in zip(*stop_list))
print(sum(per_step_totals))

In [None]:
# Convert the per-station lists into the tslearn dataset array that
# TimeSeriesKMeans is fitted on. The shape is expected to read
# (number of series, time steps, 1), as for the start stations — confirm on run.
formatted_stop_list = to_time_series_dataset(stop_list)
formatted_stop_list.shape

In [None]:
%%time
stop_km = TimeSeriesKMeans(n_clusters = 21, metric = "dtw", max_iter = 10)
stop_km_labels = stop_km.fit_predict(formatted_stop_list)

In [None]:
# Cluster index predicted for each end-station series, in the same order as
# the series in formatted_stop_list.
stop_km_labels

In [None]:
# The same labels, displayed as a plain Python list.
list(stop_km_labels)

In [None]:
# Station ids in the same .unique() order used to build the series, paired
# position by position with the predicted cluster labels.
stop_series_names = list(trips_stop_all_group["end station id"].unique())
stop_km_labels_list = list(stop_km_labels)

stop_clusters = pd.DataFrame(
    zip(stop_series_names, stop_km_labels_list),
    columns = ["station id", "cluster"],
)
# Order rows by cluster so stations in the same cluster are listed together.
stop_clusters = stop_clusters.sort_values(by = "cluster")
stop_clusters

In [None]:
# Write the station -> cluster mapping to the working directory, without the index column.
stop_clusters.to_csv("stop_clusters.csv", index = False)