## K-Means Station Clustering

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

from tslearn.utils import to_time_series_dataset
from tslearn.clustering import TimeSeriesKMeans

from deepar_model_utils import prep_station_data
from deepar_model_utils import get_station_data
from deepar_model_utils import deepar_station_data
from deepar_model_utils import write_dicts_to_file

%matplotlib inline

## Trip Start Station

In [None]:
# Input CSV for the start-station analysis.
start_file = "../model_trips_start_station_20208029_20220831.csv"

# NOTE(review): no index_col is passed, so parse_dates=True does not name any
# column to convert; presumably prep_station_data handles the timestamp column
# itself -- confirm.
trips_start = pd.read_csv(
    start_file,
    parse_dates=True,
)
trips_start.shape

In [None]:
# Build the start-station table that every later cell in this section reads.
trips_start_all_group = prep_station_data(
    trips_start,
    "start station id",
    "starttime",
)
# Total of the "size" column, printed as a reference figure for later checks.
print(sum(trips_start_all_group["size"]))

**EDA**

In [None]:
# Settings shared by the EDA plots and the clustering input below.
# freq: increment by which trips are grouped and summed.
freq = "15min"
# min_date / max_date: force every series to start and end at the same time.
min_date = "2020-08-29 00:00:00"
max_date = "2022-08-31 23:45:00"

In [None]:
# Station id lists taken from the manual_station_clustering notebook.
most_popular_start = [
    67, 68, 74, 60, 178,
    46, 107, 179, 9, 53,
]
semi_popular_start = [
    157, 471, 40, 39, 437,
    177, 190, 22, 386, 33,
]
least_popular_start = [
    546, 556, 543, 388, 548,
    317, 571, 308, 570, 572,
]

In [None]:
# One panel per station in least_popular_start, all sharing the x axis.
fig, axs = plt.subplots(nrows = 10, ncols = 1, figsize = (20, 20), sharex = True)
axx = axs.ravel()
for panel, temp_station in zip(axx, least_popular_start[:10]):
    station_series = get_station_data(
        trips_start_all_group,
        "start station id",
        "starttime",
        temp_station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_series.plot(ax = panel)
    panel.set_xlabel("date")
    panel.set_ylabel("trip count")
    panel.set_title(str(temp_station))
    panel.grid(which = "minor", axis = "x")

**Time Series K-Means Clustering**

In [None]:
# One list of "size" values per start station, in the order returned by unique().
start_list = [
    get_station_data(
        trips_start_all_group,
        "start station id",
        "starttime",
        station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )["size"].tolist()
    for station in tqdm(trips_start_all_group["start station id"].unique())
]

In [None]:
# Number of station series, then the grand total summed interval by interval.
print(len(start_list))
print(sum(sum(interval_counts) for interval_counts in zip(*start_list)))

In [None]:
# Convert the per-station lists with tslearn's dataset helper and show the
# resulting shape.
formatted_start_list = to_time_series_dataset(
    start_list
)
formatted_start_list.shape

In [None]:
# K-Means settings. cluster_count is also reused by the cluster analysis cells;
# kmeans_metric is only referenced by the (commented-out) TimeSeriesKMeans fit.
cluster_count = 21
kmeans_metric = "euclidean"

In [None]:
#%%time
# NOTE: the K-Means fit is left commented out, so nothing in this cell executes
# and `start_km_labels` is not defined on a fresh kernel. Uncomment the lines in
# this cell to refit; the following cell reads `start_km_labels`.
#start_km = TimeSeriesKMeans(n_clusters = cluster_count, metric = kmeans_metric, max_iter = 10)
#start_km_labels = start_km.fit_predict(formatted_start_list)

In [None]:
# Build the station -> cluster table from freshly fitted K-Means labels.
#
# The fit cell above is commented out, so `start_km_labels` only exists when that
# cell has been re-enabled and run in this kernel. Guard on it so Restart & Run All
# does not stop here with a NameError; the saved assignments are loaded from CSV
# in the Cluster Analysis section.
if "start_km_labels" in globals():
    # Station ids in the same order the series were built (order of unique()).
    start_series_names = list(trips_start_all_group["start station id"].unique())
    start_km_labels_list = list(start_km_labels)

    start_clusters = pd.DataFrame(
        zip(start_series_names, start_km_labels_list),
        columns = ["station id", "cluster"],
    ).sort_values(by = "cluster")
    display(start_clusters)
else:
    print("start_km_labels is not defined (K-Means fit cell is commented out); skipping.")

In [None]:
# Left commented out; uncomment to write the cluster table to start_clusters.csv.
# NOTE(review): the analysis below reads start_clusters_euc21_15min.csv, a
# different file name -- presumably the saved file was renamed; confirm.
#start_clusters.to_csv("start_clusters.csv", index = False)

**Cluster Analysis**

In [None]:
# Load the saved cluster assignments; this rebinds start_clusters for every
# cell below.
start_clusters = pd.read_csv(
    "start_clusters_euc21_15min.csv"
)
start_clusters

Cluster 0 is the "catch-all" cluster when clustering at a 1-day frequency; cluster 3 is the "catch-all" cluster at the 15-minute frequency used here.

In [None]:
# Stations per cluster id (0 .. cluster_count - 1), drawn as horizontal bars.
start_cluster_c = [
    int((start_clusters["cluster"] == cluster_id).sum())
    for cluster_id in range(cluster_count)
]
start_cluster_n = [f"Cluster {cluster_id}" for cluster_id in range(cluster_count)]

plt.figure(figsize = (15, 5))
plt.barh(start_cluster_n, start_cluster_c)
plt.title("Trip Start Station Cluster Distribution for K-Means")
# Flip the y axis so the first cluster is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# One panel per cluster, overlaying the series of every station in that cluster.
fig, axs = plt.subplots(cluster_count, 1, figsize = (20, 50), sharex = True)
axx = axs.ravel()
for i in range(0, cluster_count):
    start_cluster = start_clusters[start_clusters["cluster"] == i]
    for temp_station in start_cluster["station id"]:
        # legend = False replaces creating a legend on every plot call and then
        # removing it again.
        get_station_data(
            trips_start_all_group,
            "start station id",
            "starttime",
            temp_station,
            freq,
            max_date,
            cluster = True,
            min_date = min_date,
        ).plot(ax = axx[i], color = "#808080", legend = False)
    # Format each panel exactly once, after plotting. grid() without an explicit
    # on/off flag toggles visibility, so calling it once per station left the
    # minor grid off for clusters with an even number of stations. Doing this
    # outside the station loop also labels panels of empty clusters.
    axx[i].set_xlabel("date")
    axx[i].set_ylabel("trip count")
    axx[i].set_title("Start Station Cluster " + str(i))
    axx[i].grid(True, which = "minor", axis = "x")

**Data for DeepAR**

In [None]:
# Settings for the DeepAR export.
# freq: increment by which trips are grouped and summed.
freq = "15min"
# max_date: make sure all series end at the same time.
max_date = "2022-08-31 23:45:00"
# Dates passed to deepar_station_data to separate the train and test sets.
train_date = "2022-08-28"
test_date = "2022-08-29"

In [None]:
# Build the DeepAR train/test records for start stations, passing the loaded
# cluster table as cluster_data.
train_euc21_start, test_euc21_start = deepar_station_data(
    trips_start_all_group,
    "start station id",
    "starttime",
    freq,
    max_date,
    train_date,
    test_date,
    cluster = True,
    cluster_data = start_clusters,
)

In [None]:
# Number of series in the train and test sets (retained all stations).
print(len(train_euc21_start), len(test_euc21_start), sep = "\n")

In [None]:
# make sure all test data is the same length
# The mean alone can equal 288 while individual series differ (e.g. 287 and 289),
# so also list the distinct target lengths that occur.
test_lengths = [len(series["target"]) for series in test_euc21_start]
print(sorted(set(test_lengths))) # expected: [288]
test_length = sum(test_lengths)
test_length / len(test_euc21_start) # should be 288

In [None]:
# Check the number of trips: add up every target value, train set first and
# then test set.
trips = sum(
    sum(series["target"])
    for split in (train_euc21_start, test_euc21_start)
    for series in split
)
trips # retained all trips

In [None]:
# Spot check that the correct cluster categories were added.
# Assumes entry 9 of each set corresponds to the 10th unique station id --
# TODO confirm the ordering produced by deepar_station_data.
t_station = trips_start_all_group["start station id"].unique()[:10]
print(
    start_clusters.loc[
        start_clusters["station id"] == t_station[9], "cluster"
    ].tolist()
)
print(train_euc21_start[9]["cat"])
print(test_euc21_start[9]["cat"])

In [None]:
# Write both start-station sets to disk in JSON Lines format.
write_dicts_to_file(
    "train_euc21_15min_start.json",
    train_euc21_start,
)
write_dicts_to_file(
    "test_euc21_15min_start.json",
    test_euc21_start,
)

## Trip End Station

In [None]:
# Input CSV for the end-station analysis.
stop_file = "../model_trips_stop_station_20208029_20220831.csv"

# NOTE(review): no index_col is passed, so parse_dates=True does not name any
# column to convert; presumably prep_station_data handles the timestamp column
# itself -- confirm.
trips_stop = pd.read_csv(
    stop_file,
    parse_dates=True,
)
trips_stop.shape

In [None]:
# Build the end-station table that every later cell in this section reads.
trips_stop_all_group = prep_station_data(
    trips_stop,
    "end station id",
    "stoptime",
)
# Total of the "size" column, printed as a reference figure for later checks.
print(sum(trips_stop_all_group["size"]))

**EDA**

In [None]:
# Settings shared by the end-station EDA plots and the clustering input below.
# freq: increment by which trips are grouped and summed.
# NOTE(review): the start-station section uses "15min" at this step while this
# section uses daily bins -- confirm the difference is intended.
freq = "D"
# min_date / max_date: force every series to start and end at the same time.
min_date = "2020-08-29 00:00:00"
max_date = "2022-08-31 23:45:00"

In [None]:
# Station id lists taken from the manual_station_clustering notebook.
most_popular_stop = [
    67, 68, 74, 60, 107,
    46, 178, 179, 53, 9,
]
semi_popular_stop = [
    189, 471, 39, 40, 437,
    190, 370, 22, 33, 459,
]
least_popular_stop = [
    556, 543, 388, 548, 317,
    308, 570, 572, 571, 438,
]

In [None]:
# One panel per station in least_popular_stop, all sharing the x axis.
fig, axs = plt.subplots(nrows = 10, ncols = 1, figsize = (20, 20), sharex = True)
axx = axs.ravel()
for panel, temp_station in zip(axx, least_popular_stop[:10]):
    station_series = get_station_data(
        trips_stop_all_group,
        "end station id",
        "stoptime",
        temp_station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )
    station_series.plot(ax = panel)
    panel.set_xlabel("date")
    panel.set_ylabel("trip count")
    panel.set_title(str(temp_station))
    panel.grid(which = "minor", axis = "x")

**Time Series K-Means Clustering**

In [None]:
# One list of "size" values per end station, in the order returned by unique().
stop_list = [
    get_station_data(
        trips_stop_all_group,
        "end station id",
        "stoptime",
        station,
        freq,
        max_date,
        cluster = True,
        min_date = min_date,
    )["size"].tolist()
    for station in tqdm(trips_stop_all_group["end station id"].unique())
]

In [None]:
# Number of station series, then the grand total summed interval by interval.
print(len(stop_list))
print(sum(sum(interval_counts) for interval_counts in zip(*stop_list)))

In [None]:
# Convert the per-station lists with tslearn's dataset helper and show the
# resulting shape.
formatted_stop_list = to_time_series_dataset(
    stop_list
)
formatted_stop_list.shape

In [None]:
# K-Means settings. cluster_count is also reused by the cluster analysis cells;
# kmeans_metric is only referenced by the (commented-out) TimeSeriesKMeans fit.
cluster_count = 21
kmeans_metric = "euclidean"

In [None]:
#%%time
# NOTE: the K-Means fit is left commented out, so nothing in this cell executes
# and `stop_km_labels` is not defined on a fresh kernel. Uncomment the lines in
# this cell to refit; the following cell reads `stop_km_labels`.
#stop_km = TimeSeriesKMeans(n_clusters = cluster_count, metric = kmeans_metric, max_iter = 10)
#stop_km_labels = stop_km.fit_predict(formatted_stop_list)

In [None]:
# Build the station -> cluster table from freshly fitted K-Means labels.
#
# The fit cell above is commented out, so `stop_km_labels` only exists when that
# cell has been re-enabled and run in this kernel. Guard on it so Restart & Run All
# does not stop here with a NameError; the saved assignments are loaded from CSV
# in the Cluster Analysis section.
if "stop_km_labels" in globals():
    # Station ids in the same order the series were built (order of unique()).
    stop_series_names = list(trips_stop_all_group["end station id"].unique())
    stop_km_labels_list = list(stop_km_labels)

    stop_clusters = pd.DataFrame(
        zip(stop_series_names, stop_km_labels_list),
        columns = ["station id", "cluster"],
    ).sort_values(by = "cluster")
    display(stop_clusters)
else:
    print("stop_km_labels is not defined (K-Means fit cell is commented out); skipping.")

In [None]:
# Left commented out; uncomment to write the cluster table to stop_clusters.csv.
# NOTE(review): the analysis below reads stop_clusters_euc21.csv, a different
# file name -- presumably the saved file was renamed; confirm.
#stop_clusters.to_csv("stop_clusters.csv", index = False)

**Cluster Analysis**

In [None]:
# Load the saved cluster assignments; this rebinds stop_clusters for every
# cell below.
stop_clusters = pd.read_csv(
    "stop_clusters_euc21.csv"
)
stop_clusters

Cluster 2 is the "catch-all" cluster.

In [None]:
# Stations per cluster id (0 .. cluster_count - 1), drawn as horizontal bars.
stop_cluster_c = [
    int((stop_clusters["cluster"] == cluster_id).sum())
    for cluster_id in range(cluster_count)
]
stop_cluster_n = [f"Cluster {cluster_id}" for cluster_id in range(cluster_count)]

plt.figure(figsize = (15, 5))
plt.barh(stop_cluster_n, stop_cluster_c)
plt.title("Trip End Station Cluster Distribution for K-Means")
# Flip the y axis so the first cluster is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# One panel per cluster, overlaying the series of every station in that cluster.
fig, axs = plt.subplots(cluster_count, 1, figsize = (20, 50), sharex = True)
axx = axs.ravel()
for i in range(0, cluster_count):
    stop_cluster = stop_clusters[stop_clusters["cluster"] == i]
    for temp_station in stop_cluster["station id"]:
        # legend = False replaces creating a legend on every plot call and then
        # removing it again.
        get_station_data(
            trips_stop_all_group,
            "end station id",
            "stoptime",
            temp_station,
            freq,
            max_date,
            cluster = True,
            min_date = min_date,
        ).plot(ax = axx[i], color = "#808080", legend = False)
    # Format each panel exactly once, after plotting. grid() without an explicit
    # on/off flag toggles visibility, so calling it once per station left the
    # minor grid off for clusters with an even number of stations. Doing this
    # outside the station loop also labels panels of empty clusters.
    axx[i].set_xlabel("date")
    axx[i].set_ylabel("trip count")
    axx[i].set_title("End Station Cluster " + str(i))
    axx[i].grid(True, which = "minor", axis = "x")

**Data for DeepAR**

In [None]:
# Settings for the DeepAR export. freq is switched from the daily value used for
# the plots above to 15-minute bins.
# freq: increment by which trips are grouped and summed.
freq = "15min"
# max_date: make sure all series end at the same time.
max_date = "2022-08-31 23:45:00"
# Dates passed to deepar_station_data to separate the train and test sets.
train_date = "2022-08-28"
test_date = "2022-08-29"

In [None]:
# Build the DeepAR train/test records for end stations, passing the loaded
# cluster table as cluster_data.
train_euc21_stop, test_euc21_stop = deepar_station_data(
    trips_stop_all_group,
    "end station id",
    "stoptime",
    freq,
    max_date,
    train_date,
    test_date,
    cluster = True,
    cluster_data = stop_clusters,
)

In [None]:
# Number of series in the train and test sets (retained all but 1 station).
print(len(train_euc21_stop), len(test_euc21_stop), sep = "\n")

In [None]:
# make sure all test data is the same length
# The mean alone can equal 288 while individual series differ (e.g. 287 and 289),
# so also list the distinct target lengths that occur.
test_lengths = [len(series["target"]) for series in test_euc21_stop]
print(sorted(set(test_lengths))) # expected: [288]
test_length = sum(test_lengths)
test_length / len(test_euc21_stop) # should be 288

In [None]:
# Check the number of trips: add up every target value, train set first and
# then test set.
trips = sum(
    sum(series["target"])
    for split in (train_euc21_stop, test_euc21_stop)
    for series in split
)
trips # lost 5 trips due to the 1 station loss

In [None]:
# Spot check that the correct cluster categories were added.
# Assumes entry 9 of each set corresponds to the 10th unique station id --
# TODO confirm the ordering produced by deepar_station_data.
t_station = trips_stop_all_group["end station id"].unique()[:10]
print(
    stop_clusters.loc[
        stop_clusters["station id"] == t_station[9], "cluster"
    ].tolist()
)
print(train_euc21_stop[9]["cat"])
print(test_euc21_stop[9]["cat"])

Station 572, with 5 trips, was dropped because the first trip that ended there occurred after the `test_date` of 2022-08-29.

In [None]:
# Write both end-station sets to disk in JSON Lines format.
write_dicts_to_file(
    "train_euc21_stop.json",
    train_euc21_stop,
)
write_dicts_to_file(
    "test_euc21_stop.json",
    test_euc21_stop,
)