In [1]:
import pandas as pd

In [2]:
# Load the pre-processed taxi trajectory table. Later cells read the columns
# TRIP_ID, TIMESTAMP, LAT, LON, POLYLINE_LENGTH, ORIGIN_CALL and TAXI_ID from it.
# NOTE(review): the relative path ties the notebook to its current working
# directory; consider a DATA_DIR constant in a configuration cell.
df = pd.read_parquet("../../data/data_taxi_central_based.parquet")

In [3]:
# First row of every trip, with TRIP_ID restored as a regular column.
# NOTE(review): `first_rows` is not referenced by any later cell visible here;
# if that holds for the whole notebook, this full-table groupby can be deleted.
first_rows = df.groupby('TRIP_ID').first().reset_index()

In [4]:
# Keep only rows whose POLYLINE_LENGTH lies in [30, 276).
# NOTE(review): presumably POLYLINE_LENGTH is the number of points in the trip, in
# which case the bounds guarantee more than 20 points (the forecast horizon) and a
# context of at most 255 points (the models use context_len=256) — confirm against
# the preprocessing step and lift 30 / 276 into named constants.
# NOTE(review): this rebinds `df`; the unfiltered frame is only recoverable by
# re-running the load cell. A new name (e.g. `df_filtered`) would be safer.
df = df[(df['POLYLINE_LENGTH'] >= 30) & (df['POLYLINE_LENGTH'] < 276)]

In [5]:
import timesfm

 See https://github.com/google-research/timesfm/blob/master/README.md for updated APIs.
Loaded PyTorch TimesFM, likely because python version is 3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)].


In [6]:
# Split the table into two single-target frames that are forecast independently:
# `df_lon` drops POLYLINE_LENGTH and LAT, `df_lat` drops POLYLINE_LENGTH and LON.
# Every other column is kept in both frames.
df_lon = df.drop(columns=['POLYLINE_LENGTH', 'LAT'])
df_lat = df.drop(columns=['POLYLINE_LENGTH', 'LON'])

In [7]:
# TimesFM instance used for the latitude series.
# NOTE(review): this instance is configured with backend="cpu" while `tfm_lon`
# (next cell) uses backend="gpu"; every other hyper-parameter and the checkpoint
# are identical. Confirm the split is intentional — a single shared instance
# would also avoid loading the same checkpoint twice.
tfm_lat = timesfm.TimesFm(
      hparams=timesfm.TimesFmHparams(
          backend="cpu",
          per_core_batch_size=32,
          # Must stay equal to the horizon_len used when splitting trips into
          # context/horizon (get_taxi_data_fn defaults to 20).
          horizon_len=20,
          num_layers=50,
          use_positional_embedding=False,
          context_len=256,
      ),
      checkpoint=timesfm.TimesFmCheckpoint(
          huggingface_repo_id="google/timesfm-2.0-500m-pytorch"),
  )

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# TimesFM instance used for the longitude series.
# NOTE(review): configured with backend="gpu" while `tfm_lat` (previous cell) uses
# backend="cpu"; every other hyper-parameter and the checkpoint are identical.
# Confirm the split is intentional — one shared instance would also avoid loading
# the same checkpoint twice.
tfm_lon = timesfm.TimesFm(
      hparams=timesfm.TimesFmHparams(
          backend="gpu",
          per_core_batch_size=32,
          # Must stay equal to the horizon_len used when splitting trips into
          # context/horizon (get_taxi_data_fn defaults to 20).
          horizon_len=20,
          num_layers=50,
          use_positional_embedding=False,
          context_len=256,
      ),
      checkpoint=timesfm.TimesFmCheckpoint(
          huggingface_repo_id="google/timesfm-2.0-500m-pytorch"),
  )

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Order rows by trip and then by time. The per-trip head/tail slicing done later
# (context/horizon split, "last point" extraction) relies on this ordering.
df_lat = df_lat.sort_values(by=['TRIP_ID', 'TIMESTAMP'])

In [10]:
# Same ordering as the latitude frame: by trip, then by time, so that positional
# slicing within a trip is chronological.
df_lon = df_lon.sort_values(by=['TRIP_ID', 'TIMESTAMP'])

In [11]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
from sklearn.preprocessing import MinMaxScaler
# Rescale each coordinate with its own MinMaxScaler. The fitted scalers are kept
# because later cells call inverse_transform on them to map forecasts back to the
# original coordinate values.
# NOTE(review): the scalers are fitted on every point, including the final points
# of each trip that are later held out as the forecast target — confirm that this
# (mild) leakage is acceptable for the evaluation.
scaler_lat = MinMaxScaler()
df_lat['LAT'] = scaler_lat.fit_transform(df_lat[['LAT']])

scaler_lon = MinMaxScaler()
df_lon['LON'] = scaler_lon.fit_transform(df_lon[['LON']])

In [12]:
# Interpret TIMESTAMP as a count of seconds (unit='s') and convert it to pandas datetimes.
# NOTE(review): not idempotent — re-running this cell re-parses already converted
# values with unit='s'.
df_lat['TIMESTAMP'] = pd.to_datetime(df_lat['TIMESTAMP'], unit='s')
df_lon['TIMESTAMP'] = pd.to_datetime(df_lon['TIMESTAMP'], unit='s')

In [13]:
# Rename to the column names read by get_taxi_data_fn: `unique_id` (trip), `ds`
# (time), `y` (target coordinate), plus `origin_call` and `taxi_id`.
df_lon = df_lon.rename(columns={'TIMESTAMP': 'ds', 'LON': 'y', 'TRIP_ID': 'unique_id', 'ORIGIN_CALL': 'origin_call', 'TAXI_ID': 'taxi_id'})
df_lat = df_lat.rename(columns={'TIMESTAMP': 'ds', 'LAT': 'y', 'TRIP_ID': 'unique_id', 'ORIGIN_CALL': 'origin_call', 'TAXI_ID': 'taxi_id'})

In [14]:
from collections import defaultdict


def get_taxi_data_fn(df, batch_size: int = 256, horizon_len: int = 20):
    """Split every trip into context/horizon and return a batched example generator.

    For each ``unique_id`` group the last ``horizon_len`` values of ``y`` become
    the target ("outputs") and everything before them becomes the model context
    ("inputs"). Trips with ``horizon_len`` rows or fewer are skipped because they
    would leave no context.

    Rows are used in the order they appear inside each group; this function does
    not sort, so ``df`` must already be in chronological order within a trip.
    Groups are visited in ascending ``unique_id`` order (pandas ``groupby``
    default), which makes the batch order deterministic for a given ``df``.

    Args:
        df: Frame with columns ``unique_id``, ``origin_call``, ``taxi_id``,
            ``y`` and ``ds``.
        batch_size: Maximum number of trips per yielded batch; must be positive.
        horizon_len: Number of trailing points held out per trip; must be
            positive.

    Returns:
        A zero-argument function. Each call returns a fresh generator that
        yields dicts with the keys ``unique_id``, ``origin_call``, ``taxi_id``,
        ``inputs``, ``outputs`` and ``ds``; every value is a list with one entry
        per trip in the batch. No batch is yielded when no trip qualifies.

    Raises:
        ValueError: If ``batch_size`` or ``horizon_len`` is not positive.
    """
    # horizon_len == 0 would silently produce empty contexts (iloc[:-0]) and the
    # whole series as "horizon" (iloc[-0:]); batch_size <= 0 cannot form batches.
    if horizon_len <= 0:
        raise ValueError(f"horizon_len must be positive, got {horizon_len}")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    examples = defaultdict(list)
    num_examples = 0

    for ride_id, sub_df in df.groupby("unique_id"):
        if len(sub_df) <= horizon_len:
            continue

        context_df = sub_df.iloc[:-horizon_len]
        horizon_df = sub_df.iloc[-horizon_len:]

        num_examples += 1
        examples["unique_id"].append(ride_id)
        # Static per-trip attributes: taken from the first row of the trip.
        examples["origin_call"].append(sub_df["origin_call"].iloc[0])
        examples["taxi_id"].append(sub_df["taxi_id"].iloc[0])

        examples["inputs"].append(context_df["y"].tolist())
        examples["outputs"].append(horizon_df["y"].tolist())
        # Timestamps of the full trip (context and horizon together).
        examples["ds"].append(sub_df["ds"].tolist())

    def data_fn():
        # Walk the example lists in steps of batch_size; the last batch may be short.
        for start in range(0, num_examples, batch_size):
            yield {
                key: values[start : start + batch_size]
                for key, values in examples.items()
            }

    return data_fn

In [15]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
import time

# Forecast the latitude series batch by batch with `tfm_lat`.
# `forecast_df_lat` ends up as a list with one forecast result per batch (despite
# the `df` in its name it is not a DataFrame).
input_data = get_taxi_data_fn(df_lat)
forecast_df_lat = []

for i, example in enumerate(input_data()):
  start_time = time.time()
  # Only the first element of the returned pair is kept; the second is discarded.
  cov_forecast, _ = tfm_lat.forecast_with_covariates(
      inputs=example["inputs"],
      # NOTE(review): origin_call and taxi_id are identifiers, yet they are passed
      # as *numerical* covariates, i.e. treated as magnitudes by the regression —
      # confirm this is intended (categorical covariates may fit better).
      static_numerical_covariates={
          "origin_call": example["origin_call"],
          "taxi_id": example["taxi_id"],
      },
      # Same frequency category (0) for every series in the batch.
      freq=[0] * len(example["inputs"]),
      xreg_mode="xreg + timesfm",
      # NOTE(review): ridge=0.0 disables regularisation of the covariate
      # regression — confirm it is numerically stable for these covariates.
      ridge=0.0,
      force_on_cpu=False,
      normalize_xreg_target_per_input=True,
  )
  # "\r" + end="" rewrites a single progress line instead of printing one per batch.
  # NOTE(review): the word "linear" in the message looks copied from an example and
  # does not describe this run.
  print(
      f"\rFinished batch {i} linear in {time.time() - start_time} seconds",
      end="",
  )
  forecast_df_lat.append(cov_forecast)

# Terminate the progress line.
print()

Finished batch 1114 linear in 2.8383007049560547 seconds


In [16]:
# NOTE(review): this cell duplicates the latitude forecasting cell with only the
# frame/model/result names changed; a helper taking (frame, model) would remove
# the copy. The repeated `import time` is redundant.
import time

# Forecast the longitude series batch by batch with `tfm_lon`.
# `forecast_df_lon` ends up as a list with one forecast result per batch.
input_data = get_taxi_data_fn(df_lon)
forecast_df_lon = []

for i, example in enumerate(input_data()):
  start_time = time.time()
  # Only the first element of the returned pair is kept; the second is discarded.
  cov_forecast, _ = tfm_lon.forecast_with_covariates(
      inputs=example["inputs"],
      # NOTE(review): identifiers passed as *numerical* covariates — confirm intent.
      static_numerical_covariates={
          "origin_call": example["origin_call"],
          "taxi_id": example["taxi_id"],
      },
      # Same frequency category (0) for every series in the batch.
      freq=[0] * len(example["inputs"]),
      xreg_mode="xreg + timesfm",
      # NOTE(review): ridge=0.0 disables regularisation of the covariate regression.
      ridge=0.0,
      force_on_cpu=False,
      normalize_xreg_target_per_input=True,
  )
  # "\r" + end="" rewrites a single progress line instead of printing one per batch.
  print(
      f"\rFinished batch {i} linear in {time.time() - start_time} seconds",
      end="",
  )
  forecast_df_lon.append(cov_forecast)

# Terminate the progress line.
print()

Finished batch 1114 linear in 1.9477241039276123 seconds


In [17]:
def forecasts_to_df(forecasts, examples, coord_name):
    """Flatten one batch of per-trip forecasts into a long DataFrame.

    Args:
        forecasts: Iterable with one sequence of forecast values per trip, in
            the same order as the lists in ``examples``.
        examples: Mapping with the keys ``unique_id``, ``origin_call`` and
            ``taxi_id``; each value is a list with one entry per trip. Other
            keys (e.g. ``inputs``) are ignored.
        coord_name: Suffix for the value column, which is named
            ``yhat_<coord_name>``.

    Returns:
        DataFrame with the columns ``unique_id``, ``origin_call``, ``taxi_id``,
        ``horizon_step`` (1-based position inside the forecast) and
        ``yhat_<coord_name>``; one row per trip and horizon step. The columns
        are present even when there is nothing to flatten.

    Raises:
        ValueError: If the number of forecasts differs from the number of trips
            in ``examples`` (a plain ``zip`` would silently drop the surplus).
    """
    value_column = f"yhat_{coord_name}"
    columns = ["unique_id", "origin_call", "taxi_id", "horizon_step", value_column]

    forecasts = list(forecasts)
    trip_ids = examples["unique_id"]
    origin_calls = examples["origin_call"]
    taxi_ids = examples["taxi_id"]

    sizes = {
        "forecasts": len(forecasts),
        "unique_id": len(trip_ids),
        "origin_call": len(origin_calls),
        "taxi_id": len(taxi_ids),
    }
    if len(set(sizes.values())) != 1:
        raise ValueError(f"forecasts and examples are misaligned: {sizes}")

    records = [
        {
            "unique_id": uid,
            "origin_call": call,
            "taxi_id": taxi,
            "horizon_step": step,
            value_column: value,
        }
        for arr, uid, call, taxi in zip(forecasts, trip_ids, origin_calls, taxi_ids)
        for step, value in enumerate(arr, start=1)
    ]
    # Passing `columns` keeps the schema stable for an empty batch.
    return pd.DataFrame(records, columns=columns)

In [18]:
# Pair every batch of examples with its latitude forecast and flatten to one long frame.
# NOTE(review): get_taxi_data_fn(df_lat) is called again here, repeating the
# per-trip groupby already done for forecasting. The pairing with
# `forecast_df_lat` is purely positional, so it relies on both passes producing
# the batches in the same order.
input_data = get_taxi_data_fn(df_lat)
# NOTE(review): `df_lat_forecast` is first a list of frames and then rebound to
# the concatenated DataFrame.
df_lat_forecast = []
for example, forecast in zip(input_data(), forecast_df_lat):
    df_lat_temp = forecasts_to_df(forecast, example, coord_name="lat")
    df_lat_forecast.append(df_lat_temp)
df_lat_forecast = pd.concat(df_lat_forecast, ignore_index=True)
df_lat_forecast

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat
0,1372637254620000657,39233.0,20000657,1,0.585462
1,1372637254620000657,39233.0,20000657,2,0.585471
2,1372637254620000657,39233.0,20000657,3,0.585486
3,1372637254620000657,39233.0,20000657,4,0.585513
4,1372637254620000657,39233.0,20000657,5,0.585509
...,...,...,...,...,...
5704575,1404172319620000571,4650.0,20000571,16,0.575430
5704576,1404172319620000571,4650.0,20000571,17,0.575442
5704577,1404172319620000571,4650.0,20000571,18,0.575414
5704578,1404172319620000571,4650.0,20000571,19,0.575407


In [19]:
# Pair every batch of examples with its longitude forecast and flatten to one long frame.
# NOTE(review): duplicate of the latitude cell; get_taxi_data_fn(df_lon) repeats
# the per-trip groupby, and the pairing with `forecast_df_lon` is positional.
input_data = get_taxi_data_fn(df_lon)
# NOTE(review): `df_lon_forecast` is first a list of frames and then rebound to
# the concatenated DataFrame.
df_lon_forecast = []
for example, forecast in zip(input_data(), forecast_df_lon):
    df_lon_temp = forecasts_to_df(forecast, example, coord_name="lon")
    df_lon_forecast.append(df_lon_temp)
df_lon_forecast = pd.concat(df_lon_forecast, ignore_index=True)
df_lon_forecast

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lon
0,1372637254620000657,39233.0,20000657,1,0.638454
1,1372637254620000657,39233.0,20000657,2,0.638667
2,1372637254620000657,39233.0,20000657,3,0.638778
3,1372637254620000657,39233.0,20000657,4,0.638917
4,1372637254620000657,39233.0,20000657,5,0.639057
...,...,...,...,...,...
5704575,1404172319620000571,4650.0,20000571,16,0.640514
5704576,1404172319620000571,4650.0,20000571,17,0.640549
5704577,1404172319620000571,4650.0,20000571,18,0.640562
5704578,1404172319620000571,4650.0,20000571,19,0.640543


In [20]:
# Map the scaled forecasts back to the original coordinate values and join the
# latitude and longitude forecasts into one row per (trip, horizon step).
#
# The inverse transform is applied to copies created by `assign` instead of
# overwriting `df_lat_forecast` / `df_lon_forecast` in place: the in-place version
# inverse-transformed already unscaled values whenever this cell was re-run.
lat_forecast_unscaled = df_lat_forecast.assign(
    yhat_lat=scaler_lat.inverse_transform(df_lat_forecast[["yhat_lat"]]).ravel()
)
lon_forecast_unscaled = df_lon_forecast.assign(
    yhat_lon=scaler_lon.inverse_transform(df_lon_forecast[["yhat_lon"]]).ravel()
)
# validate="one_to_one" makes pandas raise if a key combination is duplicated on
# either side, instead of silently multiplying rows.
predictions = pd.merge(
    lat_forecast_unscaled,
    lon_forecast_unscaled,
    on=['unique_id', 'origin_call', 'taxi_id', 'horizon_step'],
    how='left',
    validate="one_to_one",
)
predictions

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat,yhat_lon
0,1372637254620000657,39233.0,20000657,1,41.180661,-8.628694
1,1372637254620000657,39233.0,20000657,2,41.180692,-8.627177
2,1372637254620000657,39233.0,20000657,3,41.180740,-8.626384
3,1372637254620000657,39233.0,20000657,4,41.180828,-8.625398
4,1372637254620000657,39233.0,20000657,5,41.180816,-8.624404
...,...,...,...,...,...,...
5704575,1404172319620000571,4650.0,20000571,16,41.147688,-8.614034
5704576,1404172319620000571,4650.0,20000571,17,41.147728,-8.613786
5704577,1404172319620000571,4650.0,20000571,18,41.147634,-8.613692
5704578,1404172319620000571,4650.0,20000571,19,41.147612,-8.613827


In [21]:
# Give the generic target column `y` a coordinate-specific name in each frame so
# the two frames can be merged without a column-name clash.
# NOTE(review): after this cell the frames no longer have a `y` column, so
# get_taxi_data_fn cannot be re-run on them without re-running the earlier cells.
df_lat = df_lat.rename(columns={"y": "lat"})
df_lon = df_lon.rename(columns={"y": "lon"})

In [22]:
# Ground truth: the last row of every trip (in current row order, i.e. the latest
# timestamp given the earlier sort). This is also the final element of the
# horizon held out by get_taxi_data_fn.
df_lon_test = df_lon.groupby('unique_id', group_keys=False).tail(1)
df_lat_test = df_lat.groupby('unique_id', group_keys=False).tail(1)
# NOTE(review): how='left' would leave NaN in `lon` for any latitude row without a
# longitude match instead of failing; both frames derive from the same rows so
# none is expected, but an assertion would make that explicit.
test = pd.merge(df_lat_test, df_lon_test, on=['unique_id', 'origin_call', 'taxi_id', 'ds'], how='left')
test

Unnamed: 0,unique_id,origin_call,taxi_id,ds,lat,lon
0,1372637254620000657,39233.0,20000657,2013-07-01 00:18:04,0.585812,0.642220
1,1372637343620000571,31508.0,20000571,2013-07-01 00:16:48,0.579865,0.645989
2,1372637397620000190,2002.0,20000190,2013-07-01 00:23:42,0.584413,0.648225
3,1372638303620000112,63882.0,20000112,2013-07-01 00:53:48,0.579917,0.644292
4,1372638451620000621,59594.0,20000621,2013-07-01 00:36:01,0.589312,0.646904
...,...,...,...,...,...,...
285224,1404170795620000242,35193.0,20000242,2014-06-30 23:34:50,0.584167,0.636203
285225,1404171078620000455,58509.0,20000455,2014-06-30 23:44:18,0.576346,0.641099
285226,1404171293620000565,46207.0,20000565,2014-06-30 23:43:23,0.573403,0.640410
285227,1404172208620000196,30020.0,20000196,2014-06-30 23:59:23,0.584315,0.643616


In [23]:
# Map the scaled ground-truth coordinates back to their original values.
# NOTE(review): not idempotent — the columns are overwritten in place, so running
# this cell twice applies the inverse transform twice.
test["lat"] = scaler_lat.inverse_transform(test[["lat"]])
test["lon"] = scaler_lon.inverse_transform(test[["lon"]])
test

Unnamed: 0,unique_id,origin_call,taxi_id,ds,lat,lon
0,1372637254620000657,39233.0,20000657,2013-07-01 00:18:04,41.181813,-8.601894
1,1372637343620000571,31508.0,20000571,2013-07-01 00:16:48,41.162265,-8.575065
2,1372637397620000190,2002.0,20000190,2013-07-01 00:23:42,41.177214,-8.559153
3,1372638303620000112,63882.0,20000112,2013-07-01 00:53:48,41.162436,-8.587143
4,1372638451620000621,59594.0,20000621,2013-07-01 00:36:01,41.193315,-8.568558
...,...,...,...,...,...,...
285224,1404170795620000242,35193.0,20000242,2014-06-30 23:34:50,41.176404,-8.644716
285225,1404171078620000455,58509.0,20000455,2014-06-30 23:44:18,41.150700,-8.609868
285226,1404171293620000565,46207.0,20000565,2014-06-30 23:43:23,41.141025,-8.614773
285227,1404172208620000196,30020.0,20000196,2014-06-30 23:59:23,41.176890,-8.591958


In [24]:
# Keep only the last forecast row of every trip.
# NOTE(review): "last" is positional — it relies on the rows of each trip being
# stored in ascending horizon_step order; filtering on horizon_step explicitly
# would not depend on row order.
predictions = predictions.groupby('unique_id', group_keys=False).tail(1)
predictions

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat,yhat_lon
19,1372637254620000657,39233.0,20000657,20,41.178299,-8.616890
39,1372637343620000571,31508.0,20000571,20,41.156000,-8.610107
59,1372637397620000190,2002.0,20000190,20,41.152702,-8.620771
79,1372638303620000112,63882.0,20000112,20,41.163426,-8.588996
99,1372638451620000621,59594.0,20000621,20,41.183778,-8.587079
...,...,...,...,...,...,...
5704499,1404170795620000242,35193.0,20000242,20,41.162703,-8.627589
5704519,1404171078620000455,58509.0,20000455,20,41.160104,-8.603283
5704539,1404171293620000565,46207.0,20000565,20,41.152473,-8.613018
5704559,1404172208620000196,30020.0,20000196,20,41.183136,-8.600233


In [25]:
# Reduce both frames to the trip id plus coordinates and join truth with forecast.
# NOTE(review): not re-runnable — the second run fails because the dropped columns
# no longer exist in `predictions` / `test`.
predictions = predictions.drop(['origin_call', 'taxi_id', 'horizon_step'], axis=1)
test = test.drop(['origin_call', 'taxi_id', 'ds'], axis=1)
# NOTE(review): this rebinds `df`, which until now held the filtered source data;
# a distinct name (e.g. `results`) would avoid breaking earlier cells on re-run.
df = pd.merge(test, predictions, on='unique_id', how='left')
df

Unnamed: 0,unique_id,lat,lon,yhat_lat,yhat_lon
0,1372637254620000657,41.181813,-8.601894,41.178299,-8.616890
1,1372637343620000571,41.162265,-8.575065,41.156000,-8.610107
2,1372637397620000190,41.177214,-8.559153,41.152702,-8.620771
3,1372638303620000112,41.162436,-8.587143,41.163426,-8.588996
4,1372638451620000621,41.193315,-8.568558,41.183778,-8.587079
...,...,...,...,...,...
285224,1404170795620000242,41.176404,-8.644716,41.162703,-8.627589
285225,1404171078620000455,41.150700,-8.609868,41.160104,-8.603283
285226,1404171293620000565,41.141025,-8.614773,41.152473,-8.613018
285227,1404172208620000196,41.176890,-8.591958,41.183136,-8.600233


In [26]:
# NOTE(review): mid-notebook import of a project-local module; consider moving it
# to the import cell at the top.
from evaluation_script import haversine_distance
# Per-trip error between the true and the forecast final position.
# Arguments are passed as (lat, lon, yhat_lat, yhat_lon); the unit of the result
# is defined by `haversine_distance` — presumably kilometres, confirm in
# evaluation_script.
# NOTE(review): row-wise `apply` calls the helper once per row; if the helper
# accepts arrays, passing whole columns would be much faster.
df['distance_delta'] = df.apply(lambda row: haversine_distance(
    row['lat'], row['lon'], row['yhat_lat'], row['yhat_lon']), axis=1)
# Mean error over all trips.
df['distance_delta'].mean()

np.float64(1.5993295468439943)

In [27]:
# Median error over all trips (reported next to the mean above).
df['distance_delta'].median()

np.float64(1.2971063862604717)

In [28]:
# Inspect the per-trip errors.
# NOTE(review): `.describe()` would summarise the distribution more usefully than
# the truncated Series repr.
df['distance_delta']

0         1.314440
1         3.015171
2         5.833915
3         0.190241
4         1.877892
            ...   
285224    2.091943
285225    1.182103
285226    1.281413
285227    0.980836
285228    2.336978
Name: distance_delta, Length: 285229, dtype: float64

In [29]:
# Persist truth, forecast and per-trip error. The file is written to the current
# working directory.
df.to_parquet("results.parquet")

In [30]:
# Number of forecast batches collected for longitude.
len(forecast_df_lon)

1115

In [49]:
# Cross-check: extract the final forecast step of every trip directly from the raw
# batch results, bypassing the DataFrame pipeline above.
# NOTE(review): the execution counts of this and the following cells are out of
# order; verify the section with Restart Kernel -> Run All.
# One entry per batch (len(forecast_df_lon) batches).
last_lon_coord = []
for batch in forecast_df_lon:
    # One forecast per ride; at most 256 rides per batch (the last batch may be smaller).
    for ride in batch:
        # `ride` holds the forecast values for one trip;
        # keep only its final value.
        last_coord = ride[-1]
        last_lon_coord.append(last_coord)

In [50]:
# Same extraction as for longitude: final forecast step of every trip, read
# directly from the raw batch results.
# One entry per batch (len(forecast_df_lat) batches).
last_lat_coord = []
for batch in forecast_df_lat:
    # One forecast per ride; at most 256 rides per batch (the last batch may be smaller).
    for ride in batch:
        # `ride` holds the forecast values for one trip;
        # keep only its final value.
        last_coord = ride[-1]
        last_lat_coord.append(last_coord)

In [41]:
# Inspect the ground-truth frame (by now reduced to unique_id, lat, lon).
# NOTE(review): this cell's execution count is lower than that of the cells
# before it, i.e. it was run out of order.
test

Unnamed: 0,unique_id,lat,lon
0,1372637254620000657,41.181813,-8.601894
1,1372637343620000571,41.162265,-8.575065
2,1372637397620000190,41.177214,-8.559153
3,1372638303620000112,41.162436,-8.587143
4,1372638451620000621,41.193315,-8.568558
...,...,...,...
285224,1404170795620000242,41.176404,-8.644716
285225,1404171078620000455,41.150700,-8.609868
285226,1404171293620000565,41.141025,-8.614773
285227,1404172208620000196,41.176890,-8.591958


In [51]:
# Sanity check: one extracted value per trip, so this should equal len(test).
len(last_lon_coord)

285229

In [52]:
# Work on a copy so the cross-check does not modify `test`.
double_check_df = test.copy()

In [53]:
# NOTE(review): mid-notebook import; consider moving it to the import cell at the top.
import numpy as np
# Convert the collected per-trip values to arrays for column assignment.
last_lon_coord_np = np.array(last_lon_coord)
last_lat_coord_np = np.array(last_lat_coord)

In [54]:
# Inspect the extracted values (still in the scaled range at this point).
last_lon_coord_np

array([0.64011253, 0.64106561, 0.63956716, ..., 0.64065662, 0.64245299,
       0.64048363], shape=(285229,))

In [55]:
# Attach the extracted forecasts by position.
# NOTE(review): no key is used here — this is only correct if the rows of
# `double_check_df` are in the same trip order as the forecast batches. A length
# mismatch would raise, but a different ordering would go unnoticed.
double_check_df["yhat_lon"] = last_lon_coord_np
double_check_df["yhat_lat"] = last_lat_coord_np

In [56]:
# Inspect: truth columns are in original coordinate values, yhat_* still scaled.
double_check_df

Unnamed: 0,unique_id,lat,lon,yhat_lon,yhat_lat
0,1372637254620000657,41.181813,-8.601894,0.640113,0.584743
1,1372637343620000571,41.162265,-8.575065,0.641066,0.577959
2,1372637397620000190,41.177214,-8.559153,0.639567,0.576955
3,1372638303620000112,41.162436,-8.587143,0.644032,0.580218
4,1372638451620000621,41.193315,-8.568558,0.644301,0.586410
...,...,...,...,...,...
285224,1404170795620000242,41.176404,-8.644716,0.638609,0.579998
285225,1404171078620000455,41.150700,-8.609868,0.642024,0.579208
285226,1404171293620000565,41.141025,-8.614773,0.640657,0.576886
285227,1404172208620000196,41.176890,-8.591958,0.642453,0.586215


In [57]:
# Map the extracted forecasts back to the original coordinate values.
# NOTE(review): not idempotent — the columns are overwritten in place, so running
# this cell twice applies the inverse transform twice.
double_check_df["yhat_lat"] = scaler_lat.inverse_transform(double_check_df[["yhat_lat"]])
double_check_df["yhat_lon"] = scaler_lon.inverse_transform(double_check_df[["yhat_lon"]])
double_check_df

Unnamed: 0,unique_id,lat,lon,yhat_lon,yhat_lat
0,1372637254620000657,41.181813,-8.601894,-8.616890,41.178299
1,1372637343620000571,41.162265,-8.575065,-8.610107,41.156000
2,1372637397620000190,41.177214,-8.559153,-8.620771,41.152702
3,1372638303620000112,41.162436,-8.587143,-8.588996,41.163426
4,1372638451620000621,41.193315,-8.568558,-8.587079,41.183778
...,...,...,...,...,...
285224,1404170795620000242,41.176404,-8.644716,-8.627589,41.162703
285225,1404171078620000455,41.150700,-8.609868,-8.603283,41.160104
285226,1404171293620000565,41.141025,-8.614773,-8.613018,41.152473
285227,1404172208620000196,41.176890,-8.591958,-8.600233,41.183136


In [58]:
# Recompute the per-trip error on the cross-check frame with the same helper and
# argument order as in the main evaluation.
double_check_df['distance_delta'] = double_check_df.apply(lambda row: haversine_distance(
    row['lat'], row['lon'], row['yhat_lat'], row['yhat_lon']), axis=1)

In [59]:
# Should equal the mean obtained from the merged results frame; an explicit
# assertion (e.g. np.isclose against that value) would make the check self-verifying.
double_check_df['distance_delta'].mean()

np.float64(1.5993295468439943)

In [60]:
# Should equal the median obtained from the merged results frame.
double_check_df['distance_delta'].median()

np.float64(1.2971063862604717)