In [1]:
import pandas as pd

In [2]:
# Load the source table from a parquet file. The path is relative to the
# notebook's working directory, so the notebook must be run from its own folder.
df = pd.read_parquet("../../../data/data_taxi_central_based.parquet")

In [3]:
# Collapse the table to one row per TRIP_ID using groupby(...).first().
# NOTE(review): `first_rows` is not referenced by any later cell visible in this
# notebook -- confirm whether this (potentially expensive) cell is still needed.
first_rows = df.groupby('TRIP_ID').first().reset_index()

In [4]:
# Keep only the rows whose POLYLINE_LENGTH equals 41.
# NOTE(review): presumably POLYLINE_LENGTH is the number of recorded points of
# the trip, so every selected trip has the same length -- confirm. The value 41
# is a magic number; consider naming it as a constant.
df_41 = df[df['POLYLINE_LENGTH'] == 41]

In [5]:
import timesfm

 See https://github.com/google-research/timesfm/blob/master/README.md for updated APIs.
Loaded PyTorch TimesFM, likely because python version is 3.11.13 | packaged by Anaconda, Inc. | (main, Jun  5 2025, 13:03:15) [MSC v.1929 64 bit (AMD64)].


In [6]:
# Split the selected trips into one frame per coordinate. Each frame drops
# POLYLINE_LENGTH plus the *other* coordinate: the longitude frame drops LAT,
# the latitude frame drops LON.
df_41_lon, df_41_lat = (
    df_41.drop(columns=['POLYLINE_LENGTH', other_coord])
    for other_coord in ('LAT', 'LON')
)

In [7]:
# TimesFM model instance used later for the latitude forecasts
# (tfm_lat.forecast_with_covariates). The checkpoint is fetched from the
# Hugging Face repo named below.
# horizon_len=10 matches the default horizon_len of get_taxi_data_fn.
# NOTE(review): backend is "cpu" here but "gpu" for tfm_lon, while every other
# hyperparameter and the checkpoint are identical -- confirm the mismatch is
# intended.
tfm_lat = timesfm.TimesFm(
      hparams=timesfm.TimesFmHparams(
          backend="cpu",
          per_core_batch_size=32,
          horizon_len=10,
          num_layers=50,
          use_positional_embedding=False,
          context_len=2048,
      ),
      checkpoint=timesfm.TimesFmCheckpoint(
          huggingface_repo_id="google/timesfm-2.0-500m-pytorch"),
  )

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [8]:
# TimesFM model instance used later for the longitude forecasts
# (tfm_lon.forecast_with_covariates). Same checkpoint and hyperparameters as
# tfm_lat except for the backend.
# NOTE(review): backend is "gpu" here but "cpu" for tfm_lat -- confirm the
# mismatch is intended. Since the configuration is otherwise identical, this
# also loads the same checkpoint a second time; a single shared instance may be
# enough -- verify against the timesfm API before changing.
tfm_lon = timesfm.TimesFm(
      hparams=timesfm.TimesFmHparams(
          backend="gpu",
          per_core_batch_size=32,
          horizon_len=10,
          num_layers=50,
          use_positional_embedding=False,
          context_len=2048,
      ),
      checkpoint=timesfm.TimesFmCheckpoint(
          huggingface_repo_id="google/timesfm-2.0-500m-pytorch"),
  )

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [9]:
# Order the latitude rows by trip and then by time. get_taxi_data_fn splits each
# trip into context/horizon purely by row position, so rows must be
# chronological within a trip.
df_41_lat = df_41_lat.sort_values(by=['TRIP_ID', 'TIMESTAMP'])

In [10]:
# Order the longitude rows by trip and then by time. get_taxi_data_fn splits
# each trip into context/horizon purely by row position, so rows must be
# chronological within a trip.
df_41_lon = df_41_lon.sort_values(by=['TRIP_ID', 'TIMESTAMP'])

In [11]:
# Min-max scale each coordinate with its own scaler. The fitted scalers are kept
# because later cells call inverse_transform on them to get coordinates back.
# NOTE(review): LAT/LON are overwritten in place, so this cell is not safe to
# re-run on its own -- a second run would refit the scalers on already-scaled
# values and inverse_transform would no longer recover coordinates.
# NOTE(review): the scalers are fit on every row, including the last points of
# each trip that get_taxi_data_fn later holds out as forecast targets -- confirm
# this is acceptable for the evaluation.
from sklearn.preprocessing import MinMaxScaler
scaler_lat = MinMaxScaler()
df_41_lat['LAT'] = scaler_lat.fit_transform(df_41_lat[['LAT']])

scaler_lon = MinMaxScaler()
df_41_lon['LON'] = scaler_lon.fit_transform(df_41_lon[['LON']])

In [12]:
# Convert TIMESTAMP from epoch seconds (unit='s') to pandas datetimes in both
# coordinate frames.
df_41_lat['TIMESTAMP'] = pd.to_datetime(df_41_lat['TIMESTAMP'], unit='s')
df_41_lon['TIMESTAMP'] = pd.to_datetime(df_41_lon['TIMESTAMP'], unit='s')

In [13]:
# Rename the columns to the names get_taxi_data_fn reads: unique_id (trip),
# ds (timestamp), y (the scaled coordinate), origin_call and taxi_id.
# The two mappings differ only in which coordinate column becomes `y`.
df_41_lon = df_41_lon.rename(columns={'TIMESTAMP': 'ds', 'LON': 'y', 'TRIP_ID': 'unique_id', 'ORIGIN_CALL': 'origin_call', 'TAXI_ID': 'taxi_id'})
df_41_lat = df_41_lat.rename(columns={'TIMESTAMP': 'ds', 'LAT': 'y', 'TRIP_ID': 'unique_id', 'ORIGIN_CALL': 'origin_call', 'TAXI_ID': 'taxi_id'})

In [14]:
from collections import defaultdict


def get_taxi_data_fn(df, batch_size: int = 128, horizon_len: int = 10):
    """Prepare per-trip forecasting examples and return a batch generator factory.

    Rows of ``df`` are grouped by ``unique_id``. For every group with more than
    ``horizon_len`` rows, the ``y`` values are split by row position: the last
    ``horizon_len`` values become the example's ``outputs`` and everything
    before them its ``inputs``. Groups with ``horizon_len`` rows or fewer are
    skipped. Rows are used in the order they appear in ``df``; no sorting is
    done here.

    Args:
        df: DataFrame with columns ``unique_id``, ``origin_call``, ``taxi_id``,
            ``ds`` and ``y``.
        batch_size: Maximum number of examples per yielded batch.
        horizon_len: Number of trailing rows per trip held out as ``outputs``.

    Returns:
        A zero-argument function. Each call returns a generator yielding dicts
        with the keys ``unique_id``, ``origin_call``, ``taxi_id``, ``inputs``,
        ``outputs`` and ``ds``; every value is a list with one entry per example
        in the batch. ``origin_call`` and ``taxi_id`` are taken from the first
        row of the trip, and ``ds`` holds the timestamps of the whole trip.
    """
    field_names = ("unique_id", "origin_call", "taxi_id", "inputs", "outputs", "ds")
    collected = {name: [] for name in field_names}

    for trip_id, trip_rows in df.groupby("unique_id"):
        # A trip needs at least one row left over as context after the split.
        if not len(trip_rows) > horizon_len:
            continue

        collected["unique_id"].append(trip_id)
        collected["origin_call"].append(trip_rows["origin_call"].iloc[0])
        collected["taxi_id"].append(trip_rows["taxi_id"].iloc[0])

        targets = trip_rows["y"].tolist()
        collected["inputs"].append(targets[:-horizon_len])
        collected["outputs"].append(targets[-horizon_len:])
        collected["ds"].append(trip_rows["ds"].tolist())

    total = len(collected["unique_id"])

    def data_fn():
        # Ceiling division: number of batches needed to cover `total` examples.
        batch_count = 1 + (total - 1) // batch_size
        for batch_idx in range(batch_count):
            lo = batch_idx * batch_size
            hi = lo + batch_size
            yield {name: values[lo:hi] for name, values in collected.items()}

    return data_fn

In [17]:
# Forecast the latitude series batch by batch with tfm_lat.
# get_taxi_data_fn is called with its defaults (batch_size=128, horizon_len=10),
# so each batch holds the context values ("inputs") of up to 128 trips.
# forecast_df_lat collects one forecast object per batch, in batch order.
import time

input_data = get_taxi_data_fn(df_41_lat)
forecast_df_lat = []

for i, example in enumerate(input_data()):
  start_time = time.time()
  # Only the first element of the returned pair is kept; the second is discarded.
  # NOTE(review): origin_call and taxi_id look like identifiers rather than
  # quantities -- confirm that feeding them as numerical covariates is intended.
  # NOTE(review): freq=0 for every series; presumably the TimesFM frequency
  # category for high-frequency data -- confirm against the timesfm docs.
  cov_forecast, _ = tfm_lat.forecast_with_covariates(
      inputs=example["inputs"],
      static_numerical_covariates={
          "origin_call": example["origin_call"],
          "taxi_id": example["taxi_id"],
      },
      freq=[0] * len(example["inputs"]),
      xreg_mode="xreg + timesfm",
      ridge=0.0,
      force_on_cpu=False,
      normalize_xreg_target_per_input=True,
  )
  # "\r" with end="" rewrites the same output line, so only the last batch's
  # timing stays visible.
  print(
      f"\rFinished batch {i} linear in {time.time() - start_time} seconds",
      end="",
  )
  forecast_df_lat.append(cov_forecast)

# Terminate the progress line.
print()

Finished batch 53 linear in 14.165938138961792 seconds


In [18]:
# Forecast the longitude series batch by batch with tfm_lon. This is the same
# procedure as the latitude cell, applied to df_41_lon.
# get_taxi_data_fn is called with its defaults (batch_size=128, horizon_len=10).
# forecast_df_lon collects one forecast object per batch, in batch order.
# NOTE(review): this cell duplicates the latitude cell except for the frame,
# model and result names -- a shared helper function would remove the copy.
import time

input_data = get_taxi_data_fn(df_41_lon)
forecast_df_lon = []

for i, example in enumerate(input_data()):
  start_time = time.time()
  # Only the first element of the returned pair is kept; the second is discarded.
  # NOTE(review): origin_call and taxi_id look like identifiers rather than
  # quantities -- confirm that feeding them as numerical covariates is intended.
  cov_forecast, _ = tfm_lon.forecast_with_covariates(
      inputs=example["inputs"],
      static_numerical_covariates={
          "origin_call": example["origin_call"],
          "taxi_id": example["taxi_id"],
      },
      freq=[0] * len(example["inputs"]),
      xreg_mode="xreg + timesfm",
      ridge=0.0,
      force_on_cpu=False,
      normalize_xreg_target_per_input=True,
  )
  # "\r" with end="" rewrites the same output line, so only the last batch's
  # timing stays visible.
  print(
      f"\rFinished batch {i} linear in {time.time() - start_time} seconds",
      end="",
  )
  forecast_df_lon.append(cov_forecast)

# Terminate the progress line.
print()

Finished batch 53 linear in 14.403236150741577 seconds


In [33]:
def forecasts_to_df(forecasts, examples, coord_name):
    """Flatten one batch of forecasts into a long-format DataFrame.

    Args:
        forecasts: Iterable with one sequence of forecast values per example,
            in the same order as the lists in ``examples``.
        examples: Mapping with the keys ``unique_id``, ``origin_call`` and
            ``taxi_id``, each a list with one entry per example (a batch as
            yielded by ``get_taxi_data_fn``). Other keys are ignored.
        coord_name: Suffix of the value column, e.g. ``"lat"`` produces the
            column ``yhat_lat``.

    Returns:
        DataFrame with one row per (example, horizon step) and the columns
        ``unique_id``, ``origin_call``, ``taxi_id``, ``horizon_step`` (1-based)
        and ``yhat_<coord_name>``. The columns are present even when there is
        nothing to flatten, so the result can always be concatenated or merged.
    """
    value_col = f"yhat_{coord_name}"
    columns = ["unique_id", "origin_call", "taxi_id", "horizon_step", value_col]

    # Only the identifier lists are needed here; the context values
    # (examples["inputs"]) play no part in the output and are not read.
    records = [
        {
            "unique_id": uid,
            "origin_call": call,
            "taxi_id": taxi,
            "horizon_step": step,
            value_col: value,
        }
        for arr, uid, call, taxi in zip(
            forecasts,
            examples["unique_id"],
            examples["origin_call"],
            examples["taxi_id"],
        )
        for step, value in enumerate(arr, start=1)
    ]
    # Passing `columns` keeps the schema stable for an empty `records` list.
    return pd.DataFrame(records, columns=columns)

In [34]:
# Rebuild the latitude batches with the same get_taxi_data_fn defaults used for
# forecasting, so that batch k here lines up with forecast_df_lat[k]. Each batch
# is flattened with forecasts_to_df and the pieces are concatenated into one
# long frame (one row per trip and horizon step).
# Note that df_lat_forecast is first a list of frames and is then rebound to the
# concatenated DataFrame.
input_data = get_taxi_data_fn(df_41_lat)
df_lat_forecast = []
for example, forecast in zip(input_data(), forecast_df_lat):
    df_lat = forecasts_to_df(forecast, example, coord_name="lat")
    df_lat_forecast.append(df_lat)
df_lat_forecast = pd.concat(df_lat_forecast, ignore_index=True)
df_lat_forecast

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat
0,1372638793620000571,2002.0,20000571,1,0.370878
1,1372638793620000571,2002.0,20000571,2,0.371868
2,1372638793620000571,2002.0,20000571,3,0.372954
3,1372638793620000571,2002.0,20000571,4,0.374244
4,1372638793620000571,2002.0,20000571,5,0.376145
...,...,...,...,...,...
68235,1404157914620000488,4448.0,20000488,6,0.533837
68236,1404157914620000488,4448.0,20000488,7,0.534414
68237,1404157914620000488,4448.0,20000488,8,0.534262
68238,1404157914620000488,4448.0,20000488,9,0.534673


In [35]:
# Rebuild the longitude batches with the same get_taxi_data_fn defaults used for
# forecasting, so that batch k here lines up with forecast_df_lon[k]. Each batch
# is flattened with forecasts_to_df and the pieces are concatenated into one
# long frame (one row per trip and horizon step).
# Note that df_lon_forecast is first a list of frames and is then rebound to the
# concatenated DataFrame.
input_data = get_taxi_data_fn(df_41_lon)
df_lon_forecast = []
for example, forecast in zip(input_data(), forecast_df_lon):
    df_lon = forecasts_to_df(forecast, example, coord_name="lon")
    df_lon_forecast.append(df_lon)
df_lon_forecast = pd.concat(df_lon_forecast, ignore_index=True)
df_lon_forecast

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lon
0,1372638793620000571,2002.0,20000571,1,0.466090
1,1372638793620000571,2002.0,20000571,2,0.454030
2,1372638793620000571,2002.0,20000571,3,0.440114
3,1372638793620000571,2002.0,20000571,4,0.431554
4,1372638793620000571,2002.0,20000571,5,0.425996
...,...,...,...,...,...
68235,1404157914620000488,4448.0,20000488,6,0.590144
68236,1404157914620000488,4448.0,20000488,7,0.582093
68237,1404157914620000488,4448.0,20000488,8,0.571354
68238,1404157914620000488,4448.0,20000488,9,0.568040


In [38]:
# Map the forecasts back from min-max-scaled units to coordinates and join the
# latitude and longitude forecasts on trip identifiers and horizon step.
# The un-scaled values are written to copies created by `assign` instead of
# being stored back into df_lat_forecast / df_lon_forecast. Overwriting those
# frames in place made this cell non-idempotent: running it a second time would
# apply inverse_transform to values that were already un-scaled.
predictions = pd.merge(
    df_lat_forecast.assign(
        yhat_lat=scaler_lat.inverse_transform(df_lat_forecast[["yhat_lat"]]).ravel()
    ),
    df_lon_forecast.assign(
        yhat_lon=scaler_lon.inverse_transform(df_lon_forecast[["yhat_lon"]]).ravel()
    ),
    on=['unique_id', 'origin_call', 'taxi_id', 'horizon_step'],
    how='left',
)
predictions

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat,yhat_lon
0,1372638793620000571,2002.0,20000571,1,41.141456,-8.616468
1,1372638793620000571,2002.0,20000571,2,41.141617,-8.618903
2,1372638793620000571,2002.0,20000571,3,41.141795,-8.621713
3,1372638793620000571,2002.0,20000571,4,41.142005,-8.623442
4,1372638793620000571,2002.0,20000571,5,41.142316,-8.624564
...,...,...,...,...,...,...
68235,1404157914620000488,4448.0,20000488,6,41.168053,-8.591417
68236,1404157914620000488,4448.0,20000488,7,41.168147,-8.593043
68237,1404157914620000488,4448.0,20000488,8,41.168122,-8.595212
68238,1404157914620000488,4448.0,20000488,9,41.168190,-8.595881


In [39]:
# Display the longitude frame to inspect its columns and scaled values.
df_41_lon

Unnamed: 0_level_0,unique_id,origin_call,taxi_id,ds,y
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1284,1372638793620000571,2002.0,20000571,2013-07-01 00:33:13,0.672906
1285,1372638793620000571,2002.0,20000571,2013-07-01 00:33:28,0.672951
1286,1372638793620000571,2002.0,20000571,2013-07-01 00:33:43,0.672995
1287,1372638793620000571,2002.0,20000571,2013-07-01 00:33:58,0.673040
1288,1372638793620000571,2002.0,20000571,2013-07-01 00:34:13,0.672995
...,...,...,...,...,...
83359063,1404157914620000488,4448.0,20000488,2014-06-30 20:00:54,0.577038
83359064,1404157914620000488,4448.0,20000488,2014-06-30 20:01:09,0.576904
83359065,1404157914620000488,4448.0,20000488,2014-06-30 20:01:24,0.576860
83359066,1404157914620000488,4448.0,20000488,2014-06-30 20:01:39,0.576815


In [42]:
# Give the target column of each frame its coordinate name so that the two
# frames can be merged side by side.
# NOTE(review): get_taxi_data_fn reads the column "y", so after this rename the
# cells that call it on df_41_lat / df_41_lon cannot be re-run without first
# re-running the earlier preparation cells.
df_41_lat = df_41_lat.rename(columns={"y": "lat"})
df_41_lon = df_41_lon.rename(columns={"y": "lon"})

Unnamed: 0,unique_id,origin_call,taxi_id,ds,lat,lon
0,1372638793620000571,2002.0,20000571,2013-07-01 00:43:13,0.378109,0.470785
1,1372641068620000178,52403.0,20000178,2013-07-01 01:21:08,0.266170,0.466907
2,1372656887620000167,45340.0,20000167,2013-07-01 05:44:47,0.428398,0.183090
3,1372657987620000623,35205.0,20000623,2013-07-01 06:03:07,0.451227,0.182154
4,1372660930620000571,55326.0,20000571,2013-07-01 06:52:10,0.240143,0.520613
...,...,...,...,...,...,...
6819,1404155056620000206,35725.0,20000206,2014-06-30 19:14:16,0.434905,0.495565
6820,1404155073620000099,4690.0,20000099,2014-06-30 19:14:33,0.404356,0.459643
6821,1404155819620000242,14922.0,20000242,2014-06-30 19:26:59,0.552412,0.487855
6822,1404157331620000343,33419.0,20000343,2014-06-30 19:52:11,0.469865,0.330481


In [44]:
# Take the last row of every trip from each coordinate frame (the frames were
# sorted by trip and time earlier, so this is the final recorded point) and
# join latitude and longitude into one frame. This final point is what the
# last forecast step is compared against in the cells below.
df_41_lon_test = df_41_lon.groupby('unique_id', group_keys=False).tail(1)
df_41_lat_test = df_41_lat.groupby('unique_id', group_keys=False).tail(1)
test = pd.merge(df_41_lat_test, df_41_lon_test, on=['unique_id', 'origin_call', 'taxi_id', 'ds'], how='left')
test

Unnamed: 0,unique_id,origin_call,taxi_id,ds,lat,lon
0,1372638793620000571,2002.0,20000571,2013-07-01 00:43:13,0.378109,0.470785
1,1372641068620000178,52403.0,20000178,2013-07-01 01:21:08,0.266170,0.466907
2,1372656887620000167,45340.0,20000167,2013-07-01 05:44:47,0.428398,0.183090
3,1372657987620000623,35205.0,20000623,2013-07-01 06:03:07,0.451227,0.182154
4,1372660930620000571,55326.0,20000571,2013-07-01 06:52:10,0.240143,0.520613
...,...,...,...,...,...,...
6819,1404155056620000206,35725.0,20000206,2014-06-30 19:14:16,0.434905,0.495565
6820,1404155073620000099,4690.0,20000099,2014-06-30 19:14:33,0.404356,0.459643
6821,1404155819620000242,14922.0,20000242,2014-06-30 19:26:59,0.552412,0.487855
6822,1404157331620000343,33419.0,20000343,2014-06-30 19:52:11,0.469865,0.330481


In [45]:
# Convert the held-out final points back from scaled units to coordinates with
# the scalers fitted earlier.
# NOTE(review): the columns are overwritten in place, so this cell is not
# idempotent -- running it twice applies inverse_transform to coordinates that
# were already un-scaled. Re-create `test` before re-running.
test["lat"] = scaler_lat.inverse_transform(test[["lat"]])
test["lon"] = scaler_lon.inverse_transform(test[["lon"]])
test

Unnamed: 0,unique_id,origin_call,taxi_id,ds,lat,lon
0,1372638793620000571,2002.0,20000571,2013-07-01 00:43:13,41.142636,-8.615520
1,1372641068620000178,52403.0,20000178,2013-07-01 01:21:08,41.124366,-8.616303
2,1372656887620000167,45340.0,20000167,2013-07-01 05:44:47,41.150844,-8.673615
3,1372657987620000623,35205.0,20000623,2013-07-01 06:03:07,41.154570,-8.673804
4,1372660930620000571,55326.0,20000571,2013-07-01 06:52:10,41.120118,-8.605458
...,...,...,...,...,...,...
6819,1404155056620000206,35725.0,20000206,2014-06-30 19:14:16,41.151906,-8.610516
6820,1404155073620000099,4690.0,20000099,2014-06-30 19:14:33,41.146920,-8.617770
6821,1404155819620000242,14922.0,20000242,2014-06-30 19:26:59,41.171085,-8.612073
6822,1404157331620000343,33419.0,20000343,2014-06-30 19:52:11,41.157612,-8.643852


In [46]:
# Keep only the last row of each trip in `predictions`. forecasts_to_df emits
# the rows of a trip in increasing horizon_step order, so this is the final
# forecast step (horizon_step 10 in the displayed output).
# NOTE(review): this relies on row order; filtering on horizon_step explicitly
# would state the intent more directly.
predictions = predictions.groupby('unique_id', group_keys=False).tail(1)
predictions

Unnamed: 0,unique_id,origin_call,taxi_id,horizon_step,yhat_lat,yhat_lon
9,1372638793620000571,2002.0,20000571,10,41.143548,-8.628679
19,1372641068620000178,52403.0,20000178,10,41.123303,-8.605149
29,1372656887620000167,45340.0,20000167,10,41.149990,-8.669328
39,1372657987620000623,35205.0,20000623,10,41.166904,-8.664035
49,1372660930620000571,55326.0,20000571,10,41.118157,-8.595368
...,...,...,...,...,...,...
68199,1404155056620000206,35725.0,20000206,10,41.152426,-8.616852
68209,1404155073620000099,4690.0,20000099,10,41.144578,-8.620922
68219,1404155819620000242,14922.0,20000242,10,41.172208,-8.606134
68229,1404157331620000343,33419.0,20000343,10,41.153483,-8.643383


In [47]:
# Join the held-out final point of each trip (`test`) with its final-step
# forecast (`predictions`) on unique_id, so the true and predicted positions
# sit on the same row.
# The helper columns are dropped inside the merge call instead of by rebinding
# `test` / `predictions`. The rebinding made this cell fail with a KeyError when
# run a second time, because the columns had already been removed.
# NOTE(review): this rebinds `df`, which earlier held the raw parquet data, so
# the cells that filter the raw data cannot be re-run afterwards. The name is
# kept only because the following cells read `df`.
df = pd.merge(
    test.drop(columns=['origin_call', 'taxi_id', 'ds']),
    predictions.drop(columns=['origin_call', 'taxi_id', 'horizon_step']),
    on='unique_id',
    how='left',
)
df

Unnamed: 0,unique_id,lat,lon,yhat_lat,yhat_lon
0,1372638793620000571,41.142636,-8.615520,41.143548,-8.628679
1,1372641068620000178,41.124366,-8.616303,41.123303,-8.605149
2,1372656887620000167,41.150844,-8.673615,41.149990,-8.669328
3,1372657987620000623,41.154570,-8.673804,41.166904,-8.664035
4,1372660930620000571,41.120118,-8.605458,41.118157,-8.595368
...,...,...,...,...,...
6819,1404155056620000206,41.151906,-8.610516,41.152426,-8.616852
6820,1404155073620000099,41.146920,-8.617770,41.144578,-8.620922
6821,1404155819620000242,41.171085,-8.612073,41.172208,-8.606134
6822,1404157331620000343,41.157612,-8.643852,41.153483,-8.643383


In [48]:
# Compute, per trip, the distance between the true final point (lat, lon) and
# the forecast final point (yhat_lat, yhat_lon), then show the mean over trips.
# haversine_distance comes from the project module `evaluation_script`; it is
# called row by row with the arguments in the order (lat, lon, yhat_lat, yhat_lon).
# NOTE(review): the unit of the result is defined by that helper (presumably
# kilometres) -- confirm before quoting the mean.
from evaluation_script import haversine_distance
df['distance_delta'] = df.apply(lambda row: haversine_distance(
    row['lat'], row['lon'], row['yhat_lat'], row['yhat_lon']), axis=1)
df['distance_delta'].mean()

np.float64(0.6035927588035952)

In [49]:
# Display the per-trip distance between the true and forecast final points.
df['distance_delta']

0       1.106580
1       0.941698
2       0.371316
3       1.596818
4       0.872918
          ...   
6819    0.533675
6820    0.370797
6821    0.512497
6822    0.460831
6823    0.397522
Name: distance_delta, Length: 6824, dtype: float64