In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset


def load_london_bike_dataframe(csv_path: str) -> pd.DataFrame:
    """Read a CSV file and return its rows ordered by the ``timestamp`` column.

    Args:
        csv_path: Location of the CSV file; it must contain a ``timestamp``
            column, which is parsed as datetimes while reading.

    Returns:
        The loaded frame sorted ascending on ``timestamp`` with its index
        renumbered from zero.
    """
    return (
        pd.read_csv(csv_path, parse_dates=["timestamp"])
        .sort_values("timestamp")
        .reset_index(drop=True)
    )


def engineer_london_features(df: pd.DataFrame) -> pd.DataFrame:
    """Derive calendar and temperature features and drop the raw timestamp.

    The input frame is left untouched; a new frame is returned.

    Args:
        df: Frame with a datetime ``timestamp`` column and numeric ``t1`` and
            ``t2`` columns.

    Returns:
        A frame holding the original columns (minus ``timestamp``) followed by
        ``hour``, ``dayofweek``, ``month``, ``year``, ``is_peak_hour`` and
        ``temp_diff``.
    """
    stamp = df["timestamp"].dt
    hour_of_day = stamp.hour

    # Peak hours are 07-09 and 16-19, both ranges inclusive.
    morning_rush = hour_of_day.between(7, 9)
    evening_rush = hour_of_day.between(16, 19)

    enriched = df.assign(
        hour=hour_of_day,
        dayofweek=stamp.dayofweek,
        month=stamp.month,
        year=stamp.year,
        is_peak_hour=(morning_rush | evening_rush).astype(int),
        temp_diff=df["t1"] - df["t2"],
    )

    return enriched.drop(columns=["timestamp"])


def london_bike_to_tensors(
    csv_path: str,
    classification: bool = False,
    n_classes: int = 3,
):
    """Load the CSV, engineer features and convert everything to tensors.

    Args:
        csv_path: Path handed to ``load_london_bike_dataframe``.
        classification: When true, the ``cnt`` target is discretised into
            integer bin labels; otherwise it is kept as a float target.
        n_classes: Number of bins requested when ``classification`` is true.

    Returns:
        A ``(X_tensor, y_tensor, feature_names)`` tuple. ``X_tensor`` is a
        float32 tensor built from every engineered column except ``cnt``;
        ``y_tensor`` is float32 for regression or int64 bin labels for
        classification; ``feature_names`` lists the columns of ``X_tensor``
        in order.
    """
    engineered = engineer_london_features(load_london_bike_dataframe(csv_path))

    # Split the target column away from the model inputs.
    target = engineered["cnt"].copy()
    inputs = engineered.drop(columns=["cnt"])
    feature_names = list(inputs.columns)

    X_tensor = torch.from_numpy(inputs.to_numpy(dtype=np.float32))

    if not classification:
        y_tensor = torch.from_numpy(target.to_numpy(dtype=np.float32))
        return X_tensor, y_tensor, feature_names

    try:
        # Quantile bins first; duplicate bin edges are dropped rather than raising.
        labels = pd.qcut(target, q=n_classes, labels=False, duplicates="drop")
    except ValueError:
        # Fall back to equal-width bins if quantile binning is rejected.
        labels = pd.cut(target, bins=n_classes, labels=False)

    y_tensor = torch.from_numpy(labels.to_numpy(dtype=np.int64))
    return X_tensor, y_tensor, feature_names


def london_bike_torch_dataset(
    csv_path: str,
    classification: bool = False,
    n_classes: int = 3,
) -> TensorDataset:
    """Build a ``TensorDataset`` of (features, target) from the CSV.

    All arguments are forwarded unchanged to ``london_bike_to_tensors``; the
    feature-name list it returns is discarded.
    """
    features, targets, _feature_names = london_bike_to_tensors(
        csv_path,
        classification=classification,
        n_classes=n_classes,
    )
    return TensorDataset(features, targets)


In [None]:
# Input CSV and output file locations (relative paths).
CSV_PATH = "../data/london_merged.csv"
OUTPUT_PATH = "../data/london_bike_dataset.pt"

# Build the regression variant: y_reg is the float `cnt` target.
X_reg, y_reg, feat_names = london_bike_to_tensors(CSV_PATH, classification=False)

# In-memory dataset pairing features with targets; it is not written to disk
# and not used again in this cell.
dataset = TensorDataset(X_reg, y_reg)

# Persist the tensors plus metadata as a plain dict, with a flag recording
# that this is the regression (non-classification) target.
torch.save({
    'X': X_reg,
    'y': y_reg,
    'feature_names': feat_names,
    'classification': False,
}, OUTPUT_PATH)

# Summary of what was written.
print(f"Dataset saved to {OUTPUT_PATH}")
print(f"Dataset shape: X={X_reg.shape}, y={y_reg.shape}")
print(f"Number of features: {len(feat_names)}")
print(f"Feature names: {feat_names}")


Dataset saved to ../data/london_bike_dataset.pt
Dataset shape: X=torch.Size([17414, 14]), y=torch.Size([17414])
Number of features: 14
Feature names: ['t1', 't2', 'hum', 'wind_speed', 'weather_code', 'is_holiday', 'is_weekend', 'season', 'hour', 'dayofweek', 'month', 'year', 'is_peak_hour', 'temp_diff']
