In [8]:
! pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.6-py3-none-any.whl.metadata (30 kB)
Downloading kagglehub-0.3.6-py3-none-any.whl (51 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.6


In [9]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("shubham2703/bitcoin-time-series-datajan-2018-jan-2022")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/shubham2703/bitcoin-time-series-datajan-2018-jan-2022?dataset_version_number=4...


100%|██████████| 28.6M/28.6M [00:03<00:00, 9.18MB/s]

Extracting files...





Path to dataset files: /Users/xuyan/.cache/kagglehub/datasets/shubham2703/bitcoin-time-series-datajan-2018-jan-2022/versions/4


In [14]:
import pandas as pd
import glob
import os

dataset_path = "/Users/xuyan/.cache/kagglehub/datasets/shubham2703/bitcoin-time-series-datajan-2018-jan-2022/versions/4/Bitcoin Data/Data"

csv_files = glob.glob(os.path.join(dataset_path, "*.csv"))

dfs = {}

for file in csv_files:
    file_name = os.path.basename(file)
    dfs[file_name] = pd.read_csv(file)

print(dfs["btc_1h.csv"].head())


              datetime      open      high       low     close      volume
0  2018-01-01 05:30:00  13715.65  13715.65  13400.01  13529.01  443.356199
1  2018-01-01 06:30:00  13528.99  13595.89  13155.38  13203.06  383.697006
2  2018-01-01 07:30:00  13203.00  13418.43  13200.00  13330.18  429.064572
3  2018-01-01 08:30:00  13330.26  13611.27  13290.00  13410.03  420.087030
4  2018-01-01 09:30:00  13434.98  13623.29  13322.15  13601.01  340.807329


In [15]:
print(len(dfs))

8


In [15]:
print(dfs.keys())

dict_keys(['btc_4h.csv', 'btc_5m.csv', 'btc_15m.csv', 'btc_2h.csv', 'btc_3m.csv', 'btc_30m.csv', 'btc_1h.csv', 'btc_6h.csv'])


In [16]:
all_data = []

for file in csv_files:
    file_name = os.path.basename(file)
    df = pd.read_csv(file)
    df["time_interval"] = file_name
    all_data.append(df)

merged_df = pd.concat(all_data, ignore_index=True)

print(merged_df.head())


              datetime      open      high       low     close       volume  \
0  2018-01-01 05:30:00  13715.65  13715.65  13155.38  13410.03  1676.204807   
1  2018-01-01 09:30:00  13434.98  13818.55  13322.15  13570.35  1302.214836   
2  2018-01-01 13:30:00  13569.98  13735.24  13001.13  13220.56  1319.755931   
3  2018-01-01 17:30:00  13220.56  13330.00  12750.00  13247.00  1831.933153   
4  2018-01-01 21:30:00  13247.00  13290.65  12940.00  13240.37  1092.337234   

  time_interval  
0    btc_4h.csv  
1    btc_4h.csv  
2    btc_4h.csv  
3    btc_4h.csv  
4    btc_4h.csv  


# merge all data into one DataFrame, make time interval as a Feature

In [17]:
import pandas as pd
import glob
import os
import re

dataset_path = "/Users/xuyan/.cache/kagglehub/datasets/shubham2703/bitcoin-time-series-datajan-2018-jan-2022/versions/4/Bitcoin Data/Data"

csv_files = glob.glob(os.path.join(dataset_path, "*.csv"))

all_data = []

time_pattern = re.compile(r"btc_(\d+)([hm])\.csv")

for file in csv_files:
    file_name = os.path.basename(file)
    match = time_pattern.match(file_name)
    
    if match:
        value, unit = match.groups()
        minutes = int(value) * (60 if unit == "h" else 1)
        
        df = pd.read_csv(file)
        df["time_interval"] = minutes # based on the filename ['btc_4h.csv', 'btc_5m.csv', 'btc_15m.csv', 'btc_2h.csv', 'btc_3m.csv', 'btc_30m.csv', 'btc_1h.csv', 'btc_6h.csv']
        all_data.append(df)

merged_df = pd.concat(all_data, ignore_index=True)

print(merged_df.head())


              datetime      open      high       low     close       volume  \
0  2018-01-01 05:30:00  13715.65  13715.65  13155.38  13410.03  1676.204807   
1  2018-01-01 09:30:00  13434.98  13818.55  13322.15  13570.35  1302.214836   
2  2018-01-01 13:30:00  13569.98  13735.24  13001.13  13220.56  1319.755931   
3  2018-01-01 17:30:00  13220.56  13330.00  12750.00  13247.00  1831.933153   
4  2018-01-01 21:30:00  13247.00  13290.65  12940.00  13240.37  1092.337234   

   time_interval  
0            240  
1            240  
2            240  
3            240  
4            240  


In [18]:
print(merged_df.count())

datetime         1423148
open             1423148
high             1423148
low              1423148
close            1423148
volume           1423148
time_interval    1423148
dtype: int64


# Data clean

In [32]:
cleaned_df = merged_df.copy()

missing_values = cleaned_df.isnull().sum()

for col in cleaned_df.columns:
    if cleaned_df[col].isnull().sum() > 0:
        if cleaned_df[col].dtype == 'object':
            cleaned_df[col].fillna(cleaned_df[col].mode()[0], inplace=True)  # 众数填充
        else:
            cleaned_df[col].fillna(cleaned_df[col].median(), inplace=True)  # 中位数填充



cleaned_df.drop_duplicates(inplace=True)

num_cols = cleaned_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    Q1 = cleaned_df[col].quantile(0.25)
    Q3 = cleaned_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    cleaned_df = cleaned_df[(cleaned_df[col] >= lower_bound) & (cleaned_df[col] <= upper_bound)]


In [33]:
print(cleaned_df.head())

                 datetime      open      high       low     close     volume  \
8931  2018-01-01 05:30:00  13715.65  13715.65  13576.28  13600.00  33.617798   
8932  2018-01-01 05:35:00  13600.00  13600.00  13501.01  13554.58  40.528679   
8933  2018-01-01 05:40:00  13554.58  13569.97  13400.01  13556.15  49.469536   
8934  2018-01-01 05:45:00  13533.75  13547.73  13402.00  13430.52  32.725614   
8935  2018-01-01 05:50:00  13440.01  13459.99  13410.44  13439.94  26.614135   

      time_interval  
8931              5  
8932              5  
8933              5  
8934              5  
8935              5  


In [34]:
print(cleaned_df.count())

datetime         1112065
open             1112065
high             1112065
low              1112065
close            1112065
volume           1112065
time_interval    1112065
dtype: int64


# How It Fits into Chronos' Domain Adaptation Framework

## Experimental Ideas for Domain Adaptation
### Pretraining on high-resolution data (15m, 30m) and adapting to lower-resolution data (4h, 6h).
### Using self-supervised learning (contrastive learning, masked reconstruction) on 15m data and fine-tuning on 6h data.
### Comparing direct training vs. domain adaptation techniques like adversarial training and feature alignment (MMD loss, CORAL loss).


## Using Self-Supervised Learning for Temporal Adaptation
### Train a Chronos-based self-supervised model on 15m-1h data.
### Fine-tune it on 4h-6h data using domain adaptation techniques.
### Baseline comparison: Train a separate model on 4h-6h without adaptation.

In [16]:
import pandas as pd

dataset_path = "/Users/xuyan/.cache/kagglehub/datasets/shubham2703/bitcoin-time-series-datajan-2018-jan-2022/versions/4/Bitcoin Data/Data"
df = pd.read_csv(f"{dataset_path}/btc_15m.csv")
print(df.head())


              datetime      open      high       low     close      volume
0  2018-01-01 05:30:00  13715.65  13715.65  13400.01  13556.15  123.616013
1  2018-01-01 05:45:00  13533.75  13550.87  13402.00  13521.12   98.136430
2  2018-01-01 06:00:00  13500.00  13545.37  13450.00  13470.41   79.904037
3  2018-01-01 06:15:00  13494.65  13690.87  13450.00  13529.01  141.699719
4  2018-01-01 06:30:00  13528.99  13571.74  13402.28  13445.63   72.537533


In [3]:
print(df.count())

datetime    142610
open        142610
high        142610
low         142610
close       142610
volume      142610
dtype: int64


In [17]:
cleaned_df = df.copy()
missing_values = cleaned_df.isnull().sum()

for col in cleaned_df.columns:
    if cleaned_df[col].isnull().sum() > 0:
        if cleaned_df[col].dtype == 'object':
            cleaned_df[col].fillna(cleaned_df[col].mode()[0], inplace=True)
        else:
            cleaned_df[col].fillna(cleaned_df[col].median(), inplace=True)


In [18]:
print(cleaned_df.count())

datetime    142610
open        142610
high        142610
low         142610
close       142610
volume      142610
dtype: int64


In [19]:
import numpy as np

cleaned_df.drop_duplicates(inplace=True)

num_cols = cleaned_df.select_dtypes(include=[np.number]).columns
for col in num_cols:
    Q1 = cleaned_df[col].quantile(0.25)
    Q3 = cleaned_df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    cleaned_df = cleaned_df[(cleaned_df[col] >= lower_bound) & (cleaned_df[col] <= upper_bound)]

In [20]:
print(cleaned_df.count())

datetime    131769
open        131769
high        131769
low         131769
close       131769
volume      131769
dtype: int64


In [21]:
cleaned_df.rename(columns={'datetime': 'timestamp'}, inplace=True)
cleaned_df['timestamp'] = pd.to_datetime(cleaned_df['timestamp']).dt.strftime('%Y-%m-%dT%H:%M:%S')

In [22]:
print(cleaned_df.head())

             timestamp      open      high       low     close      volume
0  2018-01-01T05:30:00  13715.65  13715.65  13400.01  13556.15  123.616013
1  2018-01-01T05:45:00  13533.75  13550.87  13402.00  13521.12   98.136430
2  2018-01-01T06:00:00  13500.00  13545.37  13450.00  13470.41   79.904037
3  2018-01-01T06:15:00  13494.65  13690.87  13450.00  13529.01  141.699719
4  2018-01-01T06:30:00  13528.99  13571.74  13402.28  13445.63   72.537533


In [23]:
from pathlib import Path
from typing import List, Union

import numpy as np
from gluonts.dataset.arrow import ArrowWriter


def convert_to_arrow(
    path: Union[str, Path],
    time_series: Union[List[np.ndarray], np.ndarray],
    compression: str = "lz4",
):
    """
    Store a given set of series into Arrow format at the specified path.

    Input data can be either a list of 1D numpy arrays, or a single 2D
    numpy array of shape (num_series, time_length).
    """
    assert isinstance(time_series, list) or (
        isinstance(time_series, np.ndarray) and
        time_series.ndim == 2
    )

    # Set an arbitrary start time
    start = np.datetime64("2000-01-01 00:00", "s")

    dataset = [
        {"start": start, "target": ts} for ts in time_series
    ]

    ArrowWriter(compression=compression).write_to_file(
        dataset,
        path=path,
    )

In [None]:
time_series = [np.random.randn(1024) for i in range(20)]
# Convert to GluonTS arrow format
convert_to_arrow("./noise-data.arrow", time_series=time_series)

In [11]:
! pip install gluonts

Collecting gluonts
  Downloading gluonts-0.16.0-py3-none-any.whl.metadata (9.8 kB)
Downloading gluonts-0.16.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: gluonts
Successfully installed gluonts-0.16.0


In [2]:
import pandas as pd
from gluonts.dataset.arrow import ArrowWriter


dataset_path = "/Users/xuyan/.cache/kagglehub/datasets/shubham2703/bitcoin-time-series-datajan-2018-jan-2022/versions/4/Bitcoin Data/Data"

df = pd.read_csv(f"{dataset_path}/btc_15m.csv")
df.drop_duplicates(inplace=True)

for col in df.columns:
    if df[col].isnull().sum() > 0:
        if df[col].dtype == 'object':
            df[col].fillna(df[col].mode()[0], inplace=True)
        else:
            df[col].fillna(df[col].median(), inplace=True)

# Convert 'datetime' to proper datetime format
df["datetime"] = pd.to_datetime(df["datetime"])

# Ensure data is sorted by datetime (important for time series processing)
df = df.sort_values(by="datetime")

train_df = df[(df["datetime"] >= "2018-01-01") & (df["datetime"] < "2021-01-01")]
test_df = df[(df["datetime"] >= "2021-01-01") & (df["datetime"] <= "2022-12-31")]

# Convert and save training data
# convert_to_arrow(train_df, Path("./bitcoin-train.arrow"))

# Convert and save testing data
# convert_to_arrow(test_df, Path("./bitcoin-test.arrow"))


  from pandas.core import (


In [3]:
df.head()

Unnamed: 0,datetime,open,high,low,close,volume
0,2018-01-01 05:30:00,13715.65,13715.65,13400.01,13556.15,123.616013
1,2018-01-01 05:45:00,13533.75,13550.87,13402.0,13521.12,98.13643
2,2018-01-01 06:00:00,13500.0,13545.37,13450.0,13470.41,79.904037
3,2018-01-01 06:15:00,13494.65,13690.87,13450.0,13529.01,141.699719
4,2018-01-01 06:30:00,13528.99,13571.74,13402.28,13445.63,72.537533


In [31]:
print(train_df.count())

datetime    104732
open        104732
high        104732
low         104732
close       104732
volume      104732
dtype: int64


In [32]:
print(test_df.count())

datetime    37878
open        37878
high        37878
low         37878
close       37878
volume      37878
dtype: int64


In [8]:
train_open_close_df = train_df[["datetime", "open", "close"]]
test_open_close_df = test_df[["datetime", "open", "close"]]

In [9]:
train_open_close_df.head()

Unnamed: 0,datetime,open,close
0,2018-01-01 05:30:00,13715.65,13556.15
1,2018-01-01 05:45:00,13533.75,13521.12
2,2018-01-01 06:00:00,13500.0,13470.41
3,2018-01-01 06:15:00,13494.65,13529.01
4,2018-01-01 06:30:00,13528.99,13445.63


In [10]:
from pathlib import Path

def convert_to_arrow(df, path):
    time_series = [
        {"start": df["datetime"].iloc[0], "target": df[col].values}
        for col in ["high", "low"]
    ]
    ArrowWriter(compression="lz4").write_to_file(time_series, path=path)


# Convert and save training data
convert_to_arrow(train_open_close_df, Path("./bitcoin-openclose-train.arrow"))

# Convert and save testing data
convert_to_arrow(test_open_close_df, Path("./bitcoin-openclose-test.arrow"))

In [11]:
train_high_low_df = train_df[["datetime", "high", "low"]]
test_high_low_df = test_df[["datetime", "high", "low"]]

In [12]:
train_high_low_df.head()

Unnamed: 0,datetime,high,low
0,2018-01-01 05:30:00,13715.65,13400.01
1,2018-01-01 05:45:00,13550.87,13402.0
2,2018-01-01 06:00:00,13545.37,13450.0
3,2018-01-01 06:15:00,13690.87,13450.0
4,2018-01-01 06:30:00,13571.74,13402.28


In [14]:
def convert_to_arrow(df, path):
    time_series = [
        {"start": df["datetime"].iloc[0], "target": df[col].values}
        for col in ["high", "low"]
    ]
    ArrowWriter(compression="lz4").write_to_file(time_series, path=path)
# Convert and save training data
convert_to_arrow(train_high_low_df, Path("./bitcoin-highlow-train.arrow"))