In [24]:
import pandas as pd
import warnings
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from transformers import AutoModelForCausalLM, T5Tokenizer, TrainingArguments, Trainer

# Silence every warning category for the rest of the session. This keeps cell
# output short, but it also hides deprecation notices from the imported libraries.
warnings.filterwarnings('ignore')

In [21]:
# Identifier of the pretrained checkpoint handed to `from_pretrained` below.
MODEL_NAME = "amazon/chronos-t5-mini"

# Input columns. Their order is the order in which the values are joined into
# the text passed to the tokenizer.
features = [
    "timestamp",
    "GlobalOpen",
    "GlobalHigh",
    "GlobalLow",
    "GlobalVol.",
    "GlobalChange %",
    "GlobalPrice",
    "CE_Close",
    "CE_High",
    "CE_Low",
    "CE_Open",
]

# Column kept alongside `features` when the training frame is subset.
target = "price"

In [13]:
def date_to_timestamp(df: pd.DataFrame, column='Date') -> pd.DataFrame:
    """Replace a date column with an integer ``timestamp`` column.

    The values in ``column`` are parsed with ``pd.to_datetime`` and cast to
    ``np.int64``. The result is stored under ``'timestamp'`` and ``column`` is
    removed. The input frame itself is not modified; a new frame is returned.

    Args:
        df: Frame containing ``column``.
        column: Name of the date column to convert. Defaults to ``'Date'``.

    Returns:
        A new DataFrame with ``'timestamp'`` added and ``column`` dropped.

    Raises:
        KeyError: If ``column`` is not present in ``df``.
    """
    timestamps = pd.to_datetime(df[column]).astype(np.int64)
    # assign() and drop() each return a new frame, so the caller's object keeps
    # its original columns instead of being mutated as a side effect.
    return df.assign(timestamp=timestamps).drop(columns=[column])

In [14]:
# Read both CSV splits and drop the 'Unnamed: 0' column from each of them.
df_train_dataset = pd.read_csv('./AdditionalDataset/training_dataset.csv').drop(columns=['Unnamed: 0'])
df_test_dataset = pd.read_csv('./AdditionalDataset/testing_dataset.csv').drop(columns=['Unnamed: 0'])

In [15]:
# Convert the date column of each split into an integer 'timestamp' column.
df_train_dataset, df_test_dataset = (
    date_to_timestamp(frame) for frame in (df_train_dataset, df_test_dataset)
)

In [16]:
# Keep only the feature columns followed by the target column, then preview.
df_train_dataset = df_train_dataset[[*features, target]]
df_train_dataset.head()

Unnamed: 0,timestamp,GlobalOpen,GlobalHigh,GlobalLow,GlobalVol.,GlobalChange %,GlobalPrice,CE_Close,CE_High,CE_Low,CE_Open,price
0,1640995200000000000,205.9048,782.25,3.5,228640.0,-0.48,202.7536,3554.002863,14301.299805,0.029976,3561.752985,28970.0
1,1640995200000000000,205.9048,782.25,3.5,228640.0,-0.48,202.7536,3554.002863,14301.299805,0.029976,3561.752985,27440.0
2,1640995200000000000,205.9048,782.25,3.5,228640.0,-0.48,202.7536,3554.002863,14301.299805,0.029976,3561.752985,11030.0
3,1640995200000000000,205.9048,782.25,3.5,228640.0,-0.48,202.7536,3554.002863,14301.299805,0.029976,3561.752985,12080.0
4,1640995200000000000,205.9048,782.25,3.5,228640.0,-0.48,202.7536,3554.002863,14301.299805,0.029976,3561.752985,22360.0


In [17]:
# Hold out 20% of the rows for validation. The split shuffles rows at random,
# so random_state is pinned to get the same partition on every fresh run.
# NOTE(review): rows are shuffled regardless of `timestamp` — confirm that a
# chronological split is not required for this data.
df_train_dataset, df_val_dataset = train_test_split(
    df_train_dataset, test_size=0.2, random_state=42
)

In [18]:
# Preview the training split produced by train_test_split.
df_train_dataset.head()

Unnamed: 0,timestamp,GlobalOpen,GlobalHigh,GlobalLow,GlobalVol.,GlobalChange %,GlobalPrice,CE_Close,CE_High,CE_Low,CE_Open,price
352709,1674691200000000000,239.6786,758.0,2.68,471080.0,0.361667,242.2616,3734.931638,14981.599609,0.030441,3734.931638,30940.0
316871,1658880000000000000,264.2442,818.75,8.345,313000.0,-0.071667,262.2288,3745.942885,15026.299805,0.027115,3745.942885,10610.0
159103,1653782400000000000,136.55175,406.65,8.275,137720.0,-0.265,136.8005,3646.296387,14626.0,0.02922,3646.296387,28870.0
312476,1708819200000000000,160.3954,592.5,1.683,353230.0,-2.001667,158.6598,3897.245561,15631.0,0.027685,3897.245561,15370.0
324027,1710979200000000000,156.0958,552.5,1.797,348130.0,-0.185,156.2062,3921.996428,15724.599609,0.027525,3921.996428,47010.0


In [19]:
# Wrap each split in a Hugging Face Dataset.
# preserve_index=False: after the shuffled split the train/validation frames
# carry a non-default index, which from_pandas would otherwise store as an
# extra "__index_level_0__" column next to the real data columns.
train_dataset = Dataset.from_pandas(df_train_dataset, preserve_index=False)
val_dataset = Dataset.from_pandas(df_val_dataset, preserve_index=False)
test_dataset = Dataset.from_pandas(df_test_dataset, preserve_index=False)

In [25]:
# T5Tokenizer depends on the SentencePiece library, so the `sentencepiece`
# package must be installed (and the kernel restarted) before this cell runs.
# NOTE(review): confirm that the MODEL_NAME repo ships a SentencePiece
# vocabulary — Chronos checkpoints are usually driven by their own numeric
# tokenizer rather than a text tokenizer.
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)


def tokenize_function(example):
    """Tokenize a single dataset row; intended for use with ``Dataset.map``.

    The values of the ``features`` columns are converted to strings, joined
    with single spaces in list order, and passed to ``tokenizer``.

    Args:
        example: Mapping from column name to value for one row.

    Returns:
        The tokenizer's encoding of the joined text, with plain Python lists
        as values.
    """
    text = " ".join(str(example[name]) for name in features)
    # No return_tensors here: a per-example "pt" tensor has shape (1, seq_len),
    # which map() would store as a nested list and which breaks later
    # padding/collation. Tensor conversion belongs to the data collator.
    return tokenizer(text)

ImportError: 
T5Tokenizer requires the SentencePiece library but it was not found in your environment. Checkout the instructions on the
installation page of its repo: https://github.com/google/sentencepiece#installation and follow the ones
that match your environment. Please note that you may need to restart your runtime after installation.


In [None]:
# Tokenize every training row and drop the raw feature columns from the result.
train_tokenized = train_dataset.map(
    function=tokenize_function,
    remove_columns=features,
)