In [3]:
pip install kagglehub

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple, https://pypi.ngc.nvidia.com
Collecting kagglehub
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/5c/e7/71927b088047132317c14eb513d69c8375ddba3c9029d4154a054f6c8765/kagglehub-0.2.9-py3-none-any.whl (39 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.2.9
Note: you may need to restart the kernel to use updated packages.


In [8]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("mczielinski/bitcoin-historical-data")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\zhang\.cache\kagglehub\datasets\mczielinski\bitcoin-historical-data\versions\154


In [11]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim.adam import Adam
from torch.utils.data import Dataset, DataLoader, random_split

# Create class BitcoinDataset which is a child of Dataset class from torch.util.data
class BitcoinDataset(Dataset):
    
    def __init__(self, csv_file):
        """
        Args:
            csv_file (str): Path to the CSV file.
        """
        # Load data from CSV, select columns and drop null values
        self.dataframe = pd.read_csv(csv_file)[['High','Low','Open','Close']].dropna()

        # Extract the features for easier manipulation
        self.features = self.dataframe.values

        # Calculate mean and std for normalization
        self.mean = self.features.mean(axis=0)
        self.std = self.features.std(axis=0)

        # Apply normalization to features
        self.features = (self.features - self.mean) / self.std

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        
        # Return the item at index idx in the form of tensor
        features = torch.tensor(self.features[idx], dtype=torch.float32).to(device)
        
        return features


# Create an instance of BitcoinDataset and store in variable
csv_file = './datasets/btcusd_1-min_data.csv'
dataset = BitcoinDataset(csv_file)
print("Number of samples in dataset:", len(dataset))
# Then, the batch_size and input_dim were set. The dataset was divided into train and test datasets. Each of these was loaded into a DataLoader. The device was set to ‘cuda’ if available. 

Number of samples in dataset: 6772281


In [12]:
# Define batchsize and input dimensions
batch_size = 64
input_dim = 4

# Split dataset into train and test in the ratio of 80:20
train_dataset, test_dataset = random_split(dataset, [0.8,0.2])

# Use DataLoader for batching and shuffling
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

# Define Device as Cuda if available else CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
class VAE(nn.Module):

    def __init__(self, input_dim=4, hidden_dim=40, latent_dim=3, device=device):
        super(VAE, self).__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, latent_dim),
            nn.LeakyReLU(0.2)
            )
        
        # Latent mean and variance
        self.mean_layer = nn.Linear(latent_dim, 1)
        self.logvar_layer = nn.Linear(latent_dim, 1)
        
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(1, latent_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(latent_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, input_dim)
            )
     
    # Encode function
    def encode(self, x):
        x = self.encoder(x)
        mean, log_var = self.mean_layer(x), self.logvar_layer(x)
        return mean, log_var
    
    # Add Reparameterization
    def reparameterization(self, mean, var):
        epsilon = torch.randn_like(var).to(device)      
        z = mean + var*epsilon
        return z

    # Decode function
    def decode(self, x):
        return self.decoder(x)

    # Forward Function
    def forward(self, x):
        mean, log_var = self.encode(x)
        z = self.reparameterization(mean, log_var)
        x_hat = self.decode(z)
        return x_hat, mean, log_var
    
    # Reconstruct input from compressed form
    def reconstruction(mean, log_var):
        z = self.reparameterization(mean, log_var)
        x_hat = self.decode(z)
        return x_hat

In [14]:
def loss_function(x, x_hat, mean, log_var):
    # Reproduction Loss
    reproduction_loss = nn.functional.mse_loss(x_hat, x)
    
    # KL Divergence Loss
    KLD = - 0.5 * torch.sum(1+ log_var - mean.pow(2) - log_var.exp())
    return reproduction_loss + KLD

# VAE Model created and stored in device
model = VAE().to(device)

# Optimizer defined
optimizer = Adam(model.parameters(), lr=1e-3)

In [15]:
def train(model, optimizer, epochs, device):
    
    # Set model to training mode
    model.train()
    
    # Loop for each epoch
    for epoch in range(epochs):
        overall_loss = 0
        
        # Iterate over the batches formed by DataLoader
        for batch_idx, x in enumerate(train_dataloader):
            x = x.to(device)
            
            # Reset Gradient
            optimizer.zero_grad()
            x_hat, mean, log_var = model(x)
            
            # Calculate batch loss and then overall loss
            loss = loss_function(x, x_hat, mean, log_var)
            overall_loss += loss.item()
            
            # Backpropagate the loss and train the optimizer
            loss.backward()
            optimizer.step()

        print("\tEpoch", epoch + 1, "\tAverage Loss: ", overall_loss/(batch_idx*batch_size))
    return overall_loss

# Train the model for 5 epochs
train(model, optimizer, epochs=5, device=device)

	Epoch 1 	Average Loss:  0.0010465322552025735
	Epoch 2 	Average Loss:  6.398167515173866e-05
	Epoch 3 	Average Loss:  4.211362657392762e-05
	Epoch 4 	Average Loss:  2.9124277026888643e-05
	Epoch 5 	Average Loss:  2.362720924003563e-05


128.00730520299112