In [1]:
# pip uninstall torch torchvision torchaudio -y


In [2]:
# pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

In [1]:
import torch

print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device name: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA is not available.")

PyTorch version: 2.5.0+cu118
CUDA available: True
Device name: NVIDIA GeForce RTX 2060


In [2]:
# !pip install yfinance

In [3]:
import math
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import yfinance as yf

In [7]:
def calculate_bollinger_bands(data, window=10, num_of_std=2):
    """Calculate Bollinger Bands"""
    rolling_mean = data.rolling(window=window).mean()
    rolling_std = data.rolling(window=window).std()
    upper_band = rolling_mean + (rolling_std * num_of_std)
    lower_band = rolling_mean - (rolling_std * num_of_std)
    return upper_band, lower_band

def calculate_rsi(data, window=10):
    """Calculate Relative Strength Index"""
    delta = data.diff()
    gain = delta.clip(lower=0)
    loss = -delta.clip(upper=0)
    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()
    rs = avg_gain / avg_loss
    rsi = 100 - (100 / (1 + rs))
    return rsi

def calculate_roc(data, periods=10):
    """Calculate Rate of Change."""
    roc = ((data - data.shift(periods)) / data.shift(periods)) * 100
    return roc

In [9]:
data = yf.download('META', period="1mo", interval="15m")


[*********************100%***********************]  1 of 1 completed


In [11]:
data.head

<bound method NDFrame.head of Price                       Adj Close       Close        High         Low  \
Ticker                           META        META        META        META   
Datetime                                                                    
2024-09-30 09:30:00+00:00  568.869995  568.869995  570.379883  565.400085   
2024-09-30 09:45:00+00:00  567.210022  567.210022  570.190002  567.140015   
2024-09-30 10:00:00+00:00  565.849976  565.849976  567.719910  564.799988   
2024-09-30 10:15:00+00:00  568.260010  568.260010  568.349976  565.300476   
2024-09-30 10:30:00+00:00  568.119995  568.119995  569.000000  567.460022   
...                               ...         ...         ...         ...   
2024-10-28 12:30:00+00:00  577.789978  577.789978  577.919922  576.599976   
2024-10-28 12:45:00+00:00  577.210022  577.210022  578.010010  577.039978   
2024-10-28 13:00:00+00:00  576.869995  576.869995  577.596497  576.740112   
2024-10-28 13:15:00+00:00  577.724976  577.724

In [61]:
import pandas as pd
import numpy as np
import yfinance as yf

tickers = ['META', 'AAPL', 'MSFT', 'AMZN', 'GOOG']

# Initialize containers
ticker_data_frames = []
stats = {}

# Define sequence length (in days)
SEQUENCE_LEN = 30  # 30 days

for ticker in tickers:
    print(f"Processing {ticker}...")
    # Download historical data at 1-hour intervals
    data = yf.download(ticker, period="2y", interval="1h")
    
    if data.empty:
        print(f"No data found for {ticker}. Skipping.")
        continue  # Skip to the next ticker if no data is found

    # Extract features
    open_price = data['Open'].squeeze()
    close_price = data['Close'].squeeze()
    high = data['High'].squeeze()
    low = data['Low'].squeeze()
    volume = data['Volume'].squeeze()
    
    # Calculate technical indicators
    daily_return = close_price.pct_change()
    moving_avg_10 = close_price.rolling(window=10).mean()
    moving_avg_30 = close_price.rolling(window=30).mean()
    bollinger_upper, bollinger_lower = calculate_bollinger_bands(close_price)
    rsi = calculate_rsi(close_price)
    roc = calculate_roc(close_price)
    
    # Create a dictionary of variables
    variables = {
        ticker + '_open': open_price,
        ticker + '_close': close_price,
        ticker + '_high': high,
        ticker + '_low': low,
        ticker + '_volume': volume,
        ticker + '_daily_return': daily_return,
        ticker + '_ma10': moving_avg_10,
        ticker + '_ma30': moving_avg_30,
        ticker + '_bollinger_upper': bollinger_upper,
        ticker + '_bollinger_lower': bollinger_lower,
        ticker + '_rsi': rsi,
        ticker + '_roc': roc,
    }
    
    # Create DataFrame from the variables dictionary
    ticker_df = pd.DataFrame(variables)
    
    # Handle missing values
    ticker_df.dropna(inplace=True)
    
    if ticker_df.empty:
        print(f"No data left for {ticker} after dropping NaNs.")
        continue
    
    # Calculate mean and std for normalization
    MEAN = ticker_df.mean()
    STD = ticker_df.std()
    
    # Store stats for each feature
    for column in MEAN.index:
        stats[f"{column}_mean"] = MEAN[column]
        stats[f"{column}_std"] = STD[column]
    
    # Normalize the data
    ticker_df = (ticker_df - MEAN) / STD
    
    ticker_data_frames.append(ticker_df)
    print(f"Finished processing {ticker}.")

# Combine all ticker data into a single DataFrame
df = pd.concat(ticker_data_frames, axis=1)

# Handle any remaining missing values
df.dropna(inplace=True)

# Ensure the index is DateTime and create a 'date' column
df.index = pd.to_datetime(df.index)
# df['date'] = df.index.date

# Verify the 'date' column
print("Columns after adding 'date':", df.columns.tolist())
print("First few rows:\n", df.head())


Processing META...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Finished processing META.
Processing AAPL...
Finished processing AAPL.
Processing MSFT...


[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


Finished processing MSFT.
Processing AMZN...
Finished processing AMZN.
Processing GOOG...


[*********************100%***********************]  1 of 1 completed


Finished processing GOOG.
Columns after adding 'date': ['META_open', 'META_close', 'META_high', 'META_low', 'META_volume', 'META_daily_return', 'META_ma10', 'META_ma30', 'META_bollinger_upper', 'META_bollinger_lower', 'META_rsi', 'META_roc', 'AAPL_open', 'AAPL_close', 'AAPL_high', 'AAPL_low', 'AAPL_volume', 'AAPL_daily_return', 'AAPL_ma10', 'AAPL_ma30', 'AAPL_bollinger_upper', 'AAPL_bollinger_lower', 'AAPL_rsi', 'AAPL_roc', 'MSFT_open', 'MSFT_close', 'MSFT_high', 'MSFT_low', 'MSFT_volume', 'MSFT_daily_return', 'MSFT_ma10', 'MSFT_ma30', 'MSFT_bollinger_upper', 'MSFT_bollinger_lower', 'MSFT_rsi', 'MSFT_roc', 'AMZN_open', 'AMZN_close', 'AMZN_high', 'AMZN_low', 'AMZN_volume', 'AMZN_daily_return', 'AMZN_ma10', 'AMZN_ma30', 'AMZN_bollinger_upper', 'AMZN_bollinger_lower', 'AMZN_rsi', 'AMZN_roc', 'GOOG_open', 'GOOG_close', 'GOOG_high', 'GOOG_low', 'GOOG_volume', 'GOOG_daily_return', 'GOOG_ma10', 'GOOG_ma30', 'GOOG_bollinger_upper', 'GOOG_bollinger_lower', 'GOOG_rsi', 'GOOG_roc']
First few rows

In [63]:
# Define window size (number of past data points)
WINDOW_SIZE = 30  # For example, last 30 hours

# Define feature columns (all columns except 'date' and closing prices if needed)
feature_columns = [col for col in df.columns if col.endswith(('_open', '_high', '_low', '_volume', '_daily_return', '_ma10', '_ma30', '_bollinger_upper', '_bollinger_lower', '_rsi', '_roc'))]

# Extract features and labels
features = df[feature_columns].values  # Shape: (num_samples, num_features)

# Define label columns (closing prices for all tickers)
label_columns = [f"{ticker}_close" for ticker in tickers]
labels = df[label_columns].values  # Shape: (num_samples, num_tickers)

# Initialize lists to store sequences and labels
sequences_list = []
labels_list = []

# Iterate to create sequences
for i in range(WINDOW_SIZE, len(features)):
    # Extract the window of past data points
    window = features[i - WINDOW_SIZE:i]  # Shape: (WINDOW_SIZE, num_features)
    
    # Label: next closing prices for all tickers
    label = labels[i]  # Shape: (num_tickers,)
    
    # Append the sequence and label to the lists
    sequences_list.append(window)
    labels_list.append(label)

print(f"Total sequences: {len(sequences_list)}")
print(f"Sequence shape: {np.array(sequences_list).shape}")  # Expected: (num_samples - WINDOW_SIZE, WINDOW_SIZE, num_features)
print(f"Labels shape: {np.array(labels_list).shape}")        # Expected: (num_samples - WINDOW_SIZE, num_tickers)


Total sequences: 3433
Sequence shape: (3433, 30, 55)
Labels shape: (3433, 5)


In [65]:
import numpy as np

# Convert lists to numpy arrays
all_sequences = np.array(sequences_list)
all_labels = np.array(labels_list)

# Check for NaNs in sequences and labels
print(f"NaNs in sequences: {np.isnan(all_sequences).any()}")
print(f"NaNs in labels: {np.isnan(all_labels).any()}")


NaNs in sequences: False
NaNs in labels: False


In [67]:
def print_sample(sequence, label, tickers, index=0):
    print(f"\nSample {index + 1}:")
    print("Sequence (Last 30 data points):")
    print(sequence)
    print("Label (Next Closing Prices):")
    print(label)
    print("Label Tickers:", tickers)

# Select a sample index
sample_idx = 0

# Print the first sample
print_sample(all_sequences[sample_idx], all_labels[sample_idx], tickers, index=sample_idx)

# Optionally, inspect a few more samples
for i in range(1, 3):
    print_sample(all_sequences[i], all_labels[i], tickers, index=i)



Sample 1:
Sequence (Last 30 data points):
[[-1.81395138 -1.8183092  -1.81268102 ... -1.84398146 -1.44959726
  -2.67878944]
 [-1.81441087 -1.81661816 -1.81617028 ... -1.86080222 -1.53919401
  -2.96764865]
 [-1.80872046 -1.80661273 -1.81932815 ... -1.81741139 -1.28126897
  -2.57512236]
 ...
 [-1.72086329 -1.72551229 -1.72281802 ... -1.6347019  -1.05239104
  -1.14851935]
 [-1.7216324  -1.72713222 -1.72594042 ... -1.6398764  -0.9453778
  -0.96402674]
 [-1.72658056 -1.73079686 -1.72892087 ... -1.65216213 -0.79068225
  -0.73675873]]
Label (Next Closing Prices):
[-1.69203955 -1.46010723 -1.69043174 -1.37396367 -1.50537123]
Label Tickers: ['META', 'AAPL', 'MSFT', 'AMZN', 'GOOG']

Sample 2:
Sequence (Last 30 data points):
[[-1.81441087 -1.81661816 -1.81617028 ... -1.86080222 -1.53919401
  -2.96764865]
 [-1.80872046 -1.80661273 -1.81932815 ... -1.81741139 -1.28126897
  -2.57512236]
 [-1.81190141 -1.81291895 -1.81528324 ... -1.80220698 -1.03888464
  -1.74811603]
 ...
 [-1.7216324  -1.72713222 -1

In [69]:
# Select a specific sample to verify
sample_idx = 0

# Get the corresponding datetime for the label
label_datetime = df.index[WINDOW_SIZE + sample_idx]

print(f"\nVerifying Sample {sample_idx + 1}:")
print(f"Label DateTime: {label_datetime}")

# Extract the actual closing prices from the DataFrame
actual_close = df.loc[label_datetime, label_columns].values

# Extract the sequence from the DataFrame
sequence_start_idx = sample_idx
sequence_end_idx = sample_idx + WINDOW_SIZE
actual_sequence = df.iloc[sequence_start_idx:sequence_end_idx][feature_columns].values

# Compare with the generated sequence
generated_sequence = all_sequences[sample_idx]
if np.array_equal(actual_sequence, generated_sequence):
    print("Sequence matches the expected data.")
else:
    print("Sequence does NOT match the expected data.")



Verifying Sample 1:
Label DateTime: 2022-11-10 14:30:00+00:00
Sequence matches the expected data.
