In [22]:
import pandas as pd

df = pd.read_csv("stock_price_history.csv")
df.head()
df.info()
df.head()

print (len(df))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 135786 entries, 0 to 135785
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   id                135786 non-null  int64  
 1   stock_id          135786 non-null  int64  
 2   price             135786 non-null  float64
 3   trade_time_stamp  135786 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 4.1+ MB
135786


In [23]:
# Data Cleanup

import pandas as pd

# Load CSV
df = pd.read_csv("stock_price_history.csv")

# Clean & preprocess
df['trade_time_stamp'] = pd.to_datetime(df['trade_time_stamp'], errors='coerce')

# Drop rows with invalid timestamps or missing prices
df.dropna(subset=['trade_time_stamp', 'price'], inplace=True)

# Drop duplicates
df.drop_duplicates(inplace=True)

# Filter out invalid price entries (<= 0)
df = df[df['price'] > 0]

# Optional: remove rows with absurd jumps (optional for smoothing)
df.sort_values(by=['stock_id', 'trade_time_stamp'], inplace=True)
df['pct_change'] = df.groupby('stock_id')['price'].pct_change()
df = df[df['pct_change'].abs() < 0.5]  # Remove sudden 50%+ spikes
df.drop(columns='pct_change', inplace=True)

# Final sort & reset index
df.reset_index(drop=True, inplace=True)

# Show result
df.head()

print(f"✅ Rows remaining after cleanup: {len(df)}")


✅ Rows remaining after cleanup: 135767


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns='pct_change', inplace=True)


In [24]:
import numpy as np
import pandas as pd

# Parameters
WINDOW_SIZE = 60      # last 10 minutes (at 10s interval)
LOOKAHEAD = 3         # next 30 seconds
THRESHOLD = 0.003     # 0.3% for NEUTRAL zone

# Output containers
X = []
y = []
meta = []

def get_label(curr_price, future_price, threshold):
    change = (future_price - curr_price) / curr_price
    if change > threshold:
        return 1  # UP
    elif change < -threshold:
        return 0  # DOWN
    else:
        return 2  # NEUTRAL

# Slide over each stock separately
for stock_id, group in df.groupby('stock_id'):
    prices = group['price'].values
    timestamps = group['trade_time_stamp'].values

    for i in range(WINDOW_SIZE, len(prices) - LOOKAHEAD):
        window = prices[i - WINDOW_SIZE:i]             # last 60 prices
        current_price = prices[i - 1]                  # price at t
        future_price = prices[i + LOOKAHEAD - 1]       # price at t+3 (≈30s later)

        label = get_label(current_price, future_price, THRESHOLD)

        X.append(window)
        y.append(label)
        meta.append({
            'stock_id': stock_id,
            'predict_time': timestamps[i + LOOKAHEAD - 1]
        })

# Convert to arrays
X = np.array(X)
y = np.array(y)
meta_df = pd.DataFrame(meta)

# Output summary
classes, counts = np.unique(y, return_counts=True)
class_map = {0: "DOWN", 1: "UP", 2: "NEUTRAL"}
print("✅ Class distribution:")
for c, cnt in zip(classes, counts):
    print(f"{class_map[c]}: {cnt} samples")

print(f"✅ Feature shape: X = {X.shape}, y = {y.shape}")


✅ Class distribution:
DOWN: 1020 samples
UP: 1105 samples
NEUTRAL: 132445 samples
✅ Feature shape: X = (134570, 60), y = (134570,)


In [25]:
# Reshape to (samples, 60, 1) since WINDOW_SIZE is now 60
X = X.reshape((X.shape[0], X.shape[1], 1))

# Normalize each feature column-wise
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.reshape(X.shape[0], -1)).reshape(X.shape)

# Split
X_train, X_val, y_train, y_val = train_test_split(
    X_scaled, y, test_size=0.2, stratify=y, random_state=42
)


In [26]:
class PriceDirectionDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)  # (N, 30, 1)
        self.y = torch.tensor(y, dtype=torch.long)     # Labels: 0, 1, 2

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = PriceDirectionDataset(X_train, y_train)
val_ds = PriceDirectionDataset(X_val, y_val)

train_loader = DataLoader(train_ds, batch_size=256, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=256)


In [27]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader

class CNNPriceClassifier(nn.Module):
    def __init__(self, input_length=60):
        super().__init__()
        self.input_length = input_length

        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=3)
        self.bn1 = nn.BatchNorm1d(64)
        self.pool1 = nn.MaxPool1d(2)

        self.conv2 = nn.Conv1d(64, 128, kernel_size=3)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(2)

        # Dynamically compute flattened size
        with torch.no_grad():
            dummy = torch.zeros(1, 1, input_length) # Batch size 1, 1 channel, input_length
            x = self.pool1(F.relu(self.bn1(self.conv1(dummy))))
            x = self.pool2(F.relu(self.bn2(self.conv2(x))))
            self.flattened_size = x.view(1, -1).shape[1]


        self.fc1 = nn.Linear(self.flattened_size, 64)
        self.dropout = nn.Dropout(0.3)
        self.fc2 = nn.Linear(64, 3)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # (B, 1, 60)
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

In [29]:
from collections import Counter
import torch

# Count class frequencies in y_train
class_counts = Counter(y_train)
total = sum(class_counts.values())

# Inverse frequency weights
weights = torch.tensor([
    total / class_counts[0],  # DOWN
    total / class_counts[1],  # UP
    total / class_counts[2]   # NEUTRAL
], dtype=torch.float32)

weights = weights / weights.sum()  # Optional: normalize
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Model, optimizer, criterion
model = CNNPriceClassifier(input_length=60)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss(weight=weights.to(device))

# Training loop
for epoch in range(1, 31):
    model.train()
    total_loss = 0
    for X_batch, y_batch in train_loader:
        X_batch, y_batch = X_batch.to(device), y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        loss = criterion(output, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            output = model(X_batch)
            preds = torch.argmax(output, dim=1)
            correct += (preds == y_batch).sum().item()
            total += y_batch.size(0)

    acc = correct / total
    print(f"Epoch {epoch}: Loss = {total_loss:.4f} | Val Accuracy = {acc:.4f}")


Epoch 1: Loss = 466.8561 | Val Accuracy = 0.9842
Epoch 2: Loss = 458.6734 | Val Accuracy = 0.9842
Epoch 3: Loss = 459.8231 | Val Accuracy = 0.9842
Epoch 4: Loss = 458.5150 | Val Accuracy = 0.9842


KeyboardInterrupt: 

In [None]:
import joblib

joblib.dump(model, "crypto_xgb_model.pkl")


['crypto_xgb_model.pkl']