In [None]:
# Import necessary libraries for data manipulation and numerical operations
import pandas as pd 
import numpy as np

In [None]:
# Load the Technology stocks dataset
df = pd.read_csv("../Datasets/category_csvs/Technology_stocks_data.csv")

In [None]:
# ============================================================================
# LABEL GENERATION: Predict if majority of next 5 days will be UP days
# ============================================================================
import numpy as np
import pandas as pd

# Sort by stock symbol and date to ensure proper time series ordering
df = df.sort_values(['Stock_symbol', 'Date']).reset_index(drop=True)

# Step 1: Compute daily returns for each stock
df['Daily_Return'] = df.groupby('Stock_symbol')['Close'].pct_change()

# Step 2: Mark up/down days (1 = up day, 0 = down or flat day)
df['Up_Day'] = (df['Daily_Return'] > 0).astype(int)

# Step 3: Count UP days in the next 5 trading days (forward-looking window)
df['Up_Count_5d'] = df.groupby('Stock_symbol')['Up_Day'].transform(
    lambda x: x.shift(-1).rolling(window=5, min_periods=5).sum()
)

# Step 4: Create binary label - majority vote (>=3 up days = UP trend)
df['Label'] = np.where(df['Up_Count_5d'] >= 3, 1, 0)

# Step 5: Remove rows without a full 5-day forward window (end of data)
df_filtered = df.dropna(subset=['Up_Count_5d', 'Label'])

print(f"Original samples: {len(df)}")
print(f"After filtering: {len(df_filtered)}")

print("\nClass distribution:")
print(df_filtered['Label'].value_counts())

Original samples: 47362
After filtering: 47122

Class distribution:
Label
1    25794
0    21328
Name: count, dtype: int64


In [None]:
# Ensure data is sorted by stock symbol and date for time series operations
df = df.sort_values(["Stock_symbol", "Date"]).reset_index(drop=True)

In [None]:
# ============================================================================
# MOMENTUM FEATURES: Short, medium, and long-term price returns
# ============================================================================

# Group by stock symbol for per-stock calculations
g = df.groupby("Stock_symbol")

# Calculate percentage returns over different time windows
df["Return_1d"]  = g["Adj Close"].pct_change(1)   # 1-day return
df["Return_5d"]  = g["Adj Close"].pct_change(5)   # 5-day return (weekly)
df["Return_20d"] = g["Adj Close"].pct_change(20)  # 20-day return (monthly)

In [None]:
# ============================================================================
# MOVING AVERAGE FEATURES: Trend indicators
# ============================================================================

# Calculate moving averages
df["MA5"]  = g["Adj Close"].transform(lambda x: x.rolling(5).mean())   # 5-day MA
df["MA20"] = g["Adj Close"].transform(lambda x: x.rolling(20).mean())  # 20-day MA

# Calculate price relative to moving averages (>1 means above MA)
df["MA5_Ratio"]  = df["Adj Close"] / df["MA5"]
df["MA20_Ratio"] = df["Adj Close"] / df["MA20"]

In [None]:
# ============================================================================
# TREND FEATURE: Relationship between short and long-term moving averages
# ============================================================================

# Positive values indicate short-term MA above long-term MA (bullish)
df["Trend_5_20"] = (df["MA5"] - df["MA20"]) / df["MA20"]

In [None]:
# ============================================================================
# DISTANCE FROM EXTREMES: Position relative to recent highs/lows
# ============================================================================

# Get 20-day high and low values
df["High_20"] = g["High"].transform(lambda x: x.rolling(20).max())
df["Low_20"]  = g["Low"].transform(lambda x: x.rolling(20).min())

# Calculate distance from extremes (negative = below high/low)
df["Dist_20High"] = (df["Adj Close"] - df["High_20"]) / df["High_20"]
df["Dist_20Low"]  = (df["Adj Close"] - df["Low_20"]) / df["Low_20"]

In [None]:
# ============================================================================
# VOLATILITY FEATURES: Short and medium-term price volatility
# ============================================================================

# Calculate rolling standard deviation of returns (volatility measure)
df["Vol_5d"]  = g["Return_1d"].transform(lambda x: x.rolling(5).std())   # 5-day volatility
df["Vol_20d"] = g["Return_1d"].transform(lambda x: x.rolling(20).std())  # 20-day volatility

In [None]:
# ============================================================================
# VOLATILITY RATIO: Recent vs historical volatility
# ============================================================================

# Ratio > 1 indicates increasing volatility
df["Vol_Ratio"] = df["Vol_5d"] / df["Vol_20d"]

In [None]:
# ============================================================================
# GAP FEATURE: Overnight price gap
# ============================================================================

# Calculate gap between today's open and yesterday's close
prev_close = g["Adj Close"].shift(1)
df["Gap"] = (df["Open"] - prev_close) / prev_close

In [None]:
# ============================================================================
# VOLUME CHANGE: Day-over-day volume change
# ============================================================================

# Daily percentage change in trading volume
df["Vol_Change"] = g["Volume"].pct_change()

In [None]:
# ============================================================================
# VOLUME RATIO: Current volume relative to 20-day average
# ============================================================================

# Calculate 20-day average volume
df["Vol_MA20"] = g["Volume"].transform(lambda x: x.rolling(20).mean())

# Ratio > 1 indicates above-average volume
df["Vol_Ratio_20"] = df["Volume"] / df["Vol_MA20"]

In [None]:
# ============================================================================
# PRICE-VOLUME SCORE: Combined price and volume momentum
# ============================================================================

# Positive score = price up with high volume (strong signal)
df["PV_Score"] = df["Return_1d"] * df["Vol_Ratio_20"]

In [None]:
# ============================================================================
# VOLUME TREND: Short-term vs medium-term volume trend
# ============================================================================

# Calculate 5-day average volume
df["Vol_MA5"] = g["Volume"].transform(lambda x: x.rolling(5).mean())

# Ratio > 1 indicates increasing volume trend
df["Vol_Trend"] = df["Vol_MA5"] / df["Vol_MA20"]

In [None]:
# ============================================================================
# NET SENTIMENT: Difference between positive and negative sentiment
# ============================================================================

# Net sentiment score (positive - negative article sentiment)
df["Sent_Net"] = df["positive"] - df["negative"]

In [None]:
# ============================================================================
# SENTIMENT MOVING AVERAGE: Smoothed sentiment trend
# ============================================================================

# 5-day moving average of net sentiment
df["Sent_MA5"] = g["Sent_Net"].transform(
    lambda x: x.rolling(5).mean()
)

In [None]:
# ============================================================================
# SENTIMENT MOMENTUM: Change in sentiment over 5 days
# ============================================================================

# Positive = improving sentiment
df["Sent_Mom5"] = g["Sent_Net"].diff(5)

In [None]:
# ============================================================================
# NEWS INTENSITY: Current news coverage relative to average
# ============================================================================

# Calculate 20-day average article count
df["Art_MA20"] = g["article_count"].transform(
    lambda x: x.rolling(20).mean()
)

# Ratio > 1 indicates higher than average news coverage
df["News_Intensity"] = df["article_count"] / df["Art_MA20"]

In [None]:
# ============================================================================
# LAGGED FEATURES: Historical values to capture delayed effects
# ============================================================================

# Lagged return and sentiment features
df["Ret_Lag1"]  = g["Return_1d"].shift(1)   # Yesterday's return
df["Ret_Lag5"]  = g["Return_5d"].shift(5)   # Return from 5 days ago
df["Sent_Lag3"] = g["Sent_Net"].shift(3)    # Sentiment from 3 days ago

In [None]:
# ============================================================================
# FEATURE LIST: All engineered features for the model
# ============================================================================

features = [
    # Momentum features - capture price trends
    "Return_1d","Return_5d","Return_20d",
    "MA5_Ratio","MA20_Ratio","Trend_5_20",
    "Dist_20High","Dist_20Low",

    # Volatility features - capture price stability/risk
    "Vol_5d","Vol_20d",
    "Vol_Ratio","Gap",

    # Volume features - capture trading activity
    "Vol_Change","Vol_Ratio_20","PV_Score","Vol_Trend",

    # Sentiment features - capture market sentiment from news
    "Sent_Net","Sent_MA5","Sent_Mom5",
    "News_Intensity",

    # Lagged features - capture historical effects
    "Ret_Lag1","Ret_Lag5","Sent_Lag3"
]

In [None]:
# Display the raw features (before normalization)
df[features]

Unnamed: 0,Return_1d,Return_5d,Return_20d,MA5_Ratio,MA20_Ratio,Trend_5_20,Dist_20High,Dist_20Low,Vol_5d,Vol_20d,...,Vol_Ratio_20,PV_Score,Vol_Trend,Sent_Net,Sent_MA5,Sent_Mom5,News_Intensity,Ret_Lag1,Ret_Lag5,Sent_Lag3
0,,,,,,,,,,,...,,,,0.0,,,,,,
1,0.080029,,,,,,,,,,...,,,,0.0,,,,,,
2,-0.190918,,,,,,,,,,...,,,,0.0,,,,0.080029,,
3,-0.080374,,,,,,,,,,...,,,,0.0,,,,-0.190918,,0.0
4,0.213160,,,1.030031,,,,,,,...,,,,0.0,0.000000,,,-0.080374,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47357,0.014143,-0.025169,0.174666,0.995012,1.079556,0.084968,-0.053580,0.174822,0.017633,0.028121,...,0.671484,0.009497,1.232345,0.0,-0.017359,-0.228216,0.0,-0.028988,0.166917,0.0
47358,0.015454,-0.002850,0.176874,1.010972,1.087282,0.075481,-0.038954,0.183440,0.019696,0.028142,...,0.534997,0.008268,0.849365,0.0,0.000000,0.086795,0.0,0.014143,0.144200,0.0
47359,0.022605,0.033073,0.190065,1.027027,1.102075,0.073073,-0.017230,0.199652,0.020442,0.028314,...,0.540786,0.012224,0.751300,0.0,0.000000,0.000000,0.0,0.015454,0.112789,0.0
47360,0.001053,0.023643,0.179144,1.023248,1.094066,0.069209,-0.016195,0.187266,0.020454,0.028368,...,0.690363,0.000727,0.669530,0.0,0.000000,0.000000,0.0,0.022605,0.125491,0.0


In [None]:
# ============================================================================
# FEATURE NORMALIZATION: Z-score normalization using rolling window
# ============================================================================
import numpy as np
import pandas as pd

# Use 252-day window (approximately 1 trading year)
WINDOW = 252

# Normalize each feature using rolling mean and standard deviation
for f in features:
    # Calculate rolling statistics per stock
    rolling_mean = g[f].transform(
        lambda x: x.rolling(WINDOW).mean()
    )
    
    rolling_std = g[f].transform(
        lambda x: x.rolling(WINDOW).std()
    )
    
    # Z-score normalization: (value - mean) / std
    # Add small epsilon to avoid division by zero
    df[f + "_z"] = (df[f] - rolling_mean) / (rolling_std + 1e-8)

In [None]:
# Create list of normalized feature names (all ending with '_z')
final_features = [f + "_z" for f in features]

In [None]:
# Remove rows with missing values in features or labels
df_model = df.dropna(subset=final_features + ["Label"])

In [None]:
# ============================================================================
# IMPORT ML LIBRARIES
# ============================================================================
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    roc_auc_score,        # Area under ROC curve
    accuracy_score,       # Overall accuracy
    precision_score,      # Precision (TP / (TP + FP))
    recall_score,         # Recall (TP / (TP + FN))
    f1_score,            # Harmonic mean of precision and recall
    log_loss,            # Logarithmic loss
    classification_report # Detailed classification metrics
)

In [None]:
# Define feature columns (all normalized features) and target variable
FEATURES = [c for c in df_model.columns if c.endswith("_z")]
TARGET = "Label"

In [None]:
# ============================================================================
# PREPARE DATA FOR TIME SERIES WALK-FORWARD VALIDATION
# ============================================================================
import numpy as np
import pandas as pd

# Create a copy and ensure Date column is datetime type
df_model = df_model.copy()
df_model["Date"] = pd.to_datetime(df_model["Date"])

# Extract all unique months for potential monthly splits
months = (
    df_model["Date"]
    .dt.to_period("M")
    .drop_duplicates()
    .sort_values()
)

In [None]:
# ============================================================================
# MODEL PIPELINE: XGBoost Classifier with optimized hyperparameters
# ============================================================================
from xgboost import XGBClassifier

def make_pipeline():
    """
    Create XGBoost pipeline with tuned hyperparameters
    
    Returns:
        Pipeline: Scikit-learn pipeline with XGBoost classifier
    """
    pipe = Pipeline([
        ("model", XGBClassifier(
            n_estimators=400,        # Number of boosting rounds
            max_depth=5,             # Maximum tree depth
            learning_rate=0.03,      # Step size shrinkage
            subsample=0.8,           # Fraction of samples for each tree
            colsample_bytree=0.8,    # Fraction of features for each tree
            eval_metric="auc",       # Evaluation metric
            random_state=42,         # For reproducibility
            n_jobs=-1                # Use all CPU cores
        ))
    ])
    
    return pipe

In [None]:
# ============================================================================
# WALK-FORWARD VALIDATION: Train on past year, test on next month
# ============================================================================
import numpy as np
import pandas as pd

TRAIN_DAYS = 252      # ~1 year of training data
TEST_DAYS  = 21       # ~1 month of testing data

results = []

# Get sorted unique dates
dates = df_model["Date"].sort_values().unique()

# Walk forward through time
for i in range(TRAIN_DAYS, len(dates), TEST_DAYS):
    # Define training period (1 year lookback)
    train_start = dates[i - TRAIN_DAYS]
    train_end   = dates[i - 1]
    
    # Define test period (1 month forward)
    test_start  = dates[i]
    test_end    = dates[min(i + TEST_DAYS - 1, len(dates)-1)]
    
    # ----------------------------
    # Split data into train and test sets
    # ----------------------------
    train = df_model[
        (df_model["Date"] >= train_start) &
        (df_model["Date"] <= train_end)
    ]
    
    test = df_model[
        (df_model["Date"] >= test_start) &
        (df_model["Date"] <= test_end)
    ]
    
    # Skip if no test data available
    if len(test) == 0:
        continue
    
    # Separate features and target
    X_train = train[FEATURES]
    y_train = train[TARGET]
    
    X_test  = test[FEATURES]
    y_test  = test[TARGET]
    
    # ----------------------------
    # Train the model
    # ----------------------------
    pipe = make_pipeline()
    pipe.fit(X_train, y_train)
    
    # ----------------------------
    # Generate predictions
    # ----------------------------
    y_prob = pipe.predict_proba(X_test)[:,1]  # Probability of positive class
    y_pred = (y_prob > 0.5).astype(int)       # Binary predictions
    
    # ----------------------------
    # Evaluate model performance
    # ----------------------------
    auc  = roc_auc_score(y_test, y_prob)
    acc  = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    ll = log_loss(y_test, y_prob)
    
    # Store results for this fold
    results.append({
        "train_start": train_start,
        "train_end": train_end,
        "test_start": test_start,
        "test_end": test_end,
        "auc": auc,
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "log_loss": ll,
        "n_train": len(train),
        "n_test": len(test)
    })
    
    # Print progress for each fold
    print(
        f"{test_start} | "
        f"AUC: {auc:.3f} | "
        f"ACC: {acc:.3f} | "
        f"Precision: {precision:.3f} | "
        f"Recall: {recall:.3f} | "
        f"F1: {f1:.3f} | "
        f"Log Loss: {ll:.3f}"
    )

2022-01-28 00:00:00 | AUC: 0.735 | ACC: 0.663 | Precision: 0.654 | Recall: 0.682 | F1: 0.668 | Log Loss: 0.600
2022-03-01 00:00:00 | AUC: 0.796 | ACC: 0.711 | Precision: 0.727 | Recall: 0.787 | F1: 0.756 | Log Loss: 0.537
2022-03-30 00:00:00 | AUC: 0.743 | ACC: 0.686 | Precision: 0.471 | Recall: 0.608 | F1: 0.531 | Log Loss: 0.580
2022-04-29 00:00:00 | AUC: 0.761 | ACC: 0.682 | Precision: 0.720 | Recall: 0.632 | F1: 0.673 | Log Loss: 0.590
2022-05-31 00:00:00 | AUC: 0.749 | ACC: 0.667 | Precision: 0.616 | Recall: 0.693 | F1: 0.652 | Log Loss: 0.597
2022-06-30 00:00:00 | AUC: 0.838 | ACC: 0.773 | Precision: 0.798 | Recall: 0.837 | F1: 0.817 | Log Loss: 0.490
2022-08-01 00:00:00 | AUC: 0.801 | ACC: 0.702 | Precision: 0.641 | Recall: 0.839 | F1: 0.727 | Log Loss: 0.567
2022-08-30 00:00:00 | AUC: 0.828 | ACC: 0.754 | Precision: 0.560 | Recall: 0.688 | F1: 0.617 | Log Loss: 0.491
2022-09-29 00:00:00 | AUC: 0.811 | ACC: 0.725 | Precision: 0.778 | Recall: 0.751 | F1: 0.764 | Log Loss: 0.518
2

In [None]:
# ============================================================================
# AGGREGATE RESULTS: Overall and year-by-year performance
# ============================================================================
import numpy as np
import pandas as pd

# Convert results to DataFrame
results_df = pd.DataFrame(results)

print("\n==== OVERALL PERFORMANCE ====\n")

# Calculate mean metrics across all folds
print("Mean AUC: ", results_df["auc"].mean())
print("Mean ACC: ", results_df["accuracy"].mean())
print("Mean Precision: ", results_df["precision"].mean())
print("Mean Recall: ", results_df["recall"].mean())
print("Mean F1: ", results_df["f1"].mean())
print("Mean Log Loss: ", results_df["log_loss"].mean())

print("\nBy Year:")

# Extract year from test start date
results_df["year"] = results_df["test_start"].dt.year

# Show performance breakdown by year
print(
    results_df.groupby("year")[["auc","accuracy"]].mean()
)


==== OVERALL PERFORMANCE ====

Mean AUC:  0.7706336624984312
Mean ACC:  0.7039930924495074
Mean Precision:  0.7016993688694898
Mean Recall:  0.7129008572512185
Mean F1:  0.7039794986299723
Mean Log Loss:  0.5592356082286257

By Year:
           auc  accuracy
year                    
2022  0.776395  0.708288
2023  0.764348  0.699308
