In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional, Attention, Layer
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [2]:
def create_features(df):
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['Daily_Range'] = df['High'] - df['Low']
    df['Volume_SMA_5'] = df['Volume'].rolling(window=5).mean()
    df['Return_Volatility'] = df['Returns'].rolling(window=10).std()

    # Additional indicators
    df['RSI_14'] = compute_rsi(df['Close'], 14)
    df['Momentum_10'] = df['Close'] - df['Close'].shift(10)
    df['MACD'] = df['Close'].ewm(span=12, adjust=False).mean() - df['Close'].ewm(span=26, adjust=False).mean()
    df['ATR'] = df[['High', 'Low', 'Close']].apply(lambda x: max(x['High'] - x['Low'], abs(x['High'] - x['Close']), abs(x['Low'] - x['Close'])), axis=1).rolling(window=14).mean()

    # Lag features
    for lag in range(1, 6):  # Lags for the last 5 days
        df[f'Close_lag_{lag}'] = df['Close'].shift(lag)
    
    # Price changes
    df['Close_pct_change'] = df['Close'].pct_change()  # Percent change of close price

    return df.dropna()

In [3]:
# RSI Calculation
def compute_rsi(series, period=14):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [4]:
# Load and preprocess data
PG_data = pd.read_csv('PG_data.csv')
PG_data = create_features(PG_data)

features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns',
            'SMA_5', 'SMA_20', 'Daily_Range', 'Volume_SMA_5', 
            'Return_Volatility', 'RSI_14', 'Momentum_10', 'MACD', 
            'ATR', 'Close_pct_change', 'Close_lag_1', 'Close_lag_2', 
            'Close_lag_3', 'Close_lag_4', 'Close_lag_5']

X = PG_data[features]
y = PG_data['Stock_Direction']

In [5]:
# Train-test split (time series order)
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [6]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [7]:
# Train XGBoost model with optimized hyperparameters
xgb_model = xgb.XGBClassifier(
    learning_rate=0.005,
    n_estimators=500,
    max_depth=6,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.5,
    random_state=42
)

xgb_model.fit(X_train_resampled, y_train_resampled)
xgb_predictions = xgb_model.predict(X_test)

In [8]:
# Evaluate XGBoost
print("\nXGBoost Model Performance:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test, xgb_predictions))


XGBoost Model Performance:
Accuracy: 0.49358059914407987

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.71      0.56       323
           1       0.55      0.31      0.40       378

    accuracy                           0.49       701
   macro avg       0.51      0.51      0.48       701
weighted avg       0.51      0.49      0.47       701



In [9]:
# Normalize features for LSTM using StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Create sequences for LSTM/GRU
def create_sequences(data, labels, seq_length=10):
    sequences, targets = [], []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        targets.append(labels.iloc[i + seq_length])
    return np.array(sequences), np.array(targets)

X_seq, y_seq = create_sequences(X_scaled, y)

In [11]:
# Train-test split for sequences
split_idx = int(len(X_seq) * 0.8)
X_train_seq, X_test_seq = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_test_seq = y_seq[:split_idx], y_seq[split_idx:]

In [14]:
# Build improved LSTM/GRU model with attention mechanism
class AttentionLayer(Layer):
    def __init__(self):
        super(AttentionLayer, self).__init__()

    def call(self, inputs):
        attention_weights = tf.keras.backend.softmax(inputs, axis=1)
        return tf.reduce_sum(attention_weights * inputs, axis=1)

lstm_gru_model = Sequential([
    Bidirectional(GRU(64, return_sequences=True, activation='relu'), input_shape=(10, len(features))),
    Dropout(0.3),
    AttentionLayer(),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm_gru_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

In [16]:
# Train LSTM/GRU model
history = lstm_gru_model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 50ms/step - accuracy: 0.4953 - loss: 0.7170 - val_accuracy: 0.5107 - val_loss: 0.6938 - learning_rate: 0.0010
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.5250 - loss: 0.6944 - val_accuracy: 0.5286 - val_loss: 0.7041 - learning_rate: 0.0010
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 30ms/step - accuracy: 0.5260 - loss: 0.6918 - val_accuracy: 0.4821 - val_loss: 0.6973 - learning_rate: 0.0010
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.5133 - loss: 0.6945 - val_accuracy: 0.5286 - val_loss: 0.7047 - learning_rate: 0.0010
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.5358 - loss: 0.6889 - val_accuracy: 0.5286 - val_loss: 0.7014 - learning_rate: 2.0000e-04
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [17]:
# Evaluate LSTM/GRU
lstm_gru_predictions = (lstm_gru_model.predict(X_test_seq) > 0.5).astype(int)

print("\nLSTM/GRU Model Performance:")
print("Accuracy:", accuracy_score(y_test_seq, lstm_gru_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test_seq, lstm_gru_predictions))

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 68ms/step

LSTM/GRU Model Performance:
Accuracy: 0.5450643776824035

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.51      0.32      0.39       322
           1       0.56      0.74      0.64       377

    accuracy                           0.55       699
   macro avg       0.53      0.53      0.51       699
weighted avg       0.54      0.55      0.52       699

