In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [2]:
# Create features for prediction
def create_features(df):
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    df['Daily_Range'] = df['High'] - df['Low']
    df['Volume_SMA_5'] = df['Volume'].rolling(window=5).mean()
    df['Return_Volatility'] = df['Returns'].rolling(window=10).std()

    # Additional indicators
    df['RSI_14'] = compute_rsi(df['Close'], 14)
    df['Momentum_10'] = df['Close'] - df['Close'].shift(10)
    
    return df.dropna()

In [3]:
# RSI Calculation
def compute_rsi(series, period=14):
    delta = series.diff(1)
    gain = (delta.where(delta > 0, 0)).rolling(window=period).mean()
    loss = (-delta.where(delta < 0, 0)).rolling(window=period).mean()
    rs = gain / loss
    return 100 - (100 / (1 + rs))

In [4]:
# Load and preprocess data
PG_data = pd.read_csv('PG_data.csv')
PG_data = create_features(PG_data)

features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns',
            'SMA_5', 'SMA_20', 'Daily_Range', 'Volume_SMA_5', 
            'Return_Volatility', 'RSI_14', 'Momentum_10']

X = PG_data[features]
y = PG_data['Stock_Direction']

In [5]:
# Train-test split (time series order)
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [6]:
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [7]:
# Train XGBoost model with optimized hyperparameters
xgb_model = xgb.XGBClassifier(
    learning_rate=0.005,
    n_estimators=300,
    max_depth=6,
    min_child_weight=2,
    subsample=0.8,
    colsample_bytree=0.8,
    scale_pos_weight=1.5,
    random_state=42
)

xgb_model.fit(X_train_resampled, y_train_resampled)
xgb_predictions = xgb_model.predict(X_test)

In [8]:
# Evaluate XGBoost
print("\nXGBoost Model Performance:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test, xgb_predictions))


XGBoost Model Performance:
Accuracy: 0.5035663338088445

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.46      0.51      0.49       323
           1       0.54      0.50      0.52       378

    accuracy                           0.50       701
   macro avg       0.50      0.50      0.50       701
weighted avg       0.51      0.50      0.50       701



In [9]:
# Normalize features for LSTM
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [10]:
# Create sequences for LSTM
def create_sequences(data, labels, seq_length=10):
    sequences, targets = [], []
    for i in range(len(data) - seq_length):
        sequences.append(data[i:i + seq_length])
        targets.append(labels.iloc[i + seq_length])
    return np.array(sequences), np.array(targets)

X_seq, y_seq = create_sequences(X_scaled, y)

In [11]:
# Train-test split for sequences
split_idx = int(len(X_seq) * 0.8)
X_train_seq, X_test_seq = X_seq[:split_idx], X_seq[split_idx:]
y_train_seq, y_test_seq = y_seq[:split_idx], y_seq[split_idx:]

In [12]:
# Build improved LSTM model
lstm_model = Sequential([
    Bidirectional(LSTM(64, return_sequences=True, activation='relu'), input_shape=(10, len(features))),
    Dropout(0.3),
    Bidirectional(LSTM(32, activation='relu')),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  super().__init__(**kwargs)


In [13]:
# Callbacks
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

In [14]:
# Train LSTM model
history = lstm_model.fit(
    X_train_seq, y_train_seq,
    epochs=100,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 57ms/step - accuracy: 0.5072 - loss: 0.6936 - val_accuracy: 0.5286 - val_loss: 0.6919 - learning_rate: 0.0010
Epoch 2/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.5235 - loss: 0.6923 - val_accuracy: 0.5286 - val_loss: 0.6935 - learning_rate: 0.0010
Epoch 3/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.5237 - loss: 0.6910 - val_accuracy: 0.5286 - val_loss: 0.6922 - learning_rate: 0.0010
Epoch 4/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 28ms/step - accuracy: 0.5075 - loss: 0.6939 - val_accuracy: 0.5286 - val_loss: 0.6923 - learning_rate: 0.0010
Epoch 5/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 29ms/step - accuracy: 0.5027 - loss: 0.6933 - val_accuracy: 0.5286 - val_loss: 0.6925 - learning_rate: 2.0000e-04
Epoch 6/100
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [15]:
# Evaluate LSTM
lstm_predictions = (lstm_model.predict(X_test_seq) > 0.5).astype(int)

print("\nLSTM Model Performance:")
print("Accuracy:", accuracy_score(y_test_seq, lstm_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test_seq, lstm_predictions))

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step

LSTM Model Performance:
Accuracy: 0.5393419170243204

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       322
           1       0.54      1.00      0.70       377

    accuracy                           0.54       699
   macro avg       0.27      0.50      0.35       699
weighted avg       0.29      0.54      0.38       699



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
