## Predictive Modelling 
### ● predicting stock price movements(Trend) using a classical ML model XGBoost, and an LSTM deep learning model. 

#### Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

####  Create features for prediction

In [9]:
def create_features(df):
    # Calculate Returns, Tomorrow, and Stock_Direction first
    df['Returns'] = df['Close'].pct_change() * 100
    df['Tomorrow'] = df['Close'].shift(-1)
    df['Stock_Direction'] = (df['Tomorrow'] > df['Close']).astype(int)
    
    # Technical indicators
    df['SMA_5'] = df['Close'].rolling(window=5).mean()
    df['SMA_20'] = df['Close'].rolling(window=20).mean()
    
    # Price ranges
    df['Daily_Range'] = df['High'] - df['Low']
    
    # Volume indicators
    df['Volume_SMA_5'] = df['Volume'].rolling(window=5).mean()
    
    # Return based features
    df['Return_Volatility'] = df['Returns'].rolling(window=10).std()
    
    return df

#### Prepare data

In [10]:
PG_data = pd.read_csv('PG_stock_data.csv')

PG_data = create_features(PG_data)

In [11]:
# Select features for prediction
features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns', 
           'SMA_5', 'SMA_20', 'Daily_Range', 'Volume_SMA_5', 'Return_Volatility']

X = PG_data[features]
y = PG_data['Stock_Direction']

#### Split the data (80:20)

In [12]:
# Note: We can't use random split as this is time series data
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

print("Training set shape:", X_train.shape)
print("Testing set shape:", X_test.shape)

Training set shape: (2817, 11)
Testing set shape: (705, 11)


### Build and train XGBoost model

In [13]:
xgb_model = xgb.XGBClassifier(
   learning_rate=0.01,
   n_estimators=100,
   max_depth=4,
   min_child_weight=1,
   subsample=0.8,
   random_state=42
)

In [14]:
# Train the model
xgb_model.fit(X_train, y_train)

In [15]:
# Make predictions
xgb_predictions = xgb_model.predict(X_test)

In [16]:
# Evaluate XGBoost model
print("\nXGBoost Model Performance:")
print("Accuracy:", accuracy_score(y_test, xgb_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test, xgb_predictions))


XGBoost Model Performance:
Accuracy: 0.4808510638297872

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.47      0.92      0.62       326
           1       0.60      0.10      0.18       379

    accuracy                           0.48       705
   macro avg       0.53      0.51      0.40       705
weighted avg       0.54      0.48      0.38       705



### LSTM Model

In [18]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Scale the features for LSTM
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [20]:
# Reshape data for LSTM (samples, time steps, features)
def create_sequences(data, seq_length=10):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[i:(i + seq_length)]
        target = y.iloc[i + seq_length]
        sequences.append(seq)
        targets.append(target)
    return np.array(sequences), np.array(targets)

In [21]:
# Create sequences
seq_length = 10
X_seq, y_seq = create_sequences(X_scaled, seq_length)

In [22]:
# Split sequences
split_idx = int(len(X_seq) * 0.8)
X_train_seq = X_seq[:split_idx]
X_test_seq = X_seq[split_idx:]
y_train_seq = y_seq[:split_idx]
y_test_seq = y_seq[split_idx:]

In [23]:
# Build LSTM model
lstm_model = Sequential([
    LSTM(50, activation='relu', input_shape=(seq_length, len(features))),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

  super().__init__(**kwargs)


In [24]:
# Compile model
lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train model
history = lstm_model.fit(
    X_train_seq, y_train_seq,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    verbose=1
)

Epoch 1/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 25ms/step - accuracy: 0.4758 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 2/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.5011 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 3/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.4885 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 4/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4703 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 5/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.4911 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 6/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.4770 - loss: nan - val_accuracy: 0.4698 - val_loss: nan
Epoch 7/50
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s

In [25]:
# Evaluate LSTM model
lstm_predictions = (lstm_model.predict(X_test_seq) > 0.5).astype(int)
print("\nLSTM Model Performance:")
print("Accuracy:", accuracy_score(y_test_seq, lstm_predictions))
print("\nDetailed Classification Report:")
print(classification_report(y_test_seq, lstm_predictions))

[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step  

LSTM Model Performance:
Accuracy: 0.46088193456614507

Detailed Classification Report:
              precision    recall  f1-score   support

           0       0.46      1.00      0.63       324
           1       0.00      0.00      0.00       379

    accuracy                           0.46       703
   macro avg       0.23      0.50      0.32       703
weighted avg       0.21      0.46      0.29       703



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
