In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
#Load data
PG_data = pd.read_csv('PG_data.csv')

In [3]:
# Select basic features
basic_features = ['Open', 'High', 'Low', 'Close', 'Volume', 'Returns']

In [4]:
# Prepare data
X = PG_data[basic_features]
y = PG_data['Stock_Direction']

In [5]:
# Split data (keeping time series nature)
split_index = int(len(X) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

In [6]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [7]:
# 1. XGBoost with GridSearch
xgb_params = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 200],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.8, 0.9]
}

xgb_model = xgb.XGBClassifier(random_state=42)
xgb_grid = GridSearchCV(xgb_model, xgb_params, cv=5, scoring='accuracy')
xgb_grid.fit(X_train_scaled, y_train)

In [8]:
print("XGBoost Best Parameters:", xgb_grid.best_params_)
xgb_predictions = xgb_grid.predict(X_test_scaled)
print("\nXGBoost Performance:")
print(classification_report(y_test, xgb_predictions))

XGBoost Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.9}

XGBoost Performance:
              precision    recall  f1-score   support

           0       0.47      0.92      0.62       325
           1       0.60      0.10      0.17       379

    accuracy                           0.48       704
   macro avg       0.54      0.51      0.40       704
weighted avg       0.54      0.48      0.38       704



In [9]:
# 2. LSTM
# Reshape data for LSTM
def create_sequences(X, y, time_steps=10):
    Xs, ys = [], []
    for i in range(len(X) - time_steps):
        Xs.append(X[i:(i + time_steps)])
        ys.append(y.iloc[i + time_steps])
    return np.array(Xs), np.array(ys)

time_steps = 10
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, time_steps)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, time_steps)

lstm_model = Sequential([
    LSTM(64, input_shape=(time_steps, X_train.shape[1]), return_sequences=True),
    Dropout(0.2),
    LSTM(32),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

lstm_model.fit(X_train_seq, y_train_seq,
               epochs=50,
               batch_size=32,
               validation_split=0.1,
               verbose=0)

lstm_predictions = (lstm_model.predict(X_test_seq) > 0.5).astype(int)
print("\nLSTM Performance:")
print(classification_report(y_test_seq, lstm_predictions))

  super().__init__(**kwargs)


[1m22/22[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step

LSTM Performance:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       319
           1       0.54      1.00      0.70       375

    accuracy                           0.54       694
   macro avg       0.27      0.50      0.35       694
weighted avg       0.29      0.54      0.38       694



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import numpy as np
from datetime import datetime, timedelta

def predict_next_14_days(lstm_model, last_known_data, scaler, time_steps=10):
    """
    Predict the next 14 days using the trained LSTM model.
    
    Parameters:
    -----------
    lstm_model : keras.Model
        Trained LSTM model
    last_known_data : numpy.ndarray
        Last known scaled data points (should be at least time_steps in length)
    scaler : sklearn.preprocessing.StandardScaler
        The scaler used to transform the original data
    time_steps : int
        Number of time steps used in the LSTM model
        
    Returns:
    --------
    predictions : numpy.ndarray
        Array of predictions for the next 14 days
    dates : list
        List of corresponding dates for the predictions
    """
    # Initialize predictions array
    predictions = []
    
    # Get the last time_steps worth of data
    current_sequence = last_known_data[-time_steps:].copy()
    
    # Generate predictions for next 14 days
    for _ in range(14):
        # Reshape the sequence for prediction
        sequence = current_sequence.reshape(1, time_steps, last_known_data.shape[1])
        
        # Make prediction
        pred = lstm_model.predict(sequence, verbose=0)
        binary_pred = (pred > 0.5).astype(int)[0][0]
        predictions.append(binary_pred)
        
        # Update sequence for next prediction
        # Create a new row with the prediction
        new_row = current_sequence[-1].copy()  # Copy the last row of features
        current_sequence = np.roll(current_sequence, -1, axis=0)
        current_sequence[-1] = new_row  # Update the last row
    
    # Generate dates for the predictions
    last_date = datetime.now()
    dates = [(last_date + timedelta(days=i+1)).strftime('%Y-%m-%d') 
             for i in range(14)]
    
    return np.array(predictions), dates

# Example usage:
# Assuming X_test_scaled contains your scaled features
last_known_data = X_test_scaled[-time_steps:]  # Get the last time_steps worth of data

# Make predictions
predictions, dates = predict_next_14_days(lstm_model, last_known_data, scaler, time_steps)

# Print predictions with dates
print("\nPredictions for the next 14 days:")
print("Date\t\tPrediction")
print("-" * 30)
for date, pred in zip(dates, predictions):
    print(f"{date}\t{pred}")

# Optional: Calculate prediction statistics
positive_days = np.sum(predictions == 1)
negative_days = np.sum(predictions == 0)
print(f"\nSummary:")
print(f"Positive predictions: {positive_days} days")
print(f"Negative predictions: {negative_days} days")
print(f"Positive ratio: {positive_days/14:.2%}")

In [10]:
# 3. KNN with GridSearch
knn_params = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
knn_grid = GridSearchCV(knn, knn_params, cv=5, scoring='accuracy')
knn_grid.fit(X_train_scaled, y_train)

In [11]:
print("KNN Best Parameters:", knn_grid.best_params_)
knn_predictions = knn_grid.predict(X_test_scaled)
print("\nKNN Performance:")
print(classification_report(y_test, knn_predictions))

KNN Best Parameters: {'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

KNN Performance:
              precision    recall  f1-score   support

           0       0.48      0.72      0.58       325
           1       0.58      0.34      0.43       379

    accuracy                           0.51       704
   macro avg       0.53      0.53      0.50       704
weighted avg       0.54      0.51      0.50       704



In [12]:
# 4. Random Forest with GridSearch
rf_params = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'class_weight': ['balanced', None]
}

In [None]:
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_params, cv=5, scoring='accuracy')
rf_grid.fit(X_train_scaled, y_train)

In [None]:
print("Random Forest Best Parameters:", rf_grid.best_params_)
rf_predictions = rf_grid.predict(X_test_scaled)
print("\nRandom Forest Performance:")
print(classification_report(y_test, rf_predictions))

In [None]:
# Compare all models
models = {
    'XGBoost': xgb_predictions,
    'LSTM': lstm_predictions,
    'KNN': knn_predictions,
    'Random Forest': rf_predictions
}

In [None]:
print("\nModel Comparison:")
for model_name, predictions in models.items():
    if model_name == 'LSTM':
        acc = accuracy_score(y_test_seq, predictions)
    else:
        acc = accuracy_score(y_test, predictions)
    print(f"{model_name} Accuracy: {acc:.4f}")