In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout

In [7]:
pg_data = pd.read_csv('PG_data.csv')

X = pg_data[["Close", "High", "Low", "Open", "Volume", "Returns", "Tomorrow"]]  # Features
y = pg_data["Stock_Direction"]  # Target

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Normalization
scaler = StandardScaler()
scaled_data = scaler.fit_transform(pg_data)

In [10]:
# Initialize and train the XGBoost model
xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
xgb_predictions = xgb_model.predict(X_test)

# Evaluate the model
xgb_accuracy = accuracy_score(y_test, xgb_predictions)
print(f"XGBoost Accuracy: {xgb_accuracy}")
print(classification_report(y_test, xgb_predictions))

XGBoost Accuracy: 0.8267045454545454
              precision    recall  f1-score   support

           0       0.83      0.81      0.82       349
           1       0.82      0.84      0.83       355

    accuracy                           0.83       704
   macro avg       0.83      0.83      0.83       704
weighted avg       0.83      0.83      0.83       704



In [18]:
train_preds = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_preds)
print(f"Training Accuracy: {train_accuracy}")

Training Accuracy: 0.9424715909090909


In [11]:
# Check training performance
train_predictions = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

# Compare with test performance
print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {xgb_accuracy}")

Training Accuracy: 0.9811789772727273
Test Accuracy: 0.8267045454545454


In [13]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
import numpy as np

# First, let's check training vs test performance with the current model
xgb_model = XGBClassifier(random_state=42)

# Fit the model
xgb_model.fit(X_train, y_train)

# Get training performance
train_predictions = xgb_model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)

# Get test performance (which we already had)
test_predictions = xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# Let's also do cross-validation for a more robust estimate
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)
print("\nCross-validation scores:", cv_scores)
print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Training Accuracy: 0.9811789772727273
Test Accuracy: 0.8267045454545454

Cross-validation scores: [0.74822695 0.73001776 0.74955595 0.72824156 0.79396092]
Average CV score: 0.750 (+/- 0.047)


In [17]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

xgb_model = XGBClassifier(
    max_depth=5,             # Slightly deeper trees
    n_estimators=150,        
    learning_rate=0.1,       
    reg_lambda=0.1,          # Very light regularization
    reg_alpha=0.1,          
    subsample=0.9,           # Use 90% of data per tree
    colsample_bytree=0.9,    # Use 90% of features per tree
    random_state=42
)

# Fit and evaluate
xgb_model.fit(X_train, y_train)

# Get performances
train_predictions = xgb_model.predict(X_train)
test_predictions = xgb_model.predict(X_test)

train_accuracy = accuracy_score(y_train, train_predictions)
test_accuracy = accuracy_score(y_test, test_predictions)

print(f"Training Accuracy: {train_accuracy}")
print(f"Test Accuracy: {test_accuracy}")

# Cross-validation
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=5)
print("\nCross-validation scores:", cv_scores)
print(f"Average CV score: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

Training Accuracy: 0.9424715909090909
Test Accuracy: 0.7982954545454546

Cross-validation scores: [0.7393617  0.69449378 0.74955595 0.72646536 0.75310835]
Average CV score: 0.733 (+/- 0.042)
