In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import mlflow
import mlflow.sklearn

In [4]:
# Load dataset
df = pd.read_csv("completed_Apples_stock price dataset.csv")

In [5]:
df.head(3)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,timestamp,stock_price,nasdaq_index,sp500_index,inflation_rate,unemployment_rate,interest_rate,market_sentiment,date,time
0,0,4,2010-01-01 04:00:00,98.983464,8002.448861,2997.154387,4.270254,4.942654,1.929987,-0.223077,2010-01-01,04:00:00
1,1,5,2010-01-01 05:00:00,99.022103,8011.832789,3004.510779,2.321292,4.095568,1.664753,-0.828119,2010-01-01,05:00:00
2,2,6,2010-01-01 06:00:00,108.127409,8028.157784,3010.735533,2.62614,5.447153,2.995546,-0.813849,2010-01-01,06:00:00


In [6]:
import json
try:
    with open("model_scores.json", "r") as f:
            final_scores = json.load(f)
except FileNotFoundError: 
    final_scores = {}   # start fresh if file doesn't exist

In [7]:
# Create lag features (example: last 5 days of stock price)
for lag in range(1, 6):
    df[f'lag_{lag}'] = df['stock_price'].shift(lag)

In [8]:
# Drop NaN rows created by shifting
df = df.dropna()

In [9]:
# Define features (lags + exogenous variables)
features = [f'lag_{lag}' for lag in range(1, 6)] + ["nasdaq_index", "market_sentiment"]
x = df[features]
y = df['stock_price']
features

['lag_1',
 'lag_2',
 'lag_3',
 'lag_4',
 'lag_5',
 'nasdaq_index',
 'market_sentiment']

In [10]:
# Train-test split (80/20)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=False)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((40469, 7), (10118, 7), (40469,), (10118,))

In [11]:
xgb = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb.fit(x_train, y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [12]:
y_pred = xgb.predict(x_test)

RMSE = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("RMSE:", RMSE)
print("R²:", r2)

RMSE: 68.4450561269124
R²: -2.2212164134592585


In [13]:
# cross validation

#from sklearn.tree import DecisionTreeClassifier
#model = DecisionTreeClassifier(criterion='gini', max_depth=15)
#import xgboost as xgb

xgb_model = XGBRegressor(n_estimators=400, learning_rate=0.01, max_depth=5)

# Set up ShuffleSplit cross-validator
from sklearn.model_selection import ShuffleSplit, cross_validate
shuffle_split = ShuffleSplit(n_splits=200, test_size=0.3, random_state=42)

# Perform cross-validation and collect both train and test scores
cv_results = cross_validate(xgb_model, x, y, cv=shuffle_split, scoring={'r2':'r2', 'rmse':'neg_root_mean_squared_error'}, return_train_score=True)

# Extract train and test scores
train_r2 = cv_results['train_r2'].mean()
test_r2 = cv_results['test_r2'].mean()
train_rmse = -cv_results['train_rmse'].mean()
test_rmse = -cv_results['test_rmse'].mean()

# Show individual scores and their means
#print("cross validation: Train R2:" , np.round(train_scores.mean(),2))
#print("cross validation: test R2:" , np.round(test_scores.mean(),2))
print("Cross Validation REsults:")
print("Train R2::", train_r2)
print("Test  R2::", test_r2)
print("Train RMSE::", train_rmse)
print("Test  RMSE::", test_rmse)

with mlflow.start_run(run_name=f"XGBoost"):            
    mlflow.log_metric("RMSE", float(RMSE))
    mlflow.log_metric("r2Square", float(r2))
    #mlflow.log_artifact(f"XGBoost_{n_estimators}_{learning_rate}_{max_depth}.png") 
    mlflow.sklearn.log_model(xgb, name="XGBReg")

Cross Validation REsults:
Train R2:: 0.9943875207773405
Test  R2:: 0.9940290080916938
Train RMSE:: 6.846305538061444
Test  RMSE:: 7.061370677020139


2026/01/31 10:32:22 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2026/01/31 10:32:22 INFO mlflow.store.db.utils: Updating database tables
2026/01/31 10:32:22 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/31 10:32:22 INFO alembic.runtime.migration: Will assume non-transactional DDL.
2026/01/31 10:32:22 INFO alembic.runtime.migration: Context impl SQLiteImpl.
2026/01/31 10:32:22 INFO alembic.runtime.migration: Will assume non-transactional DDL.
The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exce

In [14]:
final_scores["XGBoost"] = {"AIC": "-", "RMSE": train_rmse, "R2": train_r2, "MAE": "-" }

In [15]:
import json

# Save model score
with open("model_scores.json", "w") as f:
    json.dump(final_scores, f, indent=4)