In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn import linear_model, ensemble, neighbors
from xgboost import XGBRegressor

# Apply seaborn's "whitegrid" style globally so the figures drawn in later cells share it.
sns.set(style="whitegrid")

In [None]:
# Read the semicolon-separated price file.
df = pd.read_csv('../datasets/XAU_1h_data.csv', sep=';', encoding='utf-8')

# Parse the Date column; values that cannot be parsed become NaT instead of raising.
parsed_dates = pd.to_datetime(df['Date'], errors='coerce')
n_invalid_dates = parsed_dates.isna().sum()
df['Date'] = parsed_dates

# Report how many rows are about to be removed, then remove them.
print(f"Dropped {n_invalid_dates} rows due to invalid dates.")
df = df.dropna(subset=['Date'])

# Preview the first rows of the cleaned frame.
df.head(10)

In [None]:
# Forecast horizon: the target is the Close value this many rows ahead.
# NOTE(review): this equals 24 hours only if the hourly rows are contiguous — confirm how gaps are handled.
HORIZON_ROWS = 24

# Put the rows in chronological order so the row shift and the ordered split below are meaningful.
# A stable sort leaves data that is already ordered unchanged.
df = df.sort_values('Date', kind='stable')
df['target'] = df['Close'].shift(-HORIZON_ROWS)

# Keep the labelled rows in a separate frame instead of overwriting `df`,
# so re-running this cell does not drop another HORIZON_ROWS rows each time.
df_labelled = df.dropna(subset=['target'])

X = df_labelled.drop(columns=['target'])
y = df_labelled['target']

print(f"Features shape: {X.shape}, Target shape: {y.shape}")

# Chronological hold-out: the last 20% of rows form the test set.
# A shuffled split would place rows from the test period (and their future-looking targets)
# into the training set, which leaks information and inflates the scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit the scaler on the training rows only, then reuse its statistics for the test rows.
# Only numeric columns are scaled; non-numeric columns are left out of the model inputs.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.select_dtypes(include=[np.number]))
X_test_scaled = scaler.transform(X_test.select_dtypes(include=[np.number]))

print("Data scaled and ready for model training.")

In [None]:
# Fixed seed so the stochastic estimators produce the same scores on every run.
RANDOM_STATE = 42

# Candidate regressors, keyed by the display name used in the printout and plot titles.
models = {
    'Linear Regression': linear_model.LinearRegression(),
    'Random Forest': ensemble.RandomForestRegressor(random_state=RANDOM_STATE),
    'Gradient Boosting': ensemble.GradientBoostingRegressor(random_state=RANDOM_STATE),
    'KNN': neighbors.KNeighborsRegressor(),
    'XGBoost': XGBRegressor(verbosity=0, random_state=RANDOM_STATE)
}

# Fit each model on the scaled training data, score it on the test data and plot the result.
for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    preds = model.predict(X_test_scaled)
    # mean_squared_error returns the MSE; take the square root so the value matches its "RMSE" label.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)

    print(f"{name}: RMSE={rmse:.4f}, R2={r2:.4f}")

    # Plotting predictions vs actual (only the first 100 test samples, to keep the chart readable)
    plt.figure(figsize=(12, 6))
    plt.plot(y_test.values[:100], label='Actual', marker='o', linewidth=2)
    plt.plot(preds[:100], label='Predicted', marker='x', linestyle='--')
    plt.title(f"{name} - Predictions vs Actual (First 100)\nRMSE={rmse:.2f}, R²={r2:.4f}", fontsize=14)
    plt.xlabel("Sample Index")
    plt.ylabel("Gold Price")
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
import mlflow
from mlflow.models import infer_signature

# ARN of the SageMaker MLflow tracking server that runs are logged to.
# NOTE(review): "<account_number>" looks like a redacted placeholder — replace it with the real
# AWS account id before running this cell.
TRACKING_SERVER_ARN = "arn:aws:sagemaker:eu-north-1:<account_number>:mlflow-tracking-server/mlflow-demo-tracking-server"

mlflow.set_tracking_uri(TRACKING_SERVER_ARN)

In [None]:
mlflow.set_experiment("gold_prices_experiment")

# Train and evaluate the model that will be logged and registered.
model = linear_model.LinearRegression()
model.fit(X_train_scaled, y_train)
preds = model.predict(X_test_scaled)
# mean_squared_error returns the MSE; take the square root to get an actual RMSE.
rmse = np.sqrt(mean_squared_error(y_test, preds))
r2 = r2_score(y_test, preds)

params = {
    "model": "LinearRegression"
}

# Start an MLflow run
with mlflow.start_run():
    # Log the hyperparameters
    mlflow.log_params(params)

    # Log the evaluation metrics (the RMSE was previously computed but never recorded)
    mlflow.log_metric("r2 score", r2)
    mlflow.log_metric("rmse", rmse)

    # Infer the model signature
    signature = infer_signature(X_test_scaled, preds)

    # Log the fitted model. Passing a fresh LinearRegression() here would register an
    # untrained estimator that cannot predict.
    model_info = mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="gold_price_linear_regression_model",
        signature=signature,
        # A few rows are enough to illustrate the input format; the full training
        # matrix would be stored alongside the model as an oversized example.
        input_example=X_train_scaled[:5],
        registered_model_name="gold-price-model-turelit",
    )