In [1]:
import numpy as np
import pandas as pd
import datetime as dt
from scipy.fft import fft
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, explained_variance_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import classification_report
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv("CleanedData.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Date,GoldPricePerGram,LP 95,LP 92,USDRate
0,0,2016-01-01,4907.4,138.0,127.0,144.0
1,1,2016-01-04,5025.5,138.0,127.0,144.0
2,2,2016-01-05,4974.7,138.0,127.0,144.0
3,3,2016-01-06,5051.7,138.0,127.0,144.0
4,4,2016-01-07,5112.0,138.0,127.0,144.0


In [3]:
df.drop("Unnamed: 0", axis=1, inplace=True)

In [4]:
df['Date'] = pd.to_datetime(df['Date']).dt.date

In [5]:
df.sort_values('Date', inplace=True)

**Feature Engineering**

Lag Features

In [6]:
def lag_feature(data,target_column,lagsteps):
  for i in range(1,lagsteps+1):
    data[f"lag_{i}"] = data[target_column].shift(i)
  return data

In [7]:
lagged_data = lag_feature(df, 'GoldPricePerGram',7)

In [8]:
lagged_data.head()

Unnamed: 0,Date,GoldPricePerGram,LP 95,LP 92,USDRate,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7
0,2016-01-01,4907.4,138.0,127.0,144.0,,,,,,,
1,2016-01-04,5025.5,138.0,127.0,144.0,4907.4,,,,,,
2,2016-01-05,4974.7,138.0,127.0,144.0,5025.5,4907.4,,,,,
3,2016-01-06,5051.7,138.0,127.0,144.0,4974.7,5025.5,4907.4,,,,
4,2016-01-07,5112.0,138.0,127.0,144.0,5051.7,4974.7,5025.5,4907.4,,,


Rolling Mean

In [9]:
def rolling_mean(data,target_var,window_size):
  data['rolling_mean'] = data[target_var].rolling(window=window_size).mean()
  return data

In [10]:
roll_data = rolling_mean(lagged_data,'GoldPricePerGram',7)

In [11]:
roll_data.tail()

Unnamed: 0,Date,GoldPricePerGram,LP 95,LP 92,USDRate,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,rolling_mean
3154,2025-04-25,31880.4,361.0,299.0,299.6,32220.8,31637.7,32540.2,32885.2,31858.8,31858.8,31858.8,32125.985714
3155,2025-04-26,31880.4,361.0,299.0,299.6,31880.4,32220.8,31637.7,32540.2,32885.2,31858.8,31858.8,32129.071429
3156,2025-04-27,31880.4,361.0,299.0,299.6,31880.4,31880.4,32220.8,31637.7,32540.2,32885.2,31858.8,32132.157143
3157,2025-04-28,32157.5,361.0,299.0,299.6,31880.4,31880.4,31880.4,32220.8,31637.7,32540.2,32885.2,32028.2
3158,2025-04-29,31901.6,361.0,299.0,299.5,32157.5,31880.4,31880.4,31880.4,32220.8,31637.7,32540.2,31936.971429


Fourier Transformation

In [12]:
def fourier_transformation(data,target_var):
  values = data[target_var].values
  fourier_transform = fft(values)
  data['fourier_transform'] = np.abs(fourier_transform)
  return data

In [13]:
fourier_data = fourier_transformation(roll_data,'GoldPricePerGram')
fourier_data.head(21)

Unnamed: 0,Date,GoldPricePerGram,LP 95,LP 92,USDRate,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,rolling_mean,fourier_transform
0,2016-01-01,4907.4,138.0,127.0,144.0,,,,,,,,,42331990.0
1,2016-01-04,5025.5,138.0,127.0,144.0,4907.4,,,,,,,,12547360.0
2,2016-01-05,4974.7,138.0,127.0,144.0,5025.5,4907.4,,,,,,,4937524.0
3,2016-01-06,5051.7,138.0,127.0,144.0,4974.7,5025.5,4907.4,,,,,,2627601.0
4,2016-01-07,5112.0,138.0,127.0,144.0,5051.7,4974.7,5025.5,4907.4,,,,,4289890.0
5,2016-01-08,5095.5,138.0,127.0,144.0,5112.0,5051.7,4974.7,5025.5,4907.4,,,,2119154.0
6,2016-01-11,5058.6,138.0,127.0,144.0,5095.5,5112.0,5051.7,4974.7,5025.5,4907.4,,5032.2,1749745.0
7,2016-01-12,5014.5,138.0,127.0,144.0,5058.6,5095.5,5112.0,5051.7,4974.7,5025.5,4907.4,5047.5,1924423.0
8,2016-01-13,5021.3,138.0,127.0,144.0,5014.5,5058.6,5095.5,5112.0,5051.7,4974.7,5025.5,5046.9,1035336.0
9,2016-01-14,5028.1,138.0,127.0,144.0,5021.3,5014.5,5058.6,5095.5,5112.0,5051.7,4974.7,5054.528571,1463943.0


Handling Missing Values

In [14]:
fourier_data.isnull().sum()

Unnamed: 0,0
Date,0
GoldPricePerGram,0
LP 95,0
LP 92,0
USDRate,0
lag_1,1
lag_2,2
lag_3,3
lag_4,4
lag_5,5


In [15]:
mean_df = df.head(21)
mean_df.describe()

Unnamed: 0,GoldPricePerGram,LP 95,LP 92,USDRate,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,rolling_mean,fourier_transform
count,21.0,21.0,21.0,21.0,20.0,19.0,18.0,17.0,16.0,15.0,14.0,15.0,21.0
mean,5064.442857,138.0,127.0,144.0,5060.735,5056.242105,5050.25,5044.352941,5038.775,5036.566667,5034.014286,5062.082857,4074783.0
std,64.062724,0.0,0.0,0.0,63.372917,61.750253,57.576581,53.452562,49.834372,50.76666,51.674631,30.596728,9154281.0
min,4907.4,138.0,127.0,144.0,4907.4,4907.4,4907.4,4907.4,4907.4,4907.4,4907.4,5031.057143,573437.4
25%,5024.7,138.0,127.0,144.0,5024.325,5023.95,5023.575,5023.2,5022.725,5022.25,5021.775,5042.628571,914713.3
50%,5058.6,138.0,127.0,144.0,5055.15,5051.7,5049.35,5047.0,5037.55,5028.1,5026.8,5051.314286,1362712.0
75%,5112.0,138.0,127.0,144.0,5099.625,5093.75,5087.075,5072.3,5072.0,5065.45,5056.875,5073.778571,2119154.0
max,5164.1,138.0,127.0,144.0,5164.1,5164.1,5150.5,5133.6,5112.0,5112.0,5112.0,5125.3,42331990.0


In [16]:
fourier_data['lag_1'].fillna(mean_df['lag_1'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  fourier_data['lag_1'].fillna(mean_df['lag_1'].mean(), inplace=True)


In [17]:
fourier_data.fillna({
    'lag_1': mean_df['lag_1'].mean(),
    'lag_2': mean_df['lag_2'].mean(),
    'lag_3': mean_df['lag_3'].mean(),
    'lag_4': mean_df['lag_4'].mean(),
    'lag_5': mean_df['lag_5'].mean(),
    'lag_6': mean_df['lag_6'].mean(),
    'lag_7': mean_df['lag_7'].mean(),
    'rolling_mean': mean_df['rolling_mean'].mean()
}, inplace=True)

In [18]:
fourier_data.isnull().sum()

Unnamed: 0,0
Date,0
GoldPricePerGram,0
LP 95,0
LP 92,0
USDRate,0
lag_1,0
lag_2,0
lag_3,0
lag_4,0
lag_5,0


Adding more time features

In [20]:
fourier_data['Date'] = pd.to_datetime(fourier_data['Date'])

In [21]:
fourier_data['day'] = fourier_data['Date'].dt.day
fourier_data['month'] = fourier_data['Date'].dt.month
fourier_data['dayofweek'] = fourier_data['Date'].dt.dayofweek
fourier_data['quarter'] = fourier_data['Date'].dt.quarter
fourier_data['weekofyear'] = fourier_data['Date'].dt.isocalendar().week.astype(int)
#  trend feature
fourier_data['trend'] = (fourier_data['Date'] - fourier_data['Date'].min()).dt.days

Model Building

In [22]:
X = fourier_data.drop(['GoldPricePerGram','Date'], axis=1)
y = fourier_data['GoldPricePerGram']

In [23]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.2, random_state = 1)

In [24]:
param_grid = {
'learning_rate': [0.01, 0.1, 0.2],
'max_depth': [3, 5, 7],
'subsample': [0.8, 0.9, 1.0],
'reg_alpha': [0, 0.01, 0.1, 1],
'reg_lambda': [0, 0.01, 0.1, 1]

}

grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=3)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_

In [26]:
best_params

{'learning_rate': 0.1,
 'max_depth': 5,
 'reg_alpha': 0.01,
 'reg_lambda': 0.01,
 'subsample': 0.8}

In [27]:
xgb_model = XGBRegressor(**best_params)

xgb_model.fit(X_train, y_train)

In [28]:
y_train_pred = xgb_model.predict(X_train)

y_test_pred = xgb_model.predict(X_test)

In [29]:
def evaluate_model(y_true, y_pred, dataset_name, X):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    print(f"{dataset_name} Evaluation:")
    print(f"  R² Score         : {r2:.4f}")
    print(f"  MAE              : {mae:.4f}")
    print(f"  RMSE             : {rmse:.4f}")
    print("-" * 30)


In [30]:
evaluate_model(y_train, y_train_pred, "Train", X_train)
evaluate_model(y_test, y_test_pred, "Test", X_test)

Train Evaluation:
  R² Score         : 0.9999
  MAE              : 56.2452
  RMSE             : 80.9660
------------------------------
Test Evaluation:
  R² Score         : 0.9995
  MAE              : 94.8882
  RMSE             : 157.2303
------------------------------


Random Walk Checking

In [31]:
y_pred_rw = y_test[:-1]
y_true_rw = y_test[1:]

rmse_rw = np.sqrt(mean_squared_error(y_true_rw, y_pred_rw))
mae_rw = mean_absolute_error(y_true_rw, y_pred_rw)

print(f"Random Walk RMSE: {rmse_rw:.4f}")
print(f"Random Walk MAE : {mae_rw:.4f}")

Random Walk RMSE: 10143.8063
Random Walk MAE : 7999.6220


In [32]:
evaluate_model(y_test, y_test_pred, "Test", X_test)

Test Evaluation:
  R² Score         : 0.9995
  MAE              : 94.8882
  RMSE             : 157.2303
------------------------------


**Conclusion**

The XGBoost model demonstrated exceptional performance in forecasting gold prices, significantly outperforming the Random Walk baseline. With a remarkably low RMSE of 157.23 and MAE of 94.89 compared to the Random Walk RMSE of 10,143.81 and MAE of 7,999.62, the model captures the underlying patterns in the data with high accuracy. An R² score of 0.9995 further confirms the model’s ability to explain nearly all the variance in the target variable. These results validate XGBoost as a highly effective method for gold price prediction in this study.