In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV

# --- Load & Prepare Data ---
file_path = "cleandata.csv"
df = pd.read_csv(file_path)

df['Date'] = pd.to_datetime(df['Date'], format='%b-%Y')
df = df.sort_values('Date').set_index('Date')

target = 'LNG 174K CBM (2-stroke dual fuel) Spot Rate (avg., $/day)'

# --- Feature Engineering ---
df['spot_rate_lag_1'] = df[target].shift(1)
df['spot_rate_lag_2'] = df[target].shift(2)
df['spot_rate_lag_3'] = df[target].shift(3)
df['spot_rate_roll3'] = df[target].rolling(3).mean()
df['spot_rate_roll6'] = df[target].rolling(6).mean()
df['spot_rate_std3'] = df[target].rolling(3).std()
df['month'] = df.index.month
df['year'] = df.index.year

df = df.dropna()

# --- Train/Test Split ---
cutoff_date = pd.Timestamp("2023-12-01")
df_train = df[df.index <= cutoff_date]
df_test = df[(df.index > cutoff_date) & (df.index <= "2024-12-01")]

X_train = df_train.drop(columns=[target])
y_train = df_train[target]
X_test = df_test.drop(columns=[target])
y_test = df_test[target]

# --- Grid Search for Best Hyperparameters ---
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}

grid = GridSearchCV(
    estimator=XGBRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=1
)

grid.fit(X_train, y_train)
print("✅ Best Params:", grid.best_params_)
model = grid.best_estimator_

# --- Fit with Early Stopping ---
model.set_params(early_stopping_rounds=10, eval_metric='mae')
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

# --- Forecast ---
y_pred = model.predict(X_test)

# --- Evaluation ---
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Naive baseline: predict last value
naive_pred = y_test.shift(1).fillna(method='bfill')
naive_mae = mean_absolute_error(y_test, naive_pred)

print(f"📊 MAE: ${mae:,.2f} per day")
print(f"📊 RMSE: ${rmse:,.2f} per day")
print(f"🧪 Naive MAE: ${naive_mae:,.2f} per day")

# --- Combine for Analysis ---
forecast_results = pd.DataFrame({
    'Date': y_test.index,
    'Actual Spot Rate': y_test.values,
    'Predicted Spot Rate': y_pred
}).set_index('Date')

# Optional: Save or plot


Fitting 3 folds for each of 16 candidates, totalling 48 fits
✅ Best Params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
📊 MAE: $36,757.08 per day
📊 RMSE: $43,251.69 per day
🧪 Naive MAE: $9,031.25 per day


  naive_pred = y_test.shift(1).fillna(method='bfill')
