In [None]:
# Load the raw inputs. Every CSV is read the same way: the 'Date' column is
# parsed as datetimes and used as the index.
# (The stray `python` / `Copy` lines that preceded this code were paste residue
# from a chat UI; as bare names they raised NameError when the cell ran.)

# Load natural gas and coal prices
ng_data = pd.read_csv('natural_gas_prices.csv', parse_dates=['Date'], index_col='Date')
coal_data = pd.read_csv('coal_prices.csv', parse_dates=['Date'], index_col='Date')

# Load economic indicators
gdp_data = pd.read_csv('gdp_data.csv', parse_dates=['Date'], index_col='Date')
inflation_data = pd.read_csv('inflation_data.csv', parse_dates=['Date'], index_col='Date')

In [None]:
from scipy import stats

# Check for missing values (per-column NaN counts, reported before any cleaning)
print(ng_data.isnull().sum())
print(coal_data.isnull().sum())

# Fill missing values by carrying the last observation forward.
# DataFrame.ffill() replaces fillna(method='ffill'), which is deprecated since
# pandas 2.1; rebinding the name also avoids inplace mutation.
ng_data = ng_data.ffill()
coal_data = coal_data.ffill()

# Remove outliers: keep rows whose 'Price' z-score is strictly below 3 in magnitude.
# nan_policy='omit' matters because forward-fill cannot fill a NaN at the start of
# the series; with the default 'propagate' a single NaN turns every z-score into
# NaN and the `< 3` filter would then discard every row. With 'omit' only the rows
# whose price is still NaN (their z-score is NaN) are dropped.
ng_data = ng_data[np.abs(stats.zscore(ng_data['Price'], nan_policy='omit')) < 3]
coal_data = coal_data[np.abs(stats.zscore(coal_data['Price'], nan_policy='omit')) < 3]

In [None]:
# Build the EDA frames: inner-join each price table with the GDP table on 'Date',
# so only dates present in both inputs survive.
merged_ng = ng_data.merge(gdp_data, on='Date', how='inner')
merged_coal = coal_data.merge(gdp_data, on='Date', how='inner')

# Scatter of natural gas price against GDP; title and axis labels are applied
# on the axes object that seaborn returns.
plt.figure(figsize=(12, 6))
sns.scatterplot(data=merged_ng, x='GDP', y='Price').set(
    title='Natural Gas Prices vs. GDP',
    xlabel='GDP Growth Rate',
    ylabel='Natural Gas Price (USD)',
)
plt.show()

# Correlation matrix
# Restrict to numeric columns first: since pandas 2.0, DataFrame.corr() raises on
# non-numeric columns instead of silently skipping them. select_dtypes gives the
# same "numeric only" result on older and newer pandas alike.
plt.figure(figsize=(10, 8))
sns.heatmap(
    merged_ng.select_dtypes(include='number').corr(),
    annot=True,
    cmap='coolwarm',
    fmt='.2f',
)
plt.title('Correlation Matrix for Natural Gas Data')
plt.show()

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Specify and estimate the ARIMA model on the natural gas 'Price' series.
# order=(p, d, q) = (5, 1, 0): five autoregressive lags, one round of
# differencing, no moving-average terms.
ng_model = ARIMA(endog=ng_data['Price'], order=(5, 1, 0))
ng_model_fit = ng_model.fit()

# Show the estimation summary table
print(ng_model_fit.summary())

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Prepare features and target
feature_cols = ['GDP', 'Inflation']  # Add more features as necessary

# merged_ng is built from ng_data and gdp_data only; inflation_data is loaded but
# never merged, so selecting 'Inflation' would raise KeyError. Bring it in here
# when it is missing (merged the same way the other frames are: inner join on 'Date').
# NOTE(review): assumes inflation_data exposes an 'Inflation' column - confirm against the CSV.
rf_data = merged_ng
if 'Inflation' not in rf_data.columns:
    rf_data = pd.merge(rf_data, inflation_data[['Inflation']], on='Date', how='inner')

# Order rows by index so the split below is chronological.
# NOTE(review): assumes the merged frame is indexed by 'Date' - confirm.
rf_data = rf_data.sort_index()

X = rf_data[feature_cols]
y = rf_data['Price']

# Split data: hold out the final 20% of rows without shuffling. A shuffled split
# on time-ordered data places later observations in the training set and earlier
# ones in the test set, which leaks future information into the model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Fit Random Forest model (seeded so that re-running the notebook reproduces the
# same forest, scores and feature importances)
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from statsmodels.tsa.arima.model import ARIMA

# --- Random Forest: score on the held-out feature rows ---
rf_predictions = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, rf_predictions))

# --- ARIMA: score on a chronological hold-out of the price series ---
# ng_model_fit was estimated on the complete series, so ng_model_fit.forecast()
# produces values for periods *after* the last observation. Scoring those against
# y_test (historical rows) compares unrelated dates. Instead, re-estimate the same
# specification on everything except the final `holdout_steps` observations and
# forecast exactly that window, so predictions and actuals line up one-to-one.
# The horizon is kept at len(X_test), as before.
# Note: this hold-out is taken from ng_data, so it need not cover the same dates
# as the Random Forest test rows; treat the two RMSE values as indicative only.
holdout_steps = len(X_test)
ng_price = ng_data['Price']
ng_train_price = ng_price.iloc[:-holdout_steps]
ng_holdout_price = ng_price.iloc[-holdout_steps:]

# order=(5, 1, 0) mirrors the specification used for the full-sample model
ng_holdout_fit = ARIMA(ng_train_price, order=(5, 1, 0)).fit()
ng_predictions = ng_holdout_fit.forecast(steps=holdout_steps)
ng_rmse = np.sqrt(mean_squared_error(ng_holdout_price, ng_predictions))

print(f'ARIMA RMSE: {ng_rmse}')
print(f'Random Forest RMSE: {rf_rmse}')

In [None]:
# Pull the fitted forest's importance scores and the matching feature labels
# (the columns of X, in the order they were given to the model).
feature_names = X.columns
importances = rf_model.feature_importances_

# Horizontal bar chart: one bar per feature, bar length = importance.
# The title is set on the axes object returned by seaborn.
plt.figure(figsize=(10, 6))
sns.barplot(x=importances, y=feature_names).set_title(
    'Feature Importance in Random Forest Model'
)
plt.show()

In [None]:
from statsmodels.tsa.arima.model import ARIMA

# Backtesting example (using last year as test)
# (The stray `python` / `Copy` lines that preceded this code were paste residue
# from a chat UI; as bare names they raised NameError when the cell ran.)
#
# The full-sample model (ng_model_fit) has already seen the final 365 observations
# and its forecast() refers to periods after the end of the data, so plotting that
# forecast against the last 365 actual prices compares different dates. A real
# backtest holds the window out, fits on the earlier data only, and forecasts it.
backtest_steps = 365  # trailing observations held out; one year only if rows are daily - TODO confirm
if len(ng_data) <= backtest_steps:
    raise ValueError(
        f'Need more than {backtest_steps} observations to backtest, got {len(ng_data)}'
    )

backtest_train = ng_data.iloc[:-backtest_steps]
backtest_data = ng_data.iloc[-backtest_steps:]  # Last year of data

# order=(5, 1, 0) mirrors the specification used for the full-sample model
backtest_fit = ARIMA(backtest_train['Price'], order=(5, 1, 0)).fit()
backtest_predictions = backtest_fit.forecast(steps=len(backtest_data))

# Compare predictions with actual values (both drawn against the held-out dates)
plt.figure(figsize=(14, 7))
plt.plot(backtest_data.index, backtest_data['Price'], label='Actual Prices', color='blue')
plt.plot(backtest_data.index, backtest_predictions, label='Predicted Prices', color='orange')
plt.title('Backtesting Natural Gas Prices')
plt.xlabel('Date')
plt.ylabel('Price (USD)')
plt.legend()
plt.show()