In [3]:
import os
import pandas as pd
import numpy as np
from datetime import datetime
import calendar
import statsmodels.api as sm
from pandas import Series
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from math import sqrt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from random import random
#!pip install pmdarima --quiet
import pmdarima as pm

# Code adapted from https://medium.com/data-science/time-series-forecasting-with-arima-sarima-and-sarimax-ee61099e78f6
# Plot data to view
def plot_data(df, feature):
    plt.figure(figsize=(15,7))
    plt.title(str(feature)+" by Month")
    plt.xlabel('Month')
    plt.ylabel(str(feature))
    plt.plot(df)
    plt.show()

#Determine rolling statistics to find trends
def rolling_statistics(df):
    df["rolling_avg"] = df.rolling(window=12).mean() #window size 12 denotes 12 months, giving rolling mean at yearly level
    df["rolling_std"] = df.rolling(window=12).std()

    #Plot rolling statistics
    plt.figure(figsize=(15,7))
    plt.plot(df, color='#379BDB', label='Original')
    plt.plot(df["rolling_avg"], color='#D22A0D', label='Rolling Mean')
    plt.plot(df["rolling_std"], color='#142039', label='Rolling Std')
    plt.legend(loc='best')
    plt.title('Rolling Mean & Standard Deviation')
    plt.show(block=False)

#Augmented Dickey-Fuller Test to test if the time series is stationary
#If ADF has p <= 0.05, data are stationary
def ADF(df):
    print('Results of Dickey Fuller Test for temperature:')
    dftest = adfuller(df, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value

    print(dfoutput)

    return dfoutput

#Standard ARIMA Model
def fit_ARIMA_model(df):
    model = pm.auto_arima(df, 
                      start_p=1, 
                      start_q=1,
                      test='adf', # use adftest to find optimal 'd'
                      max_p=12, max_q=12, # maximum p and q
                      m=1, # frequency of series (if m==1, seasonal is set to FALSE automatically)
                      d=None,# let model determine 'd'
                      seasonal=False, # No Seasonality for standard ARIMA
                      trace=False, #logs 
                      error_action='warn', #shows errors ('ignore' silences these)
                      suppress_warnings=True,
                      stepwise=True

    return model

# SARIMAX Model
def fit_SARIMAX_model(df, exog):
    model = pm.auto_arima(df, start_p=1, start_q=1, exogenous=exog,
                         test='adf', # use adftest to find optimal 'd'
                         max_p=12, max_q=12, m=12, #12 is the frequency of the cycle
                         start_P=0, seasonal=True,
                         d=None, D=1, 
                         trace=False,
                         error_action='ignore',  
                         suppress_warnings=True, 
                         stepwise=True)

    return model

def plot_diagnostics(model):
    model.plot_diagnostics(figsize=(15,12))
    plt.show
    
def ARIMA_forecast(ARIMA_model, df, periods):
    # Forecast
    n_periods = periods
    fitted, confint = ARIMA_model.predict(n_periods=n_periods, return_conf_int=True)
    index_of_fc = pd.date_range(df.index[-1] + pd.DateOffset(months=1), periods = n_periods, freq='MS')

    # make series for plotting purpose
    fitted_series = pd.Series(fitted, index=index_of_fc)
    lower_series = pd.Series(confint[:, 0], index=index_of_fc)
    upper_series = pd.Series(confint[:, 1], index=index_of_fc)

    # Plot
    plt.figure(figsize=(15,7))
    plt.plot(df, color='#1f76b4')
    plt.plot(fitted_series, color='darkgreen')
    plt.fill_between(lower_series.index, 
                    lower_series, 
                    upper_series, 
                    color='k', alpha=.15)

    plt.title("ARIMA Forecast")
    plt.show()

    return fitted_series, lower_series, upper_series

def SARIMAX_forecast(SARIMAX_model, df, exog, periods):
    # Forecast
    n_periods = periods

    forecast_df = pd.DataFrame({"month":pd.date_range(df.index[-1], periods = n_periods, freq='MS').month},
                    index = pd.date_range(df.index[-1] + pd.DateOffset(months=1), periods = n_periods, freq='MS'))

    fitted, confint = SARIMAX_model.predict(n_periods=n_periods, 
                                            return_conf_int=True,
                                            exogenous=exog)
    index_of_fc = pd.date_range(df.index[-1] + pd.DateOffset(months=1), periods = n_periods, freq='MS')

    # make series for plotting purpose
    fitted_series = pd.Series(fitted, index=index_of_fc)
    lower_series = pd.Series(confint[:, 0], index=index_of_fc)
    upper_series = pd.Series(confint[:, 1], index=index_of_fc)

    # Plot
    plt.figure(figsize=(15,7))
    plt.plot(df, color='#1f76b4')
    plt.plot(fitted_series, color='darkgreen')
    plt.fill_between(lower_series.index, 
                    lower_series, 
                    upper_series, 
                    color='k', alpha=.15)

    plt.title("SARIMAX Forecast")
    plt.show()

    return fitted_series, lower_series, upper_series

# date-time parsing function for loading the dataset
def parser(x):
    return datetime.strptime('190'+x, '%Y-%m')
    
def rmse (y_true, y_pred):
    return K.sqrt(K.mean(K.square(y_pred - y_true)))
    
def mape (y_true, y_pred):
    return 100*K.mean(K.sqrt(K.square(y_true - y_pred))/y_true)
    
def pearson (y_true, y_pred):
    return (K.square(K.mean((y_true - K.mean(y_true))*(y_pred - K.mean(y_pred)))))/(K.mean(K.square(y_true - K.mean(y_true)))*K.mean(K.square(y_pred - K.mean(y_pred))))

# create a differenced series
#Taken from: https://machinelearningmastery.com/multi-step-time-series-forecasting-long-short-term-memory-networks-python/
def difference(dataset, interval=1):
    diff = list()
    for i in range(interval, len(dataset)):
        value = dataset[i] - dataset[i - interval]
        diff.append(value)
    return Series(diff)

def prepare_data(data):

    # prepare data for normalization
    series = Series(data)
    values = series.values
    values = values.reshape((len(values), 1))

    # train the normalization
    scaler = StandardScaler()
    scaler = scaler.fit(values)
    print('Mean: %f, StandardDeviation: %f' % (scaler1.mean_, sqrt(scaler1.var_)))
    
    # normalize the dataset and print
    standardized = scaler.transform(values)

    scaled_values = np.array(standardized[:,0])

    return scaler, scaled_values 

def plot_kfold(cv, X, y, ax, n_splits, xlim_max=105):
    
    #Plots the indices for a cross-validation object.
    #Taken from https://www.geeksforgeeks.org/cross-validation-using-k-fold-with-scikit-learn/
    
    #Parameters:
    #cv: Cross-validation object
    #X: Feature set
    #y: Target variable
    #ax: Matplotlib axis object
    #n_splits: Number of folds in the cross-validation
    #xlim_max: Maximum limit for the x-axis
        
    # Set color map for the plot
    cmap_cv = plt.cm.coolwarm
    cv_split = cv.split(X=X, y=y)
        
    for i_split, (train_idx, test_idx) in enumerate(cv_split):
        # Create an array of NaNs and fill in training/testing indices
        indices = np.full(len(X), np.nan)
        indices[test_idx], indices[train_idx] = 1, 0
            
        # Plot the training and testing indices
        ax_x = range(len(indices))
        ax_y = [i_split + 0.5] * len(indices)
        ax.scatter(ax_x, ax_y, c=indices, marker="_", 
                   lw=10, cmap=cmap_cv, vmin=-0.2, vmax=1.2)
    
        # Set y-ticks and labels
        y_ticks = np.arange(n_splits) + 0.5
        ax.set(yticks=y_ticks, yticklabels=range(n_splits),
               xlabel="Weather Station index (file_id)", ylabel="Fold",
               ylim=[n_splits, -0.2], xlim=[0, xlim_max])
    
        # Set plot title and create legend
        ax.set_title("KFold", fontsize=14)
        legend_patches = [Patch(color=cmap_cv(0.8), label="Testing set"), 
                          Patch(color=cmap_cv(0.02), label="Training set")]
        ax.legend(handles=legend_patches, loc=(1.03, 0.8))

#Main

#Configure
n_seq = 60
if n_seq > 46:
    n_lag = 179 - n_seq + 46
else:
    n_lag = 179
n_time_steps = 227
n_test = 1

print("Model Parameters:")
print("n_lag (number of input time steps): "+str(n_lag))
print("n_seq (number of output/future prediction time steps): "+str(n_seq))

# Create 2D array with file_ids to use for sample creation
array = np.array([
    6501, 6541, 6640, 6668, 6678, 
    6687, 6697, 6714, 6744, 6772, 
    6783, 6840, 6844, 6854, 6870, 
    6891, 6895, 6899, 6901, 6909, 
    6929, 6950, 6963, 6969, 6994, 
    7032, 7057, 7094, 7095, 7100, 
    7108, 7116, 7119, 7131, 7139, 
    7152, 7155, 7156, 7182, 7193, 
    7202, 7239, 7280, 7286, 7287, 
    7311, 7321, 7329, 7347, 7350, 
    7354, 7357, 7361, 7414, 7423, 
    7424, 7432, 7463, 7482, 7489, 
    7528, 7531, 7534, 7538, 7549, 
    7553, 7555, 7562, 7571, 7573, 
    7574, 7575, 7585, 7599, 7603, 
    7606, 7622, 7652, 7671, 7704, 
    7786, 7805, 7816, 7838, 7861, 
    7862, 7863, 7870, 7892, 7907, 
    7938, 7962, 7979, 7987, 7999, 
    8000, 8034, 8083, 8120, 8133, 
    8184, 8186, 8247, 8248, 9858])

#Create arrays holding the 5-fold cross-validation indices gathered for consistency across models
train_array = []
test_array = []
    
train_array.append([1, 2, 3, 5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 19, 20, 21, 22, 
                        23, 24, 25, 27, 28, 29, 32, 34, 35, 36, 37, 38, 39, 40, 41, 42, 
                        43, 44, 46, 48, 49, 50, 51, 52, 54, 55, 56, 57, 58, 59, 60, 61, 
                        62, 63, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 78, 79, 81, 
                        82, 83, 84, 85, 86, 87, 88, 90, 91, 92, 93, 95, 97, 98, 100, 101, 102, 103])
test_array.append([0, 4, 10, 12, 18, 26, 30, 31, 33, 45, 47, 53, 64, 65, 77, 80, 89, 94, 96, 99, 104])
    
train_array.append([0, 1, 2, 3, 4, 6, 7, 8, 10, 12, 13, 14, 17, 18, 19, 20, 21, 23, 
                        24, 25, 26, 27, 29, 30, 31, 32, 33, 34, 36, 37, 38, 41, 43, 45, 
                        46, 47, 48, 49, 50, 51, 52, 53, 54, 57, 58, 59, 60, 61, 63, 64, 
                        65, 66, 67, 68, 69, 70, 71, 73, 74, 75, 77, 80, 81, 82, 83, 84, 
                        86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 104])
test_array.append([5, 9, 11, 15, 16, 22, 28, 35, 39, 40, 42, 44, 55, 56, 62, 72, 76, 78, 79, 85, 103])
    
train_array.append([0, 1, 2, 4, 5, 9, 10, 11, 12, 14, 15, 16, 18, 20, 21, 22, 23, 26, 
                    28, 29, 30, 31, 32, 33, 35, 36, 37, 39, 40, 41, 42, 44, 45, 46, 47, 
                    48, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 
                    70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 82, 83, 85, 86, 87, 88, 89, 
                    90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104])
test_array.append([3, 6, 7, 8, 13, 17, 19, 24, 25, 27, 34, 38, 43, 49, 66, 67, 68, 69, 73, 81, 84])

train_array.append([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 
                        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 33, 34, 
                        35, 37, 38, 39, 40, 42, 43, 44, 45, 47, 49, 51, 52, 53, 55, 56, 
                        60, 62, 64, 65, 66, 67, 68, 69, 71, 72, 73, 74, 76, 77, 78, 79, 
                        80, 81, 82, 84, 85, 86, 87, 88, 89, 92, 93, 94, 95, 96, 99, 102, 103, 104])
test_array.append([32, 36, 41, 46, 48, 50, 54, 57, 58, 59, 61, 63, 70, 75, 83, 90, 91, 97, 98, 100, 101])
    
train_array.append([0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 22,
                        24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 38, 39, 40, 41, 
                        42, 43, 44, 45, 46, 47, 48, 49, 50, 53, 54, 55, 56, 57, 58, 59, 
                        61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 72, 73, 75, 76, 77, 78, 
                        79, 80, 81, 83, 84, 85, 89, 90, 91, 94, 96, 97, 98, 99, 100, 101, 103, 104])
test_array.append([1, 2, 14, 20, 21, 23, 29, 37, 51, 52, 60, 71, 74, 82, 86, 87, 88, 92, 93, 95, 102])
    
# Equations for three Principal Components from PCA using response variables combined with other predictors
#PC1=-0.0002714X1+0.02612X2+0.03858X3-0.007658X4+0.001592X5-0.02087X6+0.8564X7-0.1468X8+0.01192X9-0.0001049X10+0.01913X11+0.02076X12
#PC2=0.0003944X1+0.002204X2+0.01052X3+0.3248X4-0.0009976X5-0.04421X6+2.3406X7+0.06103X8+0.08841X9+0.00009018X10+0.05678X11-0.002022X12
#PC3=-0.00007998X1-0.0006124X2-0.001063X3-0.01855X4+0.00001956X5+0.01170X6+0.6076X7+0.4664X8-0.002995X9+0.008185X10+0.8815X11-0.0004730X12
    
# Equations for three Principal Components from PCA omitting both response variables,
#PC-1=-0.0004514X1+0.03194X2-0.04343X3+0.002243X4-0.02252X5+0.9877X6-0.2265X7+0.006144X8-0.0001488X9+0.02943X10
#PC-2=0.0001702X1+0.005484X2+0.2057X3-0.0003188X4-0.02584X5+1.6963X6-0.05890X7+0.05809X8+1.9748X9+0.03686X10
#PC-3=-0.00006323X1-0.001180X2-0.02384X3-0.00002833X4+0.01170X5+0.5204X6+0.4791X7-0.004318X8+0.008271X9+0.8765X10
    
# Get the current working directory 
current_directory = os.getcwd() 

# Print the current working directory 
print(current_directory)

# Define the directory containing the files 
path = current_directory+"\\Modeling\\"
print(path)

filename = path + 'Final_Monthly_Dataset.csv'

results_df = pd.read_csv(filename) #read csv data into a dataframe

results_df = results_df.drop(['Unnamed: 0', 'vapor_pressure'], axis=1)

print(results_df)

print(results_df.columns)

results_full_df = results_df #grab full dataset before removing last 4 years

results_df = results_df[pd.to_datetime(results_df['date']) < datetime.strptime("2021-1-1", "%Y-%m-%d")]

# Get test dataframe
results_test_df = results_full_df
results_test_df = results_test_df[pd.to_datetime(results_test_df['date']) > datetime.strptime("2020-12-31", "%Y-%m-%d")]
results_test_df = results_test_df.reset_index()

#Get one weather station
results_df = results_df[results_df['file_id'] == 6501]
results_test_df = results_test_df[results_test_df['file_id'] == 6501]

print(results_df)

# Get test values for specific humidity and temperature
specific_humidity_test_values = results_test_df['specific_humidity']
temperature_test_values = results_test_df['temperatures']

print(specific_humidity_test_values)
print(temperature_test_values)

# define a series for each column in the data frame that needs to be normalized
# Normalized columns: temperatures, specific_humidity, slp, water, region, wet_bulb_temperature,
# GHI, SNDP, latitude, longitude, elevation, Year, Month, Day, solar_activity, ONI

data = []
scaler = []
scaled_data = []

# Get Data
data.append(results_df['temperatures'])
data.append(results_df['specific_humidity'])
data.append(results_df['slp'])
data.append(results_df['water'])
data.append(results_df['region'])
data.append(results_df['wet_bulb_temperature'])
data.append(results_df['GHI'])
data.append(results_df['SNDP'])
data.append(results_df['solar_activity'])
data.append(results_df['ONI'])
#data11 = results_df['latitude'])
#data12 = results_df['longitude'])
#data13 = results_df['elevation'])
#data14 = results_df['Year'])
#data15 = results_df['Month'])
#data16 = results_df['Day'])

# Data Preparation: Scale Data
for i in range(10):
    scaler.append(StandardScaler())
    scaled_data.append([0])    
    scaler[i], scaled_data[i] = prepare_data(data)

dates = pd.date_range(start='2006-01-01', periods=180, freq='ME')

# ARMA model (for ARMA model, d=0 to remove Integrate (I) component in SARIMAX model and all seasonal parameters are zero)

# Predict temperature

print("ARMA Temperature Predictions:")

y = scaled_data[0] # y = temperature

for i in range(9):
    j=i+1
    exog.append(scaled_data[j])

# Reconstruct the data frame with standardized values
data = pd.DataFrame({'y': y, 'exog1': exog[0], 'exog2': exog[1], 'exog3': exog[2], 'exog4': exog[3], 
                     'exog5': exog[4], 'exog6': exog[5], 'exog7': exog[6], 'exog8': exog[7], 
                     'exog9': exog[8]}, index=dates)

# Fit the ARMA model
model_temp_ARMA = SARIMAX(data['y'], exog=None, order=(1, 0, 1), seasonal_order=(0, 0, 0, 0))
results_temp_ARMA = model_temp_ARMA.fit(disp=False)

# Print the summary of the model
print(results_temp_ARMA.summary())

# Forecasting
n_forecast = 47
forecast_temp_ARMA = results_temp_ARMA.get_forecast(steps=n_forecast, exog=exog[-n_forecast:])
forecast_temp_ARMA_mean = forecast_temp_ARMA.predicted_mean
forecast_temp_ARMA_ci = forecast_temp_ARMA.conf_int()

# Print the forecasted values
print(forecast_temp_ARMA_mean)
print(forecast_temp_ARMA_ci)

# Reshape data
forecast_temp_ARMA_mean = np.array(forecast_temp_ARMA_mean)
forecast_temp_ARMA_mean = forecast_temp_ARMA_mean.reshape(-1,1)

# Inverse transform and print forecast
inversed_temp_ARMA_mean = scaler1.inverse_transform(forecast_temp_ARMA_mean)
inversed_temp_ARMA_ci = scaler1.inverse_transform(forecast_temp_ARMA_ci)
print(inversed_temp_ARMA_mean)
print(inversed_temp_ARMA_ci)
print(temperature_test_values)

dates_predicted = pd.date_range(start='2021-01-01', periods=47, freq='ME')

combined_temp_ARMA = []
for i in range(len(temperature_test_values)):
    combined_temp_ARMA.append([dates_predicted[i], inversed_temp_ARMA_mean[i, 0], temperature_test_values[i]])

combined_temp_ARMA = pd.DataFrame(combined_temp_ARMA)
combined_temp_ARMA.columns = ['prediction_date', 'predicted_temp', 'actual_temp']

combined_temp_ARMA['error_pct'] = 100 * (combined_temp_ARMA['actual_temp'] - combined_temp_ARMA['predicted_temp'])/combined_temp_ARMA['actual_temp']

# Set display option to show all rows
pd.set_option('display.max_rows', 47)

print(combined_temp_ARMA.head(47))


# Predict specific humidity

print("ARMA Specific Humidity Predictions:")

y = scaled_data[1] # y = specific humidity
exog[0] = scaled_data[0] # exog[0] = temperature

# Reconstruct the data frame with standardized values
data = pd.DataFrame({'y': y, 'exog1': exog[0], 'exog2': exog[1], 'exog3': exog[2], 'exog4': exog[3], 
                     'exog5': exog[4], 'exog6': exog[5], 'exog7': exog[6], 'exog8': exog[7], 
                     'exog9': exog[8]}, index=dates)

# Fit the SARIMAX model
model_sh_ARMA = SARIMAX(data['y'], exog=None, order=(1, 0, 1), seasonal_order=(0, 0, 0, 0))
results_sh_ARMA = model_sh_ARMA.fit(disp=False)

# Print the summary of the model
print(results_sh_ARMA.summary())

# Forecasting
n_forecast = 47
forecast_sh_ARMA = results_sh_ARMA.get_forecast(steps=n_forecast, exog=exog[-n_forecast:])
forecast_sh_ARMA_mean = forecast_sh_ARMA.predicted_mean
forecast_sh_ARMA_ci = forecast_sh_ARMA.conf_int()

# Print the forecasted values
print(forecast_sh_ARMA_mean)
print(forecast_sh_ARMA_ci)

# Reshape data
forecast_sh_ARMA_mean = np.array(forecast_sh_ARMA_mean)
forecast_sh_ARMA_mean = forecast_sh_ARMA_mean.reshape(-1,1)

# Inverse transform and print forecast
inversed_sh_ARMA_mean = scaler1.inverse_transform(forecast_sh_ARMA_mean)
inversed_sh_ARMA_ci = scaler1.inverse_transform(forecast_sh_ARMA_ci)
print(inversed_sh_ARMA_mean)
print(inversed_sh_ARMA_ci)
print(specific_humidity_test_values)

dates_predicted = pd.date_range(start='2021-01-01', periods=47, freq='ME')

combined_sh_ARMA = []
for i in range(len(specific_humidity_test_values)):
    combined_sh_ARMA.append([dates_predicted[i], inversed_sh_ARMA_mean[i, 0], specific_humidity_test_values[i]])

combined_sh_ARMA = pd.DataFrame(combined_sh_ARMA)
combined_sh_ARMA.columns = ['prediction_date', 'predicted_sh', 'actual_sh']

combined_sh_ARMA['error_pct'] = 100 * (combined_sh_ARMA['actual_sh'] - combined_sh_ARMA['predicted_sh'])/combined_sh_ARMA['actual_sh']

# Set display option to show all rows
pd.set_option('display.max_rows', 47)

print(combined_sh_ARMA.head(47))

# SARIMAX model

# Predict temperature

print("SARIMAX Temperature Predictions:")

y = scaled_data[0] # y = temperature
exog[0] = scaled_data[1] # exog[0] = specific humidity

# Reconstruct the data frame with standardized values
data = pd.DataFrame({'y': y, 'exog1': exog[0], 'exog2': exog[1], 'exog3': exog[2], 'exog4': exog[3], 
                     'exog5': exog[4], 'exog6': exog[5], 'exog7': exog[6], 'exog8': exog[7], 
                     'exog9': exog[8]}, index=dates)

# Define the exogenous variables
exog = data[['exog1', 'exog2', 'exog3', 'exog4', 'exog5', 'exog6', 'exog7', 'exog8', 
             'exog9']]

print(exog)

# Fit the SARIMAX model
model_temp_SARIMAX = SARIMAX(data['y'], exog=exog, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results_temp_SARIMAX = model_temp_SARIMAX.fit(disp=False)

# Print the summary of the model
print(results_temp_SARIMAX.summary())

# Forecasting
n_forecast = 47
forecast_temp_SARIMAX = results_temp_SARIMAX.get_forecast(steps=n_forecast, exog=exog[-n_forecast:])
forecast_temp_SARIMAX_mean = forecast_temp_SARIMAX.predicted_mean
forecast_temp_SARIMAX_ci = forecast_temp_SARIMAX.conf_int()

# Print the forecasted values
print(forecast_temp_SARIMAX_mean)
print(forecast_temp_SARIMAX_ci)

# Reshape data
forecast_temp_SARIMAX_mean = np.array(forecast_temp_SARIMAX_mean)
forecast_temp_SARIMAX_mean = forecast_temp_SARIMAX_mean.reshape(-1,1)

# Inverse transform and print forecast
inversed_temp_SARIMAX_mean = scaler1.inverse_transform(forecast_temp_SARIMAX_mean)
inversed_temp_SARIMAX_ci = scaler1.inverse_transform(forecast_temp_SARIMAX_ci)
print(inversed_temp_SARIMAX_mean)
print(inversed_temp_SARIMAX_ci)
print(temperature_test_values)

dates_predicted = pd.date_range(start='2021-01-01', periods=47, freq='ME')

combined_temp_SARIMAX = []
for i in range(len(temperature_test_values)):
    combined_temp_SARIMAX.append([dates_predicted[i], inversed_temp_SARIMAX_mean[i, 0], temperature_test_values[i]])

combined_temp_SARIMAX = pd.DataFrame(combined_temp_SARIMAX)
combined_temp_SARIMAX.columns = ['prediction_date', 'predicted_temp', 'actual_temp']

combined_temp_SARIMAX['error_pct'] = 100 * (combined_temp_SARIMAX['actual_temp'] - combined_temp_SARIMAX['predicted_temp'])/combined_temp_SARIMAX['actual_temp']

# Set display option to show all rows
pd.set_option('display.max_rows', 47)

print(combined_temp_SARIMAX.head(47))


# Predict specific humidity

print("SARIMAX Specific Humidity Predictions:")

y = scaled_data[1] # y = specific humidity
exog[0] = scaled_data[0] # exog[0] = temperature

# Reconstruct the data frame with standardized values
data = pd.DataFrame({'y': y, 'exog1': exog[0], 'exog2': exog[1], 'exog3': exog[2], 'exog4': exog[3], 
                     'exog5': exog[4], 'exog6': exog[5], 'exog7': exog[6], 'exog8': exog[7], 
                     'exog9': exog[8]}, index=dates)

# Define the exogenous variables
exog = data[['exog1', 'exog2', 'exog3', 'exog4', 'exog5', 'exog6', 'exog7', 'exog8', 
             'exog9']]

print(exog)

# Fit the SARIMAX model
model_sh_SARIMAX = SARIMAX(data['y'], exog=exog, order=(1, 1, 1), seasonal_order=(1, 1, 1, 12))
results_sh_SARIMAX = model_sh_SARIMAX.fit(disp=False)

# Print the summary of the model
print(results_sh_SARIMAX.summary())

# Forecasting
n_forecast = 47
forecast_sh_SARIMAX = results_sh_SARIMAX.get_forecast(steps=n_forecast, exog=exog[-n_forecast:])
forecast_sh_SARIMAX_mean = forecast_sh_SARIMAX.predicted_mean
forecast_sh_SARIMAX_ci = forecast_sh_SARIMAX.conf_int()

# Print the forecasted values
print(forecast_sh_SARIMAX_mean)
print(forecast_sh_SARIMAX_ci)

# Reshape data
forecast_sh_SARIMAX_mean = np.array(forecast_sh_SARIMAX_mean)
forecast_sh_SARIMAX_mean = forecast_sh_SARIMAX_mean.reshape(-1,1)

# Inverse transform and print forecast
inversed_sh_SARIMAX_mean = scaler1.inverse_transform(forecast_sh_SARIMAX_mean)
inversed_sh_SARIMAX_ci = scaler1.inverse_transform(forecast_sh_SARIMAX_ci)
print(inversed_sh_SARIMAX_mean)
print(inversed_sh_SARIMAX_ci)
print(specific_humidity_test_values)

dates_predicted = pd.date_range(start='2021-01-01', periods=47, freq='ME')

combined_sh_SARIMAX = []
for i in range(len(specific_humidity_test_values)):
    combined_sh_SARIMAX.append([dates_predicted[i], inversed_sh_SARIMAX_mean[i, 0], specific_humidity_test_values[i]])

combined_sh_SARIMAX = pd.DataFrame(combined_sh_SARIMAX)
combined_sh_SARIMAX.columns = ['prediction_date', 'predicted_sh', 'actual_sh']

combined_sh_SARIMAX['error_pct'] = 100 * (combined_sh_SARIMAX['actual_sh'] - combined_sh_SARIMAX['predicted_sh'])/combined_sh_SARIMAX['actual_sh']

# Set display option to show all rows
pd.set_option('display.max_rows', 47)

print(combined_sh_SARIMAX.head(47))



C:\Users\User
C:\Users\User\Modeling\
             date  file_id  temperatures          slp  wet_bulb_temperature  \
0      2006-01-31     6501     12.209677  1018.534543              7.759507   
1      2006-02-28     6501      8.174541  1021.230347              4.326557   
2      2006-03-31     6501     15.676613  1018.968548             10.491486   
3      2006-04-30     6501     22.464167  1014.686944             16.981874   
4      2006-05-31     6501     23.657258  1014.236828             18.675700   
...           ...      ...           ...          ...                   ...   
24057  2024-07-31     9858     28.604704  1013.969355             22.781443   
24058  2024-08-31     9858     29.114919  1015.112097             22.875429   
24059  2024-09-30     9858     24.570278  1014.393750             18.061455   
24060  2024-10-31     9858     21.159140  1018.230376             13.388460   
24061  2024-11-30     9858     12.112917  1015.400069              8.511346   

       specif

ValueError: setting an array element with a sequence.