In [22]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from functions.naming import rename_columns
import tensorflow as tf

# Assuming your data is in a pandas DataFrame named 'df' with a datetime index
# and a column named 'monthly_data'
# Replace this with your actual data loading process
# For simplicity, I'll create a dummy dataset
# Load the data
df = pd.read_excel("/Users/athanasioskaravangelis/Desktop/RSM BAM/Workshop/pko_forecasting/data/PKO_Initial_Dataset.xlsx")
#rename columns
df = rename_columns(df)
# select only the values after 2010-01-01
df['date'] = pd.to_datetime(df['date'], format='%b-%y')
df = df[df['date'] > '2009-12-02']

# Ensure the date column is the index
df.set_index('date', inplace=True)

# Specify columns to exclude
exclude_columns = [
    'pko_total_supply_malaysia', 'indonesia_disaster', 'malaysia_disaster',
    'pko_fob_malaysia', 'jet_fuel_us', 'jet_fuel_europe', 'soybean_oil_zlz2',
    'tallow_fob_us_gulf', 'bio_ethanol', 'rspo', 'palm_oil_cif_nwe', 'palm_olein_fob_malaysia',
    'palm_stearin_cif_rotterdam', 'fatty_alcohol_c12_14_fob_asia', 'fatty_alcohol_c16_18_fob_asia',
    'fatty_alcohol_c12_14_fd_nwe', 'jet_fuel_us_usd_mt'
]

# Filter the dataset to exclude specified columns
df_filtered = df.drop(columns=exclude_columns)

# Update NUM_FEATURES based on the filtered dataset
num_features = df_filtered.shape[1]
num_features

# fill nas with average values
df = df_filtered.fillna(df_filtered.mean())

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler(feature_range=(0, 1))
df['pko_cif_rotterdam'] = scaler.fit_transform(df[['pko_cif_rotterdam']])

# Create sequences for the LSTM model
def create_sequences(data, seq_length):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[i:i + seq_length]
        target = data[i + seq_length]
        sequences.append(seq)
        targets.append(target)
    return np.array(sequences), np.array(targets)

# Define the sequence length (number of past months to consider)
seq_length = 12

# Create sequences and targets
X, y = create_sequences(df['pko_cif_rotterdam'].values, seq_length)

# Reshape the input data for LSTM (samples, time steps, features)
X = X.reshape((X.shape[0], X.shape[1], 1))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# Build the LSTM model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(seq_length, 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')  # You can experiment with different optimizers and loss functions



In [23]:
# Train the model on the training set
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.1)

# Evaluate the model on the test set
loss = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}')

Epoch 1/50


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test Loss: 0.022817624732851982


In [24]:
# Forecasting - predicting 12 months into the future
last_sequence = X_test[-1]
forecast = []

for i in range(12):
    prediction = model.predict(np.reshape(last_sequence, (1, seq_length, 1)))
    forecast.append(prediction[0, 0])
    last_sequence = np.roll(last_sequence, -1)
    last_sequence[:, -1] = prediction[0, 0]  # Corrected line

# Inverse transform the forecasted values to get the actual values
forecast = scaler.inverse_transform(np.array(forecast).reshape(-1, 1))

# Create a DataFrame for the forecasted values
forecast_dates = pd.date_range(start=df.index[-1], periods=13, freq='MS')[1:]
forecast_df = pd.DataFrame({'forecasted_data': forecast.flatten()}, index=forecast_dates)

# Print or visualize the forecasted data
print(forecast_df)


            forecasted_data
2023-12-01       979.184814
2024-01-01      1008.167603
2024-02-01      1028.095825
2024-03-01      1041.834473
2024-04-01      1051.318970
2024-05-01      1057.867798
2024-06-01      1062.394287
2024-07-01      1065.524536
2024-08-01      1067.690063
2024-09-01      1069.188599
2024-10-01      1070.224121
2024-11-01      1070.937744
