# Classic and Deep Learning Time Series Forecasting

## ARIMA/SARIMA Example

In [58]:
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.statespace.sarimax import SARIMAX
import matplotlib.pyplot as plt

In [None]:
path = '~/.cache/kagglehub/datasets/chirag19/air-passengers/versions/1/AirPassengers.csv'

data = pd.read_csv(path)

data.head()

In [None]:
# Load the dataset
# from statsmodels.datasets.airline import load_pandas
#data = load_pandas().data
data['Month'] = pd.date_range(start='1949-01', periods=len(data), freq='ME')
data.set_index('Month', inplace=True)

data.head()

In [None]:

# Log transform to stabilize variance
data['Log_Passengers'] = np.log(data['#Passengers'])

data.head()

In [None]:
# Seasonal decomposition
decomposition = seasonal_decompose(data['Log_Passengers'], model='additive')
decomposition.plot()
plt.show()

In [None]:
# SARIMA model
model = SARIMAX(data['Log_Passengers'], 
                order=(1, 1, 1), 
                seasonal_order=(1, 1, 1, 12), 
                freq='ME')
results = model.fit()

# Summary and diagnostics
print(results.summary())
results.plot_diagnostics(figsize=(15, 10))
plt.show()

In [None]:

# Forecasting
forecast = results.get_forecast(steps=24)
forecast_index = pd.date_range(data.index[-1] + pd.DateOffset(months=1), periods=24, freq='ME')
forecast_values = np.exp(forecast.predicted_mean)  # Convert back from log
confidence_intervals = np.exp(forecast.conf_int())

# Plot
plt.figure(figsize=(10, 6))
plt.plot(data['#Passengers'], label='Observed')
plt.plot(forecast_index, forecast_values, label='Forecast', color='red')
plt.fill_between(forecast_index, confidence_intervals.iloc[:, 0], confidence_intervals.iloc[:, 1], color='pink', alpha=0.3)
plt.legend()
plt.show()

## Neural Network Example -- TensorFlow/Keras

Dataset: Energy Consumption Dataset (available via UCI Machine Learning Repository)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM

In [None]:

# Load dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"
data = pd.read_csv(url)
data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)
data = data['Appliances'].resample('H').mean().fillna(method='ffill')  # Resample and fill missing

# Normalize data
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data.values.reshape(-1, 1))

# Prepare data for LSTM
def create_sequences(data, seq_length):
    X, y = [], []
    for i in range(len(data) - seq_length):
        X.append(data[i:i + seq_length])
        y.append(data[i + seq_length])
    return np.array(X), np.array(y)

seq_length = 24
X, y = create_sequences(data_scaled, seq_length)
X_train, X_test = X[:int(len(X) * 0.8)], X[int(len(X) * 0.8):]
y_train, y_test = y[:int(len(y) * 0.8)], y[int(len(y) * 0.8):]

# LSTM model
model = Sequential([
    LSTM(50, activation='relu', input_shape=(seq_length, 1)),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# Train model
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=20, batch_size=32, verbose=1)

# Evaluate and predict
predictions = model.predict(X_test)
predictions_rescaled = scaler.inverse_transform(predictions)
y_test_rescaled = scaler.inverse_transform(y_test.reshape(-1, 1))

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(y_test_rescaled, label='True Values')
plt.plot(predictions_rescaled, label='Predicted Values', alpha=0.7)
plt.legend()
plt.show()

## Neural Network Example -- PyTorch

Dataset: Energy Consumption Dataset (UCI Machine Learning Repository)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

Load dataset from https://archive.ics.uci.edu/dataset/374/appliances+energy+prediction

In [2]:
import os

file_path = 'energydata_complete.csv'
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00374/energydata_complete.csv"

if os.path.exists(file_path):
    data = pd.read_csv(file_path)
else:
    data = pd.read_csv(url)
    data.to_csv(file_path, index=False)

data.head()


Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [36]:
# Save the dataframe locally if it doesn't exist
if not os.path.exists(file_path):
    data.to_csv(file_path, index=False)

### Column Descriptions

| Column | Description |
| ---- | ----------- |
| date | time year-month-day hour:minute:second |
| Appliances | energy use in Wh |
| lights | energy use of light fixtures in the house in Wh |
| T1 | Temperature in kitchen area, in Celsius |
| RH_1 | Humidity in kitchen area, in % |
| T2 | Temperature in living room area, in Celsius |
| RH_2 | Humidity in living room area, in % |
| T3 | Temperature in laundry room area |
| RH_3 | Humidity in laundry room area, in % |
| T4 | Temperature in office room, in Celsius |
| RH_4 | Humidity in office room, in % |
| T5 | Temperature in bathroom, in Celsius |
| RH_5 | Humidity in bathroom, in % |
| T6 | Temperature outside the building (north side), in Celsius |
| RH_6 | Humidity outside the building (north side), in % |
| T7 | Temperature in ironing room , in Celsius |
| RH_7 | Humidity in ironing room, in % |
| T8 | Temperature in teenager room 2, in Celsius |
| RH_8 | Humidity in teenager room 2, in % |
| T9 | Temperature in parents room, in Celsius |
| RH_9 | Humidity in parents room, in % |
| To | Temperature outside (from Chievres weather station), in Celsius |
| Pressure | (from Chievres weather station), in mm Hg |
| RH_out | Humidity outside (from Chievres weather station), in % |
| Wind speed | (from Chievres weather station), in m/s |
| Visibility | (from Chievres weather station), in km |
| Tdewpoint | (from Chievres weather station), Â°C |
| rv1 | Random variable 1, nondimensional |
| rv2 | Random variable 2, nondimensional |

Where indicated, hourly data (then interpolated) from the nearest airport weather station (Chievres Airport, Belgium) was downloaded from a public data set from Reliable Prognosis, rp5.ru. Permission was obtained from Reliable Prognosis for the distribution of the 4.5 months of weather data.

In [None]:
data.info()

In [3]:

data['date'] = pd.to_datetime(data['date'])
data.set_index('date', inplace=True)

data.head()


Unnamed: 0_level_0,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,...,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,...,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389
2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,...,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


We're interested in the `Appliances` column, which is the energy use of the appliances in Wh. 

First, we'll resample the data to hourly resolution and fill missing values using the forward fill method.

In [4]:
data = data['Appliances'].resample('h').mean().fillna(method='ffill')  # Resample and fill missing

data.head()

  data = data['Appliances'].resample('h').mean().fillna(method='ffill')  # Resample and fill missing


date
2016-01-11 17:00:00     55.000000
2016-01-11 18:00:00    176.666667
2016-01-11 19:00:00    173.333333
2016-01-11 20:00:00    125.000000
2016-01-11 21:00:00    103.333333
Freq: h, Name: Appliances, dtype: float64

Scale the values to be between 0 and 1 and convert to a numpy array.

In [5]:
# Normalize data
scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data.values.reshape(-1, 1))

print(type(data_scaled))
print(data_scaled.shape)


<class 'numpy.ndarray'>
(3290, 1)


In [6]:

# Prepare data for LSTM
class TimeSeriesDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, index):
        X = self.data[index:index + self.seq_length]
        y = self.data[index + self.seq_length]
        return torch.tensor(X, dtype=torch.float32), torch.tensor(y, dtype=torch.float32)


In [7]:

seq_length = 24
dataset = TimeSeriesDataset(data_scaled, seq_length)

print(len(dataset))

3266


In [8]:

# Split data into training and testing
train_size = int(len(dataset) * 0.8)
test_size = len(dataset) - train_size

train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(len(train_loader))
print(len(test_loader))


82
21


In [None]:
# let's look at the first batch
for X, y in train_loader:
    print(X.shape)
    print(y.shape)
    break


In [9]:

# Define the LSTM model
class LSTMModel(nn.Module):
    def __init__(self, input_size=1, hidden_size=50, output_size=1):
        super(LSTMModel, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x, _ = self.lstm(x)
        x = self.fc(x[:, -1, :])  # Use the output of the last time step
        return x


In [10]:
model = LSTMModel()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [None]:
# Train the model
epochs = 20
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for X, y in train_loader:
        X = X.unsqueeze(-1)  # Add input dimension
        y = y.unsqueeze(-1)  # Add target dimension

        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss/len(train_loader):.4f}")


In [None]:

# Evaluate the model
model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for X, y in test_loader:
        X = X.unsqueeze(-1)
        y = y.unsqueeze(-1)
        preds = model(X)
        predictions.extend(preds.numpy())
        actuals.extend(y.numpy())


In [None]:

# Rescale predictions and actuals to original scale
predictions_rescaled = scaler.inverse_transform(predictions)
actuals_rescaled = scaler.inverse_transform(actuals)


In [None]:

# Plot results
plt.figure(figsize=(10, 6))
plt.plot(actuals_rescaled, label='True Values')
plt.plot(predictions_rescaled, label='Predicted Values', alpha=0.7)
plt.legend()
plt.show()

# Case Study and Discussion

## Real-world case study: Application of time series analysis

- **Case Study**: Let's explore a real-world case study where time series analysis is applied.
  - **Industry**: Choose an industry (e.g., finance, healthcare, retail).
  - **Problem Statement**: Define the problem that needs to be addressed using time series analysis.
  - **Data Collection**: Describe the data collection process and the type of data used.
  - **Model Selection**: Select appropriate time series models for the analysis.
  - **Analysis**: Perform the time series analysis and interpret the results.
  - **Outcome**: Discuss the outcomes and how the analysis helped in decision-making.
  
## Group discussion on potential projects or applications

- **Group Discussion**: Let's engage in a group discussion to brainstorm potential projects or applications of time series analysis.
  - **Project Ideas**: Share and discuss various project ideas that can benefit from time series analysis.
  - **Application Areas**: Identify different application areas such as finance, healthcare, retail, and more.
  - **Challenges**: Discuss the potential challenges and limitations of applying time series analysis in these projects.
  - **Collaboration**: Explore opportunities for collaboration and knowledge sharing within the group.