<a href="https://colab.research.google.com/github/torrhen/cable-temperature-prediction/blob/master/cable_temperature_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# data hosted in public repository
DATA_CSV_URL = "https://raw.githubusercontent.com/torrhen/cable-temperature-prediction/master/cable.csv"

In [2]:
import os
# create new project folder
os.makedirs('cable_temperature_prediction', exist_ok=True)

In [3]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

## cable_temperature_prediction/data.py

In [4]:
#%writefile cable_temperature_prediction/data.py
import pandas as pd
import datetime as dt
import numpy as np
import torch

STANDARD_DEVIATION_OUTLIER_THRESHOLD = 2
ROUNDING_PRECISION = 3

# create dataframe from raw data taken from GitHub repository
def create_dataframe(url):
  # set the timestamp column as the index of the dataframe
  df = pd.read_csv(url, index_col=0, parse_dates=[0], infer_datetime_format=True)
  # resample data to ensure no missing timestamps
  df = df.resample("5min").mean()
  # replace all nan values using the last valid observation for every column
  df = df.pad() # pad() is equivalent to fillna(method="ffill")
  df = df.round(ROUNDING_PRECISION)
  # remove values outside 2 SD of the mean of each calendar month
  df = remove_outliers(df)

  return df

# group data by calendar month and replace values outside 2 SD of the mean with the last valid observed value for every column
def remove_outliers(df):
  # add new column string the integer month of each timestamp
  df["month"] = df.index.month

  # replace data points beyond 2 SD of the mean for each calendar month with np.nan
  def remove_data(group, std):
    group[np.abs(group - group.mean()) > std * group.std()] = np.nan
    return group

  transformed_df = df.groupby("month", as_index=False).transform(lambda x: remove_data(x, STANDARD_DEVIATION_OUTLIER_THRESHOLD))
  # fill empty data with the last valid observation for all columns
  transformed_df = transformed_df.fillna(method="ffill").round(ROUNDING_PRECISION)
  # remove integer month column
  df.drop(columns=["month"], inplace=True)

  return transformed_df


In [5]:
# create dataframe from data downloaded from GitHUb
df = create_dataframe(DATA_CSV_URL)

In [6]:
test_df = df[:20]
test_df
#a = test_df['Thermocouple 7'].shift(5)
#a

Unnamed: 0_level_0,Thermocouple 1,Thermocouple 2,Thermocouple 3,Thermocouple 4,Thermocouple 5,Thermocouple 6,Thermocouple 7,Load Current (Blue),Load Current (Yellow),Load Current (Red),Air Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2017-01-01 00:00:00,14.85,15.1,15.26,16.43,16.29,17.22,18.34,38.9,48.42,46.64,10.9
2017-01-01 00:05:00,14.85,15.09,15.26,16.42,16.29,17.21,18.34,39.14,48.8,47.02,10.88
2017-01-01 00:10:00,14.85,15.09,15.26,16.42,16.29,17.22,18.34,39.25,48.53,46.77,10.87
2017-01-01 00:15:00,14.84,15.09,15.26,16.42,16.28,17.2,18.34,38.71,46.77,45.48,10.86
2017-01-01 00:20:00,14.85,15.09,15.26,16.42,16.29,17.21,18.34,38.89,47.1,45.66,10.85
2017-01-01 00:25:00,14.82,15.07,15.23,16.39,16.26,17.19,18.31,38.84,47.19,45.1,10.81
2017-01-01 00:30:00,14.82,15.07,15.23,16.4,16.26,17.18,18.3,39.48,47.55,45.34,10.79
2017-01-01 00:35:00,14.82,15.07,15.23,16.39,16.27,17.18,18.31,39.48,46.87,44.93,10.78
2017-01-01 00:40:00,14.82,15.07,15.23,16.4,16.26,17.18,18.31,40.34,46.71,43.72,10.77
2017-01-01 00:45:00,14.82,15.07,15.22,16.39,16.26,17.18,18.3,38.91,46.84,43.83,10.74


In [7]:
y = test_df.pop('Thermocouple 7')
y

Timestamp
2017-01-01 00:00:00    18.34
2017-01-01 00:05:00    18.34
2017-01-01 00:10:00    18.34
2017-01-01 00:15:00    18.34
2017-01-01 00:20:00    18.34
2017-01-01 00:25:00    18.31
2017-01-01 00:30:00    18.30
2017-01-01 00:35:00    18.31
2017-01-01 00:40:00    18.31
2017-01-01 00:45:00    18.30
2017-01-01 00:50:00    18.30
2017-01-01 00:55:00    18.30
2017-01-01 01:00:00    18.29
2017-01-01 01:05:00    18.31
2017-01-01 01:10:00    18.30
2017-01-01 01:15:00    18.29
2017-01-01 01:20:00    18.29
2017-01-01 01:25:00    18.29
2017-01-01 01:30:00    18.28
2017-01-01 01:35:00    18.28
Freq: 5T, Name: Thermocouple 7, dtype: float64

In [8]:
y = y[5:]
y

Timestamp
2017-01-01 00:25:00    18.31
2017-01-01 00:30:00    18.30
2017-01-01 00:35:00    18.31
2017-01-01 00:40:00    18.31
2017-01-01 00:45:00    18.30
2017-01-01 00:50:00    18.30
2017-01-01 00:55:00    18.30
2017-01-01 01:00:00    18.29
2017-01-01 01:05:00    18.31
2017-01-01 01:10:00    18.30
2017-01-01 01:15:00    18.29
2017-01-01 01:20:00    18.29
2017-01-01 01:25:00    18.29
2017-01-01 01:30:00    18.28
2017-01-01 01:35:00    18.28
Freq: 5T, Name: Thermocouple 7, dtype: float64

In [9]:
x = test_df
x

Unnamed: 0_level_0,Thermocouple 1,Thermocouple 2,Thermocouple 3,Thermocouple 4,Thermocouple 5,Thermocouple 6,Load Current (Blue),Load Current (Yellow),Load Current (Red),Air Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-01 00:00:00,14.85,15.1,15.26,16.43,16.29,17.22,38.9,48.42,46.64,10.9
2017-01-01 00:05:00,14.85,15.09,15.26,16.42,16.29,17.21,39.14,48.8,47.02,10.88
2017-01-01 00:10:00,14.85,15.09,15.26,16.42,16.29,17.22,39.25,48.53,46.77,10.87
2017-01-01 00:15:00,14.84,15.09,15.26,16.42,16.28,17.2,38.71,46.77,45.48,10.86
2017-01-01 00:20:00,14.85,15.09,15.26,16.42,16.29,17.21,38.89,47.1,45.66,10.85
2017-01-01 00:25:00,14.82,15.07,15.23,16.39,16.26,17.19,38.84,47.19,45.1,10.81
2017-01-01 00:30:00,14.82,15.07,15.23,16.4,16.26,17.18,39.48,47.55,45.34,10.79
2017-01-01 00:35:00,14.82,15.07,15.23,16.39,16.27,17.18,39.48,46.87,44.93,10.78
2017-01-01 00:40:00,14.82,15.07,15.23,16.4,16.26,17.18,40.34,46.71,43.72,10.77
2017-01-01 00:45:00,14.82,15.07,15.22,16.39,16.26,17.18,38.91,46.84,43.83,10.74


In [10]:
x = test_df[:-1]
x

Unnamed: 0_level_0,Thermocouple 1,Thermocouple 2,Thermocouple 3,Thermocouple 4,Thermocouple 5,Thermocouple 6,Load Current (Blue),Load Current (Yellow),Load Current (Red),Air Temperature
Timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2017-01-01 00:00:00,14.85,15.1,15.26,16.43,16.29,17.22,38.9,48.42,46.64,10.9
2017-01-01 00:05:00,14.85,15.09,15.26,16.42,16.29,17.21,39.14,48.8,47.02,10.88
2017-01-01 00:10:00,14.85,15.09,15.26,16.42,16.29,17.22,39.25,48.53,46.77,10.87
2017-01-01 00:15:00,14.84,15.09,15.26,16.42,16.28,17.2,38.71,46.77,45.48,10.86
2017-01-01 00:20:00,14.85,15.09,15.26,16.42,16.29,17.21,38.89,47.1,45.66,10.85
2017-01-01 00:25:00,14.82,15.07,15.23,16.39,16.26,17.19,38.84,47.19,45.1,10.81
2017-01-01 00:30:00,14.82,15.07,15.23,16.4,16.26,17.18,39.48,47.55,45.34,10.79
2017-01-01 00:35:00,14.82,15.07,15.23,16.39,16.27,17.18,39.48,46.87,44.93,10.78
2017-01-01 00:40:00,14.82,15.07,15.23,16.4,16.26,17.18,40.34,46.71,43.72,10.77
2017-01-01 00:45:00,14.82,15.07,15.22,16.39,16.26,17.18,38.91,46.84,43.83,10.74


In [11]:
print(len(x))
print(len(y))

19
15


In [12]:
from torch.utils.data import Dataset

class CableDataset(Dataset):
  def __init__(self, df, seq_len):
    self.dataframe = df
    self.seq_len = seq_len
    self.targets = self.create_targets(self.dataframe, self.seq_len) # torch.from_numpy(self.dataframe.pop('Thermocouple 7').values).type(torch.float32)
    self.data = self.create_data(self.dataframe, self.seq_len) # self.create_sequential_data(self.dataframe, self.seq_len)

  def create_data(self, df, seq_len):
    data_df = df
    data_df = data_df[:-1]
    data_tensor = self.create_sliding_window(data_df, seq_len)
    return data_tensor

  def create_targets(self, df, seq_len):
    targets_df = df.pop('Thermocouple 7')
    targets_df = targets_df[seq_len:]
    targets_tensor = torch.from_numpy(targets_df.values).type(torch.float32)
    return targets_tensor

  # creating features dataset
  def create_sliding_window(self, df, seq_len):
    column_tensors = []
    def generate_sliding_window(df_column, seq_len):
      window_data = []
      df_column.rolling(seq_len).apply((lambda x: window_data.append(torch.from_numpy(x.values)) or 0), raw=False)
      return torch.cat(window_data).reshape(-1, seq_len)

    for col in df.columns:
      tensor = generate_sliding_window(df[col], seq_len)
      column_tensors.append(tensor)

    return torch.cat(column_tensors, dim=1).reshape(-1, len(df.columns), seq_len).permute(0, 2, 1).type(torch.float32)

  # override
  def __len__(self):
    return len(self.data)

  # override
  def __getitem__(self, idx):
    data = self.data[idx]
    target = self.targets[idx]
    return data, target

In [13]:
train_df = df[:20000]
print(len(train_df))

20000


In [14]:
test_df = df[20000:25000]
print(len(test_df))

5000


In [15]:
train_data = CableDataset(train_df, 10)
test_data = CableDataset(test_df, 10)

print(train_data.data.shape)
print(train_data.targets.shape)
print(test_data.data.shape)
print(test_data.targets.shape)
print(train_data[2][0])
print(train_data[2][1])

torch.Size([19990, 10, 10])
torch.Size([19990])
torch.Size([4990, 10, 10])
torch.Size([4990])
tensor([[14.8500, 15.0900, 15.2600, 16.4200, 16.2900, 17.2200, 39.2500, 48.5300,
         46.7700, 10.8700],
        [14.8400, 15.0900, 15.2600, 16.4200, 16.2800, 17.2000, 38.7100, 46.7700,
         45.4800, 10.8600],
        [14.8500, 15.0900, 15.2600, 16.4200, 16.2900, 17.2100, 38.8900, 47.1000,
         45.6600, 10.8500],
        [14.8200, 15.0700, 15.2300, 16.3900, 16.2600, 17.1900, 38.8400, 47.1900,
         45.1000, 10.8100],
        [14.8200, 15.0700, 15.2300, 16.4000, 16.2600, 17.1800, 39.4800, 47.5500,
         45.3400, 10.7900],
        [14.8200, 15.0700, 15.2300, 16.3900, 16.2700, 17.1800, 39.4800, 46.8700,
         44.9300, 10.7800],
        [14.8200, 15.0700, 15.2300, 16.4000, 16.2600, 17.1800, 40.3400, 46.7100,
         43.7200, 10.7700],
        [14.8200, 15.0700, 15.2200, 16.3900, 16.2600, 17.1800, 38.9100, 46.8400,
         43.8300, 10.7400],
        [14.8200, 15.0700, 15.2300

In [16]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=32, shuffle=False)

## cable_temperature_prediction/models.py

In [17]:
from torch import nn

class RecurrentNeuralNetwork(nn.Module):
  def __init__(self, input_size, hidden_size, output_size, num_layers):
    super().__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.output_size = output_size
    self.rnn = nn.RNN(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.num_layers, nonlinearity='relu', batch_first=True)
    self.block = nn.Sequential(
        nn.Flatten(),
        nn.Linear(in_features=input_size*hidden_size, out_features=1),
    )

  def forward(self, x):
    batch_size = x.shape[0]
    hidden = torch.randn(self.num_layers, batch_size, self.hidden_size).to(device)
    output, hidden = self.rnn(x, hidden)

    #print(output.shape)
    #output = output.view(-1, self.hidden_size)
    output = self.block(output)

    #print(output)
    return output, hidden
  

In [18]:
!pip install torchinfo

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [19]:
from torchinfo import summary

In [20]:
model_0 = RecurrentNeuralNetwork(10, 20, 1, 1).to(device)

summary(model_0, input_size=(32, 10, 10), col_names=['input_size', 'output_size', 'trainable'])

Layer (type:depth-idx)                   Input Shape               Output Shape              Trainable
RecurrentNeuralNetwork                   [32, 10, 10]              [32, 1]                   True
├─RNN: 1-1                               [32, 10, 10]              [32, 10, 20]              True
├─Sequential: 1-2                        [32, 10, 20]              [32, 1]                   True
│    └─Flatten: 2-1                      [32, 10, 20]              [32, 200]                 --
│    └─Linear: 2-2                       [32, 200]                 [32, 1]                   True
Total params: 841
Trainable params: 841
Non-trainable params: 0
Total mult-adds (M): 0.21
Input size (MB): 0.01
Forward/backward pass size (MB): 0.05
Params size (MB): 0.00
Estimated Total Size (MB): 0.07

## cable_temperature_prediction/train.py

In [21]:
# train step
def train_step(model, dataloader, loss_fn, optimizer, device):
  train_loss = 0.0
  # training mode
  model.train()
  for batch, (X, y) in enumerate(dataloader):
    # allocate data to device
    X, y = X.to(device), y.to(device)
    # forward pass
    output, hidden = model(X)
    # calculate loss
    loss = loss_fn(output, y)
    train_loss += loss
    # prevent accumulation of gradients
    optimizer.zero_grad()
    # backpropagation
    loss.backward()
    # gradient descent update
    optimizer.step()
  
  train_loss /= len(dataloader)

  return train_loss

In [22]:
# test step
def test_step(model, dataloader, loss_fn, device):
  test_loss = 0.0
  # evaluation mode
  model.eval()
  with torch.inference_mode():
    for batch, (X, y) in enumerate(dataloader):
      # allocate data to device
      X, y = X.to(device), y.to(device)
      # forward pass
      output, hidden = model(X)
      # calculate loss
      loss = loss_fn(output, y)
      test_loss += loss
      
    test_loss /= len(dataloader)
    #print(test_loss)
  
  return test_loss

In [23]:
# train and test model
def train(model, train_loader, test_loader, loss_fn, optimizer, device, epochs=10):
  results = {'train_loss':[], 'test_loss':[]}

  for epoch in range(epochs):
    train_loss = train_step(model, train_loader, loss_fn, optimizer, device)
    test_loss = test_step(model, test_loader, loss_fn, device)

    print(f"Epoch: {epoch + 1} | Train Loss: {train_loss:.5f} | Test Loss: {test_loss:.5f}")

    results['train_loss'].append(train_loss)
    results['test_loss'].append(test_loss)

  return results

In [24]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(params=model_0.parameters(), lr=1e-2)

In [25]:
model_0_results = train(model_0, train_dataloader, test_dataloader, loss_fn, optimizer, device)

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch: 1 | Train Loss: 45.62223 | Test Loss: 2.07735
Epoch: 2 | Train Loss: 38.79952 | Test Loss: 8.14022
Epoch: 3 | Train Loss: 37.60245 | Test Loss: 7.50932
Epoch: 4 | Train Loss: 36.87085 | Test Loss: 4.78639
Epoch: 5 | Train Loss: 37.18493 | Test Loss: 3.26544
Epoch: 6 | Train Loss: 36.74512 | Test Loss: 6.21570
Epoch: 7 | Train Loss: 36.88449 | Test Loss: 4.64607
Epoch: 8 | Train Loss: 36.65909 | Test Loss: 14.71825
Epoch: 9 | Train Loss: 37.02802 | Test Loss: 12.18341
Epoch: 10 | Train Loss: 36.62819 | Test Loss: 3.99554


In [26]:
predictions = [] # store predictions

model_0.eval()
with torch.inference_mode():
  for X, y in test_dataloader:
    # allocate to device
    X, y = X.to(device), y.to(device)
    # calculate logits
    output, hidden = model_0(X)
    # calculate int labels
    #y_labels = torch.argmax(torch.softmax(y_logits, dim=1), dim=1)

    predictions.append(output.cpu())
    
# create tensor using list of predictions
y_predictions = torch.cat(predictions)

In [27]:
y_predictions.shape

torch.Size([4990, 1])

In [28]:
y_predictions[:10]

tensor([[17.4868],
        [17.9934],
        [17.9963],
        [17.5996],
        [18.0600],
        [17.2229],
        [17.9543],
        [17.5153],
        [17.4415],
        [18.0616]])

In [30]:
test_data.targets[:10]

tensor([25.7300, 25.7300, 25.7200, 25.7300, 25.7300, 25.7400, 25.7300, 25.7300,
        25.7400, 25.7300])

In [29]:
# import matplotlib.pyplot as plt
# from cable_temperature_prediction import data

# df = data.create_dataframe(DATA_CSV_URL)

# # plot the data before and after the removal of outliers for a specific column
# column_name = 'Load Current (Red)'
# fig, ax = plt.subplots(2, 1, figsize=(20,5), sharex=True, sharey=True)
# df[column_name].plot(ax=ax[0], kind='line', alpha=0.8)
# ax[0].set_title("Before removal of outliers")
# tr_df = data.remove_outliers(df)
# tr_df[column_name].plot(ax=ax[1], kind='line', alpha=0.8)
# ax[1].set_title("After removal of outliers")
# plt.show()