In [32]:
import numpy as np
import pandas as pd
import pickle as pckl
import torch as torch
from torch import nn as nn
from torch import optim as opt
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [2]:
torch.manual_seed(42)
np.random.seed(42)

In [3]:
df = pd.read_csv('data_fixed.csv', delimiter = ';')
df['Timestamp'] = pd.to_datetime(df['Timestamp']) 

In [4]:
class autoencoder(nn.Module):
    def __init__(self, input_shape, encoding_dim):
        super(autoencoder, self).__init__()

        self.encode = nn.Sequential(
            nn.Linear(input_shape, 128),
            nn.LeakyReLU(0.1),
            nn.Linear(128, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, encoding_dim),
            )

        self.decode = nn.Sequential(
            nn.Linear(encoding_dim, 64),
            nn.LeakyReLU(0.1),
            nn.Linear(64, 128),
            nn.LeakyReLU(0.1),
            nn.Linear(128, input_shape)
            )

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        return x

In [5]:
def fix_timeseries(df):
    df_new = df.copy()

    df_new['hour'] = df['Timestamp'].dt.hour
    df_new['minute'] = df['Timestamp'].dt.minute
    df_new['second'] = df['Timestamp'].dt.second

    df_new['sin_h'] = np.sin(2* np.pi * df_new['hour']/24)
    df_new['cos_h'] = np.cos(2* np.pi * df_new['hour']/24)
    df_new['sin_m'] = np.sin(2* np.pi * df_new['minute']/60)
    df_new['cos_m'] = np.cos(2* np.pi * df_new['minute']/60)
    df_new['sin_s'] = np.sin(2* np.pi * df_new['second']/60)
    df_new['cos_s'] = np.cos(2* np.pi * df_new['second']/60)
    print(df_new)
    return df_new

In [6]:
def prep_data(df, batch_size = 50):
    df_copy = df.copy()

    df_with_time = fix_timeseries(df_copy)
    timestamps = df_with_time['Timestamp'].values
    
    numeric_df = df_with_time.copy()
    for col in df.columns:
        max_value = numeric_df[col][numeric_df[col] != np.inf].max()
        numeric_df[col].replace([np.inf, -np.inf], max_value, inplace=True)
    numeric_df.fillna(numeric_df.mean(numeric_only=True), inplace=True)
    numeric_df = numeric_df.drop('Timestamp', axis = 1)
    
    feature_names = numeric_df.columns.tolist()
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(numeric_df)
    test_ratio = 0.2
    train_size = int((1 - test_ratio) * len(data_scaled))
    X_train = data_scaled[:train_size]
    X_test = data_scaled[train_size:]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)

    train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, X_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size = batch_size)
    test_loader = DataLoader(test_dataset, batch_size = batch_size)

    return train_loader, timestamps,feature_names,test_loader

In [9]:
input_dim = 15
batch_size = 50
epoch = 100
encoding_dim = 2

In [10]:
train_loader, timestamps,feature_names,test_loader= prep_data(df, batch_size)

                Timestamp  Temperature (°C)  Humidity (%)  Raw VOC  IR Light  \
0     2025-03-14 19:41:00             23.11         32.73       83       216   
1     2025-03-14 19:41:00             23.11         32.69       84       222   
2     2025-03-14 19:41:00             23.11         32.67       84       259   
3     2025-03-14 19:41:00             23.10         32.67       84       234   
4     2025-03-14 19:41:00             23.11         32.64       84       231   
...                   ...               ...           ...      ...       ...   
41738 2025-03-21 14:21:00             25.25         25.51      390     15847   
41739 2025-03-21 14:21:00             25.29         25.51      390     15467   
41740 2025-03-21 14:21:00             25.29         25.58      390     12129   
41741 2025-03-21 14:21:00             25.29         25.75      390     13109   
41742 2025-03-21 14:21:00             25.32         25.86      390      9780   

       Visible Light  CO2 (ppm)  hour  

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  numeric_df[col].replace([np.inf, -np.inf], max_value, inplace=True)


In [11]:
model = autoencoder(input_dim, encoding_dim)

In [12]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [13]:
for epoch in range(50):
    running_loss = 0.0
    batch_count = 0
    
    model.train()
    
    data_iterator = iter(train_loader)
    has_batches = False
    
    while True:
        try:
            data = next(data_iterator)
            has_batches = True
            inputs, _ = data
            inputs = inputs.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            batch_count += 1
        except StopIteration:
            break
        except Exception as e:
            print(f"Error processing batch: {str(e)}")
            break
    
    if not has_batches:
        print(f"Epoch {epoch+1}/50: No data was processed! DataLoader is empty.")
        break
    
    if batch_count > 0:
        epoch_loss = running_loss / batch_count
        print(f"Epoch {epoch+1}/50, Loss: {epoch_loss:.6f}, Batches: {batch_count}")
    else:
        print(f"Epoch {epoch+1}/50: Processed 0 batches.")
        break

Epoch 1/50, Loss: 0.389727, Batches: 668
Epoch 2/50, Loss: 0.413500, Batches: 668
Epoch 3/50, Loss: 0.401040, Batches: 668
Epoch 4/50, Loss: 0.374436, Batches: 668
Epoch 5/50, Loss: 0.365186, Batches: 668
Epoch 6/50, Loss: 0.356173, Batches: 668
Epoch 7/50, Loss: 0.348795, Batches: 668
Epoch 8/50, Loss: 0.326799, Batches: 668
Epoch 9/50, Loss: 0.322164, Batches: 668
Epoch 10/50, Loss: 0.312373, Batches: 668
Epoch 11/50, Loss: 0.310723, Batches: 668
Epoch 12/50, Loss: 0.298795, Batches: 668
Epoch 13/50, Loss: 0.286429, Batches: 668
Epoch 14/50, Loss: 0.282153, Batches: 668
Epoch 15/50, Loss: 0.274675, Batches: 668
Epoch 16/50, Loss: 0.277014, Batches: 668
Epoch 17/50, Loss: 0.260454, Batches: 668
Epoch 18/50, Loss: 0.252191, Batches: 668
Epoch 19/50, Loss: 0.242800, Batches: 668
Epoch 20/50, Loss: 0.235695, Batches: 668
Epoch 21/50, Loss: 0.222951, Batches: 668
Epoch 22/50, Loss: 0.223785, Batches: 668
Epoch 23/50, Loss: 0.226837, Batches: 668
Epoch 24/50, Loss: 0.227319, Batches: 668
E

In [34]:
filename = 'autoe_model.sav'
joblib.dump(model, filename)

['autoe_model.sav']