In [1]:
import numpy as np
import pandas as pd
import pickle as pckl
import torch as torch
from torch import nn as nn
from torch import optim as optim
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import joblib

In [2]:
torch.manual_seed(42)
np.random.seed(42)

In [3]:
df = pd.read_csv('data_fixed.csv', delimiter = ';')
df['Timestamp'] = pd.to_datetime(df['Timestamp']) 

In [4]:
class autoencoder(nn.Module):
    def __init__(self, input_shape, encoding_dim):
        super(autoencoder, self).__init__()

        self.encode = nn.Sequential(
            nn.Linear(input_shape, 64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            nn.Linear(64, 32),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            nn.Linear(32, encoding_dim),
            )

        self.decode = nn.Sequential(
            nn.Linear(encoding_dim, 32),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            nn.Linear(32, 64),
            nn.LeakyReLU(0.1),
            nn.Dropout(0.1),
            nn.Linear(64, input_shape)
            )

    def forward(self, x):
        x = self.encode(x)
        x = self.decode(x)
        return x

In [5]:
def fix_timeseries(df):
    df_new = df.copy()

    df_new['hour'] = df['Timestamp'].dt.hour
    df_new['minute'] = df['Timestamp'].dt.minute
    df_new['second'] = df['Timestamp'].dt.second

    df_new['sin_h'] = np.sin(2* np.pi * df_new['hour']/24)
    df_new['cos_h'] = np.cos(2* np.pi * df_new['hour']/24)
    df_new['sin_m'] = np.sin(2* np.pi * df_new['minute']/60)
    df_new['cos_m'] = np.cos(2* np.pi * df_new['minute']/60)
    df_new['sin_s'] = np.sin(2* np.pi * df_new['second']/60)
    df_new['cos_s'] = np.cos(2* np.pi * df_new['second']/60)
    
    return df_new

In [6]:
def prep_data(df, batch_size = 50):
    df_copy = df.copy()

    df_with_time = fix_timeseries(df_copy)
    timestamps = df_with_time['Timestamp'].values
    
    numeric_df = df_with_time.copy()
    error_values = ['#NAMN', '#NAMN?', '#NAME?', '#DIV/0!', '#N/A', '#NULL!', '#NUM!', '#REF!', '#VALUE!']
    
    for col in numeric_df.columns:
        
        if numeric_df[col].dtype == 'object' or numeric_df[col].astype(str).str.contains('#').any():
            numeric_df[col] = numeric_df[col].astype(str).replace(error_values, '1e9')
            numeric_df[col] = pd.to_numeric(numeric_df[col], errors='coerce')
    
    
    for col in numeric_df.columns:
        if pd.api.types.is_numeric_dtype(numeric_df[col]):
            valid_values = numeric_df[col][~np.isinf(numeric_df[col])]
            if len(valid_values) > 0:
                max_value = valid_values.max()
                numeric_df[col] = numeric_df[col].replace([np.inf, -np.inf], max_value)
    numeric_df = numeric_df.fillna(numeric_df.mean(numeric_only=True))
    
    numeric_df = numeric_df.drop(['Timestamp', 'hour', 'minute', 'second'], axis=1, errors='ignore')
    
    feature_names = numeric_df.columns.tolist()
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(numeric_df)
    test_ratio = 0.2
    train_size = int((1 - test_ratio) * len(data_scaled))
    X_train = data_scaled[:train_size]
    X_test = data_scaled[train_size:]
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    X_train_tensor = torch.FloatTensor(X_train).to(device)
    X_test_tensor = torch.FloatTensor(X_test).to(device)

    train_dataset = TensorDataset(X_train_tensor, X_train_tensor)
    test_dataset = TensorDataset(X_test_tensor, X_test_tensor)
    
    train_loader = DataLoader(train_dataset, batch_size = batch_size)
    test_loader = DataLoader(test_dataset, batch_size = batch_size)

    return train_loader, timestamps,feature_names,test_loader,scaler

In [7]:
def train_autoencoder(epochs, model, train_loader, test_loader=None, weight_decay=1e-4, early_stop_patience=10):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    model = model.to(device)
    
   
    optimizer = optim.Adam(model.parameters(), lr=0.0001, weight_decay=weight_decay)
    criterion = nn.MSELoss()
    
    best_loss = float('inf')
    patience_counter = 0
    best_model_state = None
    
    train_losses = []
    val_losses = []
    
    for epoch in range(epochs):
        
        running_loss = 0.0
        batch_count = 0
        
        model.train() 
        data_iterator = iter(train_loader)
        has_batches = False
        
        while True:
            try:
                data = next(data_iterator)
                has_batches = True
                inputs, targets = data
                
                inputs = inputs.to(device)
                targets = targets.to(device)

                optimizer.zero_grad()
                outputs = model(inputs)

                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
                batch_count += 1
                
            except StopIteration:
                break
            except Exception as e:
                print(f"Error processing batch: {str(e)}")
                break
        
        if not has_batches:
            print(f"Epoch {epoch+1}/{epochs}: No data was processed! DataLoader is empty.")
            break
        
        if batch_count > 0:
            epoch_loss = running_loss / batch_count
            train_losses.append(epoch_loss)
            print(f"Epoch {epoch+1}/{epochs}, Train Loss: {epoch_loss:.6f}, Batches: {batch_count}")
        else:
            print(f"Epoch {epoch+1}/{epochs}: Processed 0 batches.")
            break

        if test_loader is not None:
            model.eval() 
            val_loss = 0.0
            val_batch_count = 0
            
            with torch.no_grad(): 
                for val_inputs, val_targets in test_loader:
                    val_inputs = val_inputs.to(device)
                    val_targets = val_targets.to(device)
                    
                    val_outputs = model(val_inputs)
                    batch_loss = criterion(val_outputs, val_targets)
                    
                    val_loss += batch_loss.item()
                    val_batch_count += 1
            
            if val_batch_count > 0:
                avg_val_loss = val_loss / val_batch_count
                val_losses.append(avg_val_loss)
                print(f"Validation Loss: {avg_val_loss:.6f}")
                
                if avg_val_loss < best_loss:
                    best_loss = avg_val_loss
                    patience_counter = 0
                    
                    best_model_state = model.state_dict().copy()
                else:
                    patience_counter += 1
                    if patience_counter >= early_stop_patience:
                        print(f"Early stopping triggered after {epoch+1} epochs")
                        
                        if best_model_state is not None:
                            model.load_state_dict(best_model_state)
                        break
    if test_loader is not None and best_model_state is not None:
        model.load_state_dict(best_model_state)
    
    return model

In [8]:
input_dim = 12
batch_size = 50
epoch = 100
encoding_dim = 10

In [9]:
train_loader, timestamps,feature_names,test_loader,scaler= prep_data(df, batch_size)

In [10]:
model = autoencoder(input_dim, encoding_dim)

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [12]:
trained_model = train_autoencoder(epoch, model, train_loader,test_loader, weight_decay = 1e-4, early_stop_patience = 10)

Epoch 1/100, Train Loss: 0.935381, Batches: 846
Validation Loss: 1.077201
Epoch 2/100, Train Loss: 0.705277, Batches: 846
Validation Loss: 0.963068
Epoch 3/100, Train Loss: 0.546224, Batches: 846
Validation Loss: 0.931332
Epoch 4/100, Train Loss: 0.491398, Batches: 846
Validation Loss: 0.916885
Epoch 5/100, Train Loss: 0.456508, Batches: 846
Validation Loss: 0.902183
Epoch 6/100, Train Loss: 0.419543, Batches: 846
Validation Loss: 0.868128
Epoch 7/100, Train Loss: 0.384434, Batches: 846
Validation Loss: 0.836574
Epoch 8/100, Train Loss: 0.361428, Batches: 846
Validation Loss: 0.819335
Epoch 9/100, Train Loss: 0.343237, Batches: 846
Validation Loss: 0.801155
Epoch 10/100, Train Loss: 0.326024, Batches: 846
Validation Loss: 0.771740
Epoch 11/100, Train Loss: 0.309534, Batches: 846
Validation Loss: 0.744127
Epoch 12/100, Train Loss: 0.295630, Batches: 846
Validation Loss: 0.720973
Epoch 13/100, Train Loss: 0.282658, Batches: 846
Validation Loss: 0.705522
Epoch 14/100, Train Loss: 0.270327

In [28]:
filename = 'autoe_model.pth'
scala = 'autoencoderscaler.sav'
torch.save(trained_model.state_dict(), filename)
joblib.dump(scaler,scala)

['autoencoderscaler.sav']