In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import rgb2hex
import torch

In [None]:
# Location of the measurements CSV (relative path).
file_path = 'Project/measures_v2.csv'

# Load the whole table; the bare `df` on the last line renders it as the cell output.
df = pd.read_csv(file_path)
df

In [None]:
# Sort the rows by measurement session (profile_id).

# 'mergesort' is a stable sort, so rows that share a 'profile_id' keep their
# original relative order. The sort is done in place on `df`; no separate
# sorted copy is created.
# ignore_index=True renumbers the rows 0..n-1 after sorting. Without it the
# index stays permuted, and later cells that rebuild a DataFrame from a NumPy
# array (which gets a fresh 0..n-1 index) would pair rows with the wrong
# profile_id / target through pandas' label-based alignment.
print(df.head(10))
df.sort_values(by='profile_id', kind='mergesort', inplace=True, ignore_index=True)
unique_values = df['profile_id'].unique()
print(unique_values)

In [None]:
# Discard the measurement columns that are not used in the rest of the notebook.
df.drop(columns=['stator_tooth', 'coolant', 'stator_winding', 'torque'], inplace=True)

In [None]:
# From here on, stator_yoke is referred to as stator_temp.
df.rename({'stator_yoke': 'stator_temp'}, axis='columns', inplace=True)

# Preview the first rows to verify the dropped and renamed columns.
print(df.head(5))

**Scaling the features**

In [None]:
# 'pm' is the prediction target; every other column is treated as an input feature.
target = df['pm']
features = df.drop(columns='pm')

# Min-max scale every feature column.
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed further down — confirm this is intended.
scaler = MinMaxScaler()
features_normalized = scaler.fit(features).transform(features)

In [None]:
# Wrap the scaled array in a DataFrame that keeps the row labels of `features`.
# The NumPy array has lost the index, while `df` / `target` still carry the
# index left over from sorting by profile_id. With a fresh 0..n-1 index, the
# label-aligned assignment below (and the later `target[group.index]` lookup)
# would pair each scaled row with another row's profile_id and target.
features_normalized_df = pd.DataFrame(
    features_normalized, columns=features.columns, index=features.index
)
# Put the raw (unscaled) profile_id back so rows can be grouped per session.
features_normalized_df['profile_id'] = df['profile_id']

**Model Input**

Preparing the input of our model. We want to feed the model a separate time series for each profile_id.
Because the cycles have different lengths, we pad each one with zeros at the end so that they all have the same length, and then flatten the resulting array.

In [None]:
# One group of rows per measurement session.
grouped = features_normalized_df.groupby('profile_id')

# Length of the longest session; every shorter one is padded up to it.
max_length = grouped.size().max()

padded_sequences = []
sequence_targets = []

for profile_key, profile_rows in grouped:
    profile_features = profile_rows.drop(columns='profile_id').to_numpy()
    n_missing = max_length - len(profile_rows)

    # Append all-zero rows after the real measurements.
    zero_rows = np.zeros((n_missing, profile_features.shape[1]))
    padded_sequences.append(np.concatenate([profile_features, zero_rows], axis=0))

    # Pad the matching targets with the same number of trailing zeros.
    profile_target = target[profile_rows.index]
    sequence_targets.append(np.pad(profile_target, (0, n_missing), mode='constant'))

# Stack into arrays with one entry per profile_id.
padded_sequences = np.array(padded_sequences)
sequence_targets = np.array(sequence_targets)

We create windows of 50 timesteps from the sequences of each profile_id, to generate more inputs for our model.

In [None]:
from sklearn.model_selection import train_test_split

# Split at the profile level: each measurement session goes entirely to either
# the training set or the test set.
unique_profile_ids = pd.unique(df['profile_id'])
train_profile_ids, test_profile_ids = train_test_split(
    unique_profile_ids,
    test_size=0.2,
    random_state=42,
)

def filter_sequences(sequences, targets, profile_ids, all_profile_ids):
    """Select the sequences (and their targets) that belong to the given profiles.

    Args:
        sequences: array indexed along axis 0 by profile, in the same order as
            ``all_profile_ids``.
        targets: array of targets, indexed along axis 0 like ``sequences``.
        profile_ids: iterable of the profile IDs to keep.
        all_profile_ids: the profile ID of every entry in ``sequences``.

    Returns:
        Tuple ``(sequences[selected], targets[selected])`` with the entries in
        their original order.

    Raises:
        ValueError: if ``all_profile_ids`` and ``sequences`` differ in length,
            since positions could then not be mapped to profiles reliably.
    """
    if len(all_profile_ids) != len(sequences):
        raise ValueError(
            f'all_profile_ids has {len(all_profile_ids)} entries but sequences has {len(sequences)}'
        )

    # A set gives O(1) membership tests; `in` on an ndarray or list rescans it
    # for every candidate ID.
    wanted = set(profile_ids)
    indices = [i for i, p_id in enumerate(all_profile_ids) if p_id in wanted]
    return sequences[indices], targets[indices]

# Padded sequences and targets of the training profiles.
train_data, train_targets = filter_sequences(
    sequences=padded_sequences,
    targets=sequence_targets,
    profile_ids=train_profile_ids,
    all_profile_ids=unique_profile_ids,
)
# Padded sequences and targets of the held-out test profiles.
test_data, test_targets = filter_sequences(
    sequences=padded_sequences,
    targets=sequence_targets,
    profile_ids=test_profile_ids,
    all_profile_ids=unique_profile_ids,
)

In [None]:
# Number of profiles (padded sequences) in each split.
print(f'the len of the train data is {len(train_data)}')
print(f'the len of the test data is {len(test_data)}')

In [None]:
from torch.utils.data import DataLoader, TensorDataset


def create_subsequences(data, target, sequence_length, step_size):
    """Cut fixed-length windows out of the data and pair each with the next target.

    All sequences in ``data`` are first concatenated into one long series
    (``target`` is flattened the same way), so a window may span the boundary
    between two consecutive sequences.

    Args:
        data: array whose last axis holds the features.
        target: array with one target value per time step of ``data``.
        sequence_length: number of time steps per window.
        step_size: offset between the starts of consecutive windows.

    Returns:
        Tuple ``(windows, next_targets)``: the stacked windows, and for each
        window the target at the time step right after its last row.
    """
    n_features = data.shape[-1]
    rows = data.reshape(-1, n_features)
    labels = target.flatten()

    # Stop early enough that the step following each window still exists.
    starts = range(0, len(rows) - sequence_length, step_size)
    windows = [rows[start:start + sequence_length] for start in starts]
    next_targets = [labels[start + sequence_length] for start in starts]

    return np.array(windows), np.array(next_targets)


# Window hyper-parameters: 50-step windows whose starts are 50 steps apart,
# i.e. consecutive windows do not overlap.
sequence_length = 50
step_size = 50
train_subsequences, train_sub_targets = create_subsequences(
    train_data,
    train_targets,
    sequence_length,
    step_size,
)

# Convert the NumPy windows and targets to float32 tensors.
train_data_tensor = torch.tensor(train_subsequences, dtype=torch.float32)
train_targets_tensor = torch.tensor(train_sub_targets, dtype=torch.float32)

# Serve the windows in their original order, 50 per mini-batch.
train_dataset = TensorDataset(train_data_tensor, train_targets_tensor)
train_loader = DataLoader(dataset=train_dataset, batch_size=50, shuffle=False)

**LSTM**

In [None]:
import torch.nn as nn

class LSTMModel(nn.Module):
    """Stacked LSTM followed by a linear layer applied to the last time step.

    Args:
        input_size: number of features per time step.
        hidden_size: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        output_size: number of values produced per input sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``x`` of shape (batch, seq_len, input_size) to (batch, output_size)."""
        state_shape = (self.num_layers, x.size(0), self.hidden_size)
        # Explicit all-zero initial hidden and cell states on the input's device.
        initial_hidden = torch.zeros(state_shape, device=x.device)
        initial_cell = torch.zeros(state_shape, device=x.device)

        sequence_out, _ = self.lstm(x, (initial_hidden, initial_cell))

        # Only the output at the final time step feeds the linear head.
        last_step = sequence_out[:, -1, :]
        return self.fc(last_step)

# Network dimensions; the input width is the size of the last axis of the
# padded training sequences.
input_size = train_data.shape[2]
hidden_size = 50
num_layers = 2
output_size = 1

model = LSTMModel(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    output_size=output_size,
)

# Mean-squared-error objective, optimised with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
# The model emits predictions of shape (batch, 1) while the targets coming out
# of the loader are (batch,). Passing them to MSELoss as-is broadcasts both to
# (batch, batch) and produces a wrong loss, so the predictions are reshaped to
# the targets' shape before the loss is computed.
num_epochs = 50
model.train()
for epoch in range(num_epochs):
    for inputs, targets in train_loader:

        outputs = model(inputs)
        loss = criterion(outputs.reshape(targets.shape), targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Reports the loss of the last mini-batch of the epoch.
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

**LSTM + CROSS VALIDATION**

In [None]:
from sklearn.model_selection import KFold

def filter_sequences(sequences, targets, profile_ids, all_profile_ids):
    """Select the sequences (and their targets) that belong to the given profiles.

    NOTE: this repeats the definition from the train/test split cell; it is
    kept here so the cross-validation cell has the helper in scope.

    Args:
        sequences: array indexed along axis 0 by profile, in the same order as
            ``all_profile_ids``.
        targets: array of targets, indexed along axis 0 like ``sequences``.
        profile_ids: iterable of the profile IDs to keep.
        all_profile_ids: the profile ID of every entry in ``sequences``.

    Returns:
        Tuple ``(sequences[selected], targets[selected])`` with the entries in
        their original order.

    Raises:
        ValueError: if ``all_profile_ids`` and ``sequences`` differ in length,
            since positions could then not be mapped to profiles reliably.
    """
    if len(all_profile_ids) != len(sequences):
        raise ValueError(
            f'all_profile_ids has {len(all_profile_ids)} entries but sequences has {len(sequences)}'
        )

    # A set gives O(1) membership tests; `in` on an ndarray or list rescans it
    # for every candidate ID.
    wanted = set(profile_ids)
    indices = [i for i, p_id in enumerate(all_profile_ids) if p_id in wanted]
    return sequences[indices], targets[indices]

# Cross-validation runs over profile IDs, so each measurement session falls
# entirely into either the training or the validation part of a fold.
n_folds = 11
kf = KFold(n_splits=n_folds)

# KFold yields positional indices; a plain list makes the ID lookup explicit.
train_profile_id_list = list(train_profile_ids)

# One validation loss per fold is collected here.
performance_metrics = []

for train_index, val_index in kf.split(train_profile_id_list):
    # Get profile IDs for current fold
    train_ids = [train_profile_id_list[i] for i in train_index]
    val_ids = [train_profile_id_list[i] for i in val_index]

    # Filter sequences based on profile IDs
    X_train, y_train = filter_sequences(padded_sequences, sequence_targets, train_ids, unique_profile_ids)
    X_val, y_val = filter_sequences(padded_sequences, sequence_targets, val_ids, unique_profile_ids)

    # Flatten and create subsequences
    train_subsequences, train_sub_targets = create_subsequences(X_train, y_train, sequence_length, step_size)
    val_subsequences, val_sub_targets = create_subsequences(X_val, y_val, sequence_length, step_size)

    # Convert to PyTorch tensors and create DataLoaders
    train_loader = DataLoader(TensorDataset(torch.tensor(train_subsequences, dtype=torch.float32),
                                            torch.tensor(train_sub_targets, dtype=torch.float32)),
                              batch_size=50, shuffle=True)
    val_loader = DataLoader(TensorDataset(torch.tensor(val_subsequences, dtype=torch.float32),
                                          torch.tensor(val_sub_targets, dtype=torch.float32)),
                            batch_size=50)

    # Initialize a fresh model, loss function, and optimizer for every fold
    model = LSTMModel(input_size, hidden_size, num_layers, output_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.MSELoss()

    # Train the model on the training data
    model.train()
    for epoch in range(num_epochs):
        for inputs, targets in train_loader:

            outputs = model(inputs)
            # Predictions are (batch, 1) but targets are (batch,); without the
            # reshape MSELoss broadcasts them to (batch, batch) and the loss is wrong.
            loss = criterion(outputs.reshape(targets.shape), targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Reports the loss of the last mini-batch of the epoch.
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

    # Evaluate the model on the validation data
    model.eval()
    total_loss = 0.0
    n_val_samples = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs.reshape(targets.shape), targets)
            # Weight each batch mean by its size so a smaller final batch does
            # not skew the fold's average.
            total_loss += loss.item() * inputs.size(0)
            n_val_samples += inputs.size(0)

    avg_loss = total_loss / n_val_samples
    performance_metrics.append(avg_loss)
    print(f'Validation Loss for fold: {avg_loss:.4f}')

# Summarise the cross-validation as the mean validation loss over all folds.
average_performance = np.asarray(performance_metrics).mean()
print(f'Average Performance across all folds: {average_performance}')