In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, TimeDistributed
import warnings
warnings.filterwarnings('ignore', category=pd.errors.PerformanceWarning)
warnings.filterwarnings('ignore', category=UserWarning, module='matplotlib')
warnings.filterwarnings('ignore', category=np.ComplexWarning)

In [None]:
# Location of the raw training CSV, relative to the notebook's working directory.
data_path = "../data/raw/waterDataTraining.csv"

# Read the whole file into memory; `df` stays the untouched raw frame.
df = pd.read_csv(data_path)

# Preview the first five rows (head() defaults to n=5).
df.head()

In [None]:
def data_preprocessing(df, columns_to_drop):
    """Return a time-indexed copy of ``df`` with gaps in ``pH`` interpolated.

    The input frame is never modified. The ``Time`` column is parsed with
    ``pd.to_datetime`` and becomes the index, ``columns_to_drop`` are removed,
    and missing ``pH`` values are filled by time-weighted interpolation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``Time`` and ``pH`` columns plus every
        column named in ``columns_to_drop``.
    columns_to_drop : list of str
        Column labels to remove; a missing label raises ``KeyError`` (the
        default behaviour of ``DataFrame.drop``).

    Returns
    -------
    pandas.DataFrame
        New frame indexed by ``Time``.
    """
    processed_df = (
        df.assign(Time=pd.to_datetime(df['Time']))
        .set_index('Time')
        .drop(columns=columns_to_drop)
    )
    # Assign the interpolated column back explicitly. Calling
    # ``processed_df['pH'].interpolate(..., inplace=True)`` operates on a
    # temporary selection and leaves the frame unchanged when pandas
    # Copy-on-Write is active.
    processed_df['pH'] = processed_df['pH'].interpolate(method='time')

    return processed_df

In [None]:
# Columns removed before modelling; only the remaining columns (pH among them)
# are carried forward into `data`.
columns_to_drop = [
    'Tp',
    'Cl',
    'Redox',
    'Leit',
    'Trueb',
    'Cl_2',
    'Fm',
    'Fm_2',
]

# `data` is the cleaned, time-indexed frame; the raw `df` is left as loaded.
data = data_preprocessing(df=df, columns_to_drop=columns_to_drop)

In [None]:
# Min-max scale pH (MinMaxScaler's default output range is [0, 1]).
# NOTE(review): the scaler is fitted on the full series, i.e. before the
# train/test split performed later in the notebook — confirm this is intended.
scaler = MinMaxScaler()

# fit_transform expects a 2-D input, hence the single-column frame selection.
data['pH_normalized'] = scaler.fit_transform(data.loc[:, ['pH']])

In [None]:
def create_sequences(data, seq_length):
    """Cut ``data`` into overlapping windows of ``seq_length`` consecutive items.

    Window ``k`` is ``data[k:k + seq_length]`` for every ``k`` in
    ``range(len(data) - seq_length)``, so consecutive windows are shifted by
    one element and the window ending on the very last element is not
    produced.

    Returns the windows stacked with ``np.array``; when no window fits the
    result is an empty array of shape ``(0,)``.
    """
    window_count = len(data) - seq_length
    windows = [data[start:start + seq_length] for start in range(window_count)]
    return np.array(windows)

# Number of consecutive readings per window fed to the autoencoder.
sequence_length = 10

# Window the normalized pH values.
ph_series = data['pH_normalized'].to_numpy()
sequences = create_sequences(ph_series, sequence_length)

# Chronological split: the leading 80% of windows train the model and the
# remainder is held out (no shuffling).
train_size = int(len(sequences) * 0.8)
train_sequences, test_sequences = sequences[:train_size], sequences[train_size:]

In [None]:
# LSTM autoencoder assembled layer by layer.
model = Sequential()

# Encoder: two stacked LSTMs; the second returns only its final output, so
# each window is summarised by a single 50-unit vector.
model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(sequence_length, 1)))
model.add(LSTM(50, activation='relu', return_sequences=False))

# Bridge: repeat that vector once per time step so the decoder sees a sequence.
model.add(RepeatVector(sequence_length))

# Decoder: two sequence-returning LSTMs followed by a per-time-step Dense(1)
# that emits one reconstructed value for every step of the window.
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(TimeDistributed(Dense(1)))

# Train to minimise mean squared reconstruction error with Adam.
model.compile(loss='mse', optimizer='adam')

model.summary()

In [None]:
# Append a trailing feature axis so each window has the
# (sequence_length, 1) shape declared as the model's input_shape.
X_train = train_sequences.reshape((-1, sequence_length, 1))
X_test = test_sequences.reshape((-1, sequence_length, 1))

# Autoencoder training: the input is also the target. Keras holds out the
# last 10% of the training samples for validation.
history = model.fit(
    x=X_train,
    y=X_train,
    batch_size=32,
    epochs=20,
    validation_split=0.1,
)

In [None]:
# Reconstruct every window with the trained autoencoder.
train_reconstructions = model.predict(X_train)
test_reconstructions = model.predict(X_test)

# Mean absolute reconstruction error of each window, averaged over the time axis.
train_errors = np.mean(np.abs(train_reconstructions - X_train), axis=1)
test_errors = np.mean(np.abs(test_reconstructions - X_test), axis=1)

# A window is anomalous when its error exceeds mean + 2 std of the training errors.
threshold = np.mean(train_errors) + 2 * np.std(train_errors)

test_anomalies = test_errors > threshold

# Map window-level flags back onto rows of `df`. The column must have exactly
# len(df) entries and be boolean so it can be used directly as a mask; the
# previous concatenation produced only sequence_length + n_test_windows float
# values, which does not match the frame's length.
# Assumes the windows were cut from consecutive rows of `df` in order and that
# X_train holds the leading windows, so test window i ends at row
# len(X_train) + i + sequence_length - 1; each flag is placed on that last row.
# Rows not covered by a test window stay False.
first_test_row = len(X_train) + sequence_length - 1
lstm_flags = np.zeros(len(df), dtype=bool)
window_flags = np.ravel(test_anomalies)[:max(len(df) - first_test_row, 0)]
lstm_flags[first_test_row:first_test_row + len(window_flags)] = window_flags
df['LSTM_Anomaly'] = lstm_flags


In [None]:
def plot_anomalies(df):
    """Plot the pH series with labelled events and LSTM detections overlaid.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pH``, ``EVENT`` and ``LSTM_Anomaly``. The
        two flag columns are coerced to boolean, so 0/1 numeric flags work as
        well as real booleans. If the index is not a ``DatetimeIndex`` but a
        ``Time`` column exists, that column is parsed and used for the x-axis
        so the axis matches its "Time" label; otherwise the index is used.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    # Pick x positions: real timestamps when available, else the raw index.
    if isinstance(df.index, pd.DatetimeIndex):
        x_values = df.index
    elif 'Time' in df.columns:
        x_values = pd.to_datetime(df['Time'])
    else:
        x_values = df.index
    x_values = np.asarray(x_values)

    ph_values = df['pH'].to_numpy()
    # Positional boolean masks; float/int flags would otherwise be treated as
    # positions (or rejected) when used for indexing.
    event_mask = df['EVENT'].to_numpy(dtype=bool)
    lstm_mask = df['LSTM_Anomaly'].to_numpy(dtype=bool)

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(x_values, ph_values, label='pH', color='blue')
    ax.scatter(x_values[event_mask], ph_values[event_mask],
               color='red', label='True Anomaly', zorder=5)
    # Hollow markers drawn on top so a detection that coincides with a
    # labelled event remains visible around the red dot.
    ax.scatter(x_values[lstm_mask], ph_values[lstm_mask],
               facecolors='none', edgecolors='orange',
               label='LSTM Detected Anomaly', zorder=6)
    ax.legend()
    ax.set_title('LSTM Anomaly Detection')
    ax.set_xlabel('Time')
    ax.set_ylabel('pH')
    plt.show()

# Render the overlay for the raw frame; it must already carry the
# 'LSTM_Anomaly' column written by the detection cell.
plot_anomalies(df)
