In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import pandas as pd
import plotly.graph_objects as go
import warnings
from tqdm import tqdm

# Ignore all warnings
warnings.filterwarnings("ignore")

# Load the data
df = pd.read_csv("final_data.csv")
df['Time (hours)'] = pd.to_datetime(df['Time (hours)'])

# Assuming your columns have already been processed for 'sma', 'std', and 'peak'
df['sma'] = df['Flow Rate (m³/s)'].rolling(window=12).mean()
df['std'] = df['Flow Rate (m³/s)'].rolling(window=12).std()
df['bb_up'] = df['sma'] + 1.75 * df['std']
df['peak'] = 0

# Drop NA values to prevent errors
df = df.dropna()

# Identifying the peaks
in_high = False
for i in tqdm(range(1, len(df))):
    if in_high == False:
        if df['Flow Rate (m³/s)'].iloc[i] > df['bb_up'].iloc[i] and df['Flow Rate (m³/s)'].iloc[i-1] < df['bb_up'].iloc[i-1]:
            in_high = True
            df['peak'].iloc[i] = 1
    else:
        if df['Flow Rate (m³/s)'].iloc[i] < df['sma'].iloc[i] and df['Flow Rate (m³/s)'].iloc[i-1] > df['sma'].iloc[i]:
            in_high = False
        else:
            df['peak'].iloc[i] = 1

# Create Plotly figure
fig = go.Figure()

# Plot the entire flow_rate line in blue
fig.add_trace(go.Scatter(
    x=df['Time (hours)'],
    y=df['Flow Rate (m³/s)'],
    mode='lines',
    name='Flow Rate (m³/s)',
    line=dict(color='blue')
))

# Plot the peak regions in red
peak_start = None
for i in range(1, len(df)):
    if df['peak'].iloc[i] == 1 and peak_start is None:
        # Start of a peak region
        peak_start = i
    elif df['peak'].iloc[i] == 0 and peak_start is not None:
        # End of a peak region
        fig.add_trace(go.Scatter(
            x=df['Time (hours)'].iloc[peak_start:i],
            y=df['Flow Rate (m³/s)'].iloc[peak_start:i],
            mode='lines',
            name='Peak Region',
            line=dict(color='red')
        ))
        peak_start = None

# If the last data point is part of a peak region, plot it
if peak_start is not None:
    fig.add_trace(go.Scatter(
        x=df['Time (hours)'].iloc[peak_start:],
        y=df['Flow Rate (m³/s)'].iloc[peak_start:],
        mode='lines',
        name='Peak Region',
        line=dict(color='red')
    ))

# Customize layout
fig.update_layout(
    title='Flow Rate with Peak Regions in Red',
    xaxis_title='Timestamp',
    yaxis_title='Flow Rate',
    template='plotly_dark',  # Optional, set to 'plotly' for a lighter theme
    showlegend=False
)

# Show the plot
fig.show()
df.to_csv("peaks.csv")

  0%|          | 0/8748 [00:00<?, ?it/s]

100%|██████████| 8748/8748 [00:01<00:00, 8165.96it/s]


In [38]:
len(df[df['leak']==1])

35

In [50]:
import numpy as np
import pandas as pd

def temp_anomaly(df):
    # Calculate the 95th and 5th percentiles over a rolling window of 15 days (15*24 hours)
    df['95_per'] = df['Water Temperature (°C)'].rolling(window=15*24).apply(lambda x: np.percentile(x, 95), raw=False)
    df['5_per'] = df['Water Temperature (°C)'].rolling(window=15*24).apply(lambda x: np.percentile(x, 5), raw=False)
    
    # Initialize the 'temp_anomaly' column to 'Normal'
    df['temp_anomaly'] = 'Normal'
    
    # Use np.where to classify anomalies based on percentiles
    df['temp_anomaly'] = np.where(df['Water Temperature (°C)'] > df['95_per'], 'high', df['temp_anomaly'])
    df['temp_anomaly'] = np.where(df['Water Temperature (°C)'] < df['5_per'], 'low', df['temp_anomaly'])
    
    return df

def pressure_anomaly(df):
    # Calculate the 95th and 5th percentiles over a rolling window of 15 days (15*24 hours)
    df['95_per_press'] = df['Pressure (Pa)'].rolling(window=15*24).apply(lambda x: np.percentile(x, 95), raw=False)
    df['5_per_press'] = df['Pressure (Pa)'].rolling(window=15*24).apply(lambda x: np.percentile(x, 5), raw=False)
    
    # Initialize the 'press_anomaly' column to 'Normal'
    df['press_anomaly'] = 'Normal'
    
    # Use np.where to classify anomalies based on percentiles
    df['press_anomaly'] = np.where(df['Pressure (Pa)'] > df['95_per_press'], 'high', df['press_anomaly'])
    df['press_anomaly'] = np.where(df['Pressure (Pa)'] < df['5_per_press'], 'low', df['press_anomaly'])
    
    return df


In [59]:
df=temp_anomaly(df)
df=pressure_anomaly(df)
import pandas as pd
import plotly.graph_objects as go


fig = go.Figure()

# Plot the water temperature (main temperature line)
fig.add_trace(go.Scatter(
    x=df['Time (hours)'],  # X-axis (time in hours)
    y=df['Water Temperature (°C)'],  # Y-axis (water temperature in °C)
    mode='lines',  # Line mode
    name='Water Temperature (°C)',  # Name of the line
    line=dict(color='blue')  # Blue color for the temperature line
))

# Plot the high anomalies in red
fig.add_trace(go.Scatter(
    x=df[df['temp_anomaly'] == 'high']['Time (hours)'],  # Filter data for high anomalies
    y=df[df['temp_anomaly'] == 'high']['Water Temperature (°C)'],  # Filter temperature for high anomalies
    mode='markers',  # Scatter plot (markers)
    name='High Temperature Anomaly',  # Name for the high anomaly
    marker=dict(color='red', size=5)  # Red markers for high anomalies
))

# Plot the low anomalies in green
fig.add_trace(go.Scatter(
    x=df[df['temp_anomaly'] == 'low']['Time (hours)'],  # Filter data for low anomalies
    y=df[df['temp_anomaly'] == 'low']['Water Temperature (°C)'],  # Filter temperature for low anomalies
    mode='markers',  # Scatter plot (markers)
    name='Low Temperature Anomaly',  # Name for the low anomaly
    marker=dict(color='green', size=5)  # Green markers for low anomalies
))

# Customize the layout
fig.update_layout(
    title='Water Temperature with High and Low Anomalies',
    xaxis_title='Time (Hours)',
    yaxis_title='Temperature (°C)',
    template='plotly_dark',  # Optional: Use a dark theme
    showlegend=True  # Show the legend
)

# Show the plot
fig.show()

fig = go.Figure()

# Plot the water temperature (main temperature line)
fig.add_trace(go.Scatter(
    x=df['Time (hours)'],  # X-axis (time in hours)
    y=df['Pressure (Pa)'],  # Y-axis (water temperature in °C)
    mode='lines',  # Line mode
    name='Pressure (Pa)',  # Name of the line
    line=dict(color='blue')  # Blue color for the temperature line
))

# Plot the high anomalies in red
fig.add_trace(go.Scatter(
    x=df[df['press_anomaly'] == 'high']['Time (hours)'],  # Filter data for high anomalies
    y=df[df['press_anomaly'] == 'high']['Pressure (Pa)'],  # Filter temperature for high anomalies
    mode='markers',  # Scatter plot (markers)
    name='High Pressure Anomaly',  # Name for the high anomaly
    marker=dict(color='red', size=5)  # Red markers for high anomalies
))

# Plot the low anomalies in green
fig.add_trace(go.Scatter(
    x=df[df['press_anomaly'] == 'low']['Time (hours)'],  # Filter data for low anomalies
    y=df[df['press_anomaly'] == 'low']['Pressure (Pa)'],  # Filter temperature for low anomalies
    mode='markers',  # Scatter plot (markers)
    name='Low Pressure Anomaly',  # Name for the low anomaly
    marker=dict(color='green', size=5)  # Green markers for low anomalies
))

# Customize the layout
fig.update_layout(
    title='Water Pressure with High and Low Anomalies',
    xaxis_title='Time (Hours)',
    yaxis_title='Pressure (Pa)',
    template='plotly_dark',  # Optional: Use a dark theme
    showlegend=True  # Show the legend
)

# Show the plot
fig.show()
df.to_csv('pressure_temp_anomaly.csv')

In [53]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df_pred = df[['Flow Rate (m³/s)', 'Pressure (Pa)', 'Water Temperature (°C)', 'leak']]

train_df, test_df = train_test_split(df_pred, test_size=0.2, random_state=42)
X_train=train_df.drop(["leak"],axis=1)
Y_train=train_df['leak']
X_test=test_df.drop(["leak"],axis=1)
Y_test=test_df['leak']
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss',nestimators=200,learning_rate=0.1)
model.fit(X_train,Y_train)
Y_hat=model.predict(X_test)
accuracy_score(Y_test, Y_hat)

0.9965714285714286

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

# Assuming df is your DataFrame with columns: 'Time', 'Flow Rate (m³/s)', 'Pressure (Pa)', 'Water Temperature (°C)'

# Step 1: Create lag features for the previous 48 periods
def create_lag_features(df, target_column, lag=48):
    """
    Creates lag features for the given target column.
    The lag features represent the values from the previous `lag` periods.
    """
    df_lagged = df.copy()
    for i in range(1, lag + 1):
        df_lagged[f'{target_column}_lag_{i}'] = df_lagged[target_column].shift(i)
    return df_lagged

# Define the target columns
target_columns = ['Flow Rate (m³/s)', 'Pressure (Pa)', 'Water Temperature (°C)']
lag = 48

# Create lag features for all target columns
df_lagged = df.copy()
for col in target_columns:
    df_lagged = create_lag_features(df_lagged, col, lag)

# Step 2: Drop rows with NaN values due to shifting
df_lagged.dropna(inplace=True)

# Step 3: Prepare the features (X) and target (y) for each variable
X = df_lagged.drop(columns=['Flow Rate (m³/s)', 'Pressure (Pa)', 'Water Temperature (°C)'])
y_flow = df_lagged['Flow Rate (m³/s)']
y_pressure = df_lagged['Pressure (Pa)']
y_temperature = df_lagged['Water Temperature (°C)']

# Step 4: Scale the data using MinMaxScaler
scaler = MinMaxScaler()

# Scale features and targets
X_scaled = scaler.fit_transform(X)

# Scale the target variables separately for flow, pressure, and temperature
y_flow_scaled = scaler.fit_transform(y_flow.values.reshape(-1, 1))
y_pressure_scaled = scaler.fit_transform(y_pressure.values.reshape(-1, 1))
y_temperature_scaled = scaler.fit_transform(y_temperature.values.reshape(-1, 1))

# Step 5: Reshape the data for LSTM (samples, timesteps, features)
X_scaled = X_scaled.reshape((X_scaled.shape[0], 1, X_scaled.shape[1]))

# Step 6: Split the data into training and test sets (80% train, 20% test)
X_train, X_test, y_flow_train, y_flow_test = train_test_split(X_scaled, y_flow_scaled, test_size=0.2, random_state=42)
_, _, y_pressure_train, y_pressure_test = train_test_split(X_scaled, y_pressure_scaled, test_size=0.2, random_state=42)
_, _, y_temperature_train, y_temperature_test = train_test_split(X_scaled, y_temperature_scaled, test_size=0.2, random_state=42)

# Step 7: Build and train the LSTM model for Flow Rate, Pressure, and Temperature prediction

def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=50, return_sequences=True, input_shape=input_shape))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50, return_sequences=False))
    model.add(Dropout(0.2))
    model.add(Dense(units=1))
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Build models for each target (Flow Rate, Pressure, Temperature)
model_flow = build_lstm_model((X_train.shape[1], X_train.shape[2]))
model_pressure = build_lstm_model((X_train.shape[1], X_train.shape[2]))
model_temperature = build_lstm_model((X_train.shape[1], X_train.shape[2]))

# Train the models
model_flow.fit(X_train, y_flow_train, epochs=50, batch_size=32, validation_data=(X_test, y_flow_test), verbose=2)
model_pressure.fit(X_train, y_pressure_train, epochs=50, batch_size=32, validation_data=(X_test, y_pressure_test), verbose=2)
model_temperature.fit(X_train, y_temperature_train, epochs=50, batch_size=32, validation_data=(X_test, y_temperature_test), verbose=2)

# Step 8: Predict on the test data
y_flow_pred = model_flow.predict(X_test)
y_pressure_pred = model_pressure.predict(X_test)
y_temperature_pred = model_temperature.predict(X_test)

# Step 9: Inverse transform the predictions and true values to get them back to the original scale
y_flow_pred = scaler.inverse_transform(y_flow_pred)
y_pressure_pred = scaler.inverse_transform(y_pressure_pred)
y_temperature_pred = scaler.inverse_transform(y_temperature_pred)

y_flow_test = scaler.inverse_transform(y_flow_test)
y_pressure_test = scaler.inverse_transform(y_pressure_test)
y_temperature_test = scaler.inverse_transform(y_temperature_test)

# Step 10: Evaluate the model performance using Mean Squared Error
mse_flow = mean_squared_error(y_flow_test, y_flow_pred)
mse_pressure = mean_squared_error(y_pressure_test, y_pressure_pred)
mse_temperature = mean_squared_error(y_temperature_test, y_temperature_pred)

print(f"Mean Squared Error for Flow Rate: {mse_flow}")
print(f"Mean Squared Error for Pressure: {mse_pressure}")
print(f"Mean Squared Error for Temperature: {mse_temperature}")

# Step 11: Forecast for the next time period using the last available data
latest_data = X_scaled[-1].reshape(1, 1, -1)  # Get the most recent row of input data

next_flow_forecast = model_flow.predict(latest_data)
next_pressure_forecast = model_pressure.predict(latest_data)
next_temperature_forecast = model_temperature.predict(latest_data)

print(f"Next Flow Rate forecast: {scaler.inverse_transform(next_flow_forecast)[0][0]}")
print(f"Next Pressure forecast: {scaler.inverse_transform(next_pressure_forecast)[0][0]}")
print(f"Next Temperature forecast: {scaler.inverse_transform(next_temperature_forecast)[0][0]}")

# Step 12: Plotting actual vs predicted values for visualization (optional)
plt.figure(figsize=(10, 6))

# Plot Flow Rate
plt.subplot(3, 1, 1)
plt.plot(y_flow_test, label='Actual Flow Rate')
plt.plot(y_flow_pred, label='Predicted Flow Rate')
plt.legend()
plt.title('Flow Rate Prediction')

# Plot Pressure
plt.subplot(3, 1, 2)
plt.plot(y_pressure_test, label='Actual Pressure')
plt.plot(y_pressure_pred, label='Predicted Pressure')
plt.legend()
plt.title('Pressure Prediction')

# Plot Temperature
plt.subplot(3, 1, 3)
plt.plot(y_temperature_test, label='Actual Temperature')
plt.plot(y_temperature_pred, label='Predicted Temperature')
plt.legend()
plt.title('Temperature Prediction')

plt.tight_layout()
plt.show()


ModuleNotFoundError: No module named 'tensorflow'

In [None]:
prediction=model.predict([scaler.inverse_transform(next_flow_forecast)[0][0],scaler.inverse_transform(next_pressure_forecast)[0][0],scaler.inverse_transform(next_temperature_forecast)[0][0]])
if prediction :
    print("Beware there is a possibility of leak in the next hour")
if scaler.inverse_transform(next_pressure_forecast)[0][0]>df['95_per_press'].iloc[-1]:
    print("Beware there is a possibility of high pressure anomaly in the next hour")
elif scaler.inverse_transform(next_pressure_forecast)[0][0]<df['5_per_press'].iloc[-1]:
    print("Beware there is a possibility of low pressure anomaly in the next hour")

if scaler.inverse_transform(next_pressure_forecast)[0][0]>df['95_per'].iloc[-1]:
    print("Beware there is a possibility of high temperature anomaly in the next hour")
elif scaler.inverse_transform(next_pressure_forecast)[0][0]<df['5_per'].iloc[-1]:
    print("Beware there is a possibility of low temperature anomaly in the next hour")



In [54]:
import numpy as np
import xgboost as xgb

# Example: Load the pre-trained model
# model = xgb.XGBClassifier()
# model.load_model('xgboost_model.json')

# Input for prediction: reshape the input data to a 2D array with 3 features
input_data = np.array([[25.4, 28.16, 17.9]])

# Make prediction
prediction = model.predict(input_data)

# Check if the model predicts a leak (assuming 1 indicates leak)
if prediction[0] == 1:
    print("Beware, there is a possibility of leak in the next hour")

# Check for pressure anomaly (compare against percentiles)
if 28.16 > df['95_per_press'].iloc[-1]:
    print("Beware, there is a possibility of high pressure anomaly in the next hour")
elif 28.16 < df['5_per_press'].iloc[-1]:
    print("Beware, there is a possibility of low pressure anomaly in the next hour")

# Check for temperature anomaly (compare against percentiles)
if 17.9 > df['95_per'].iloc[-1]:
    print("Beware, there is a possibility of high temperature anomaly in the next hour")
elif 17.9 < df['5_per'].iloc[-1]:
    print("Beware, there is a possibility of low temperature anomaly in the next hour")


Beware, there is a possibility of low pressure anomaly in the next hour


In [44]:
df.iloc[-1]

Time (hours)                2023-12-31 23:00:00
Ambient Temperature (°C)              13.733087
Flow Rate (m³/s)                      98.390632
Pressure (Pa)                         49.697387
Water Temperature (°C)                17.997153
leak                                          0
sma                                   97.760836
std                                    0.491219
bb_up                                 98.620469
peak                                          1
Name: 8759, dtype: object