In [None]:
import pandas as pd
import os

# Load every CSV file found (recursively) under the 'Preprocessed_Data' folder.
folder_path = 'Preprocessed_Data'
dataframes = []

for root, dirs, files in os.walk(folder_path):
    # os.walk yields entries in arbitrary filesystem order. Sorting the
    # sub-directories in place (which steers the traversal) and the file names
    # makes the row order of the combined frame reproducible; later cells compute
    # lag and rolling features over consecutive rows, so that order matters.
    dirs.sort()
    for file in sorted(files):
        if file.endswith(".csv"):
            file_path = os.path.join(root, file)
            df = pd.read_csv(file_path)
            dataframes.append(df)

# pd.concat fails with an opaque "No objects to concatenate" on an empty list
# (a missing folder also ends up here, because os.walk silently yields nothing),
# so stop early with a message that points at the real problem.
if not dataframes:
    raise FileNotFoundError(f"No CSV files found under {folder_path!r}")

# Combine all dataframes into a single dataframe with a fresh 0..n-1 index
combined_data = pd.concat(dataframes, ignore_index=True)


  combined_data = pd.concat(dataframes, ignore_index=True)


In [None]:
# Show the last rows of the combined frame as a quick sanity check of the load.
combined_data.tail()

Unnamed: 0,Date Time,Slave_Device1_CH1_FLOW m3/h,Slave_Device1_CH2_VELOCITY m/s,Slave_Device1_CH3_Pressure BAR
4608531,2023-09-29 23:23:14,7.18,0.12,1.17
4608532,2023-09-29 23:30:16,8.53,0.15,1.16
4608533,2023-09-29 23:37:18,5.54,0.1,1.18
4608534,2023-09-29 23:44:16,5.76,0.1,1.19
4608535,2023-09-29 23:51:21,5.04,0.09,1.18


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric sensor columns.
combined_data.describe()

Unnamed: 0,Slave_Device1_CH1_FLOW m3/h,Slave_Device1_CH2_VELOCITY m/s,Slave_Device1_CH3_Pressure BAR
count,4608536.0,4608536.0,4608536.0
mean,21.67178,0.2299581,1.169873
std,32.64266,0.33941,2.133565
min,0.0,0.0,0.0
25%,7.81,0.15,1.11
50%,10.25,0.19,1.21
75%,15.19,0.26,1.29
max,2266.0,36.0,134.0


In [None]:
# Parse the timestamp column once, then derive the calendar features from it.
timestamps = pd.to_datetime(combined_data['Date Time'])
combined_data['Date Time'] = timestamps
combined_data['hour'] = timestamps.dt.hour
combined_data['day_of_week'] = timestamps.dt.dayofweek

# Give the flow and pressure channels short names for easier access.
combined_data = combined_data.rename(columns={
    'Slave_Device1_CH1_FLOW m3/h': 'flow',
    'Slave_Device1_CH3_Pressure BAR': 'pressure',
})

# Lag-1 features: the value of each channel on the previous row.
# NOTE(review): "previous row" is only the previous reading if the rows are in
# chronological order for a single site - confirm against how the CSVs were combined.
for channel in ('flow', 'pressure'):
    combined_data[f'{channel}_lag1'] = combined_data[channel].shift(1)

# Trailing mean over a 5-row window (the current row and the four before it).
for channel in ('flow', 'pressure'):
    combined_data[f'{channel}_roll_mean'] = combined_data[channel].rolling(window=5).mean()

# The lag and rolling columns are NaN on the first rows; this drops every row
# that has a NaN in any column.
combined_data.dropna(inplace=True)

# Model inputs. The 'burst_event' label does not exist yet at this point, so
# only the feature matrix is defined here.
feature_columns = [
    'flow', 'pressure', 'hour', 'day_of_week',
    'flow_lag1', 'pressure_lag1',
    'flow_roll_mean', 'pressure_roll_mean',
]
features = combined_data[feature_columns]


In [None]:
# List the current column names of combined_data.
combined_data.columns

Index(['Date Time', 'flow', 'Slave_Device1_CH2_VELOCITY m/s', 'pressure',
       'hour', 'day_of_week', 'flow_lag1', 'pressure_lag1', 'flow_roll_mean',
       'pressure_roll_mean'],
      dtype='object')

In [None]:
import numpy as np

# Rule-based labelling thresholds. Each one is compared against a column of
# combined_data, so it is expressed in that column's unit.
FLOW_THRESHOLD = 100  # flow level; the source flow column is recorded in m3/h
PRESSURE_THRESHOLD = 3  # pressure level, in bar
FLOW_CHANGE_THRESHOLD = 10  # absolute row-to-row change in flow
PRESSURE_CHANGE_THRESHOLD = 0.5  # absolute row-to-row change in pressure

# Absolute difference to the previous row. Because of abs(), a sudden drop is
# treated the same as a sudden rise.
for channel in ('flow', 'pressure'):
    combined_data[f'{channel}_change'] = combined_data[channel].diff().abs()

# A row is a burst (1) when ANY of the four conditions holds, otherwise 0.
# Comparisons against NaN are False, so the first row (no previous value)
# can only be flagged by the two level conditions.
burst_mask = (
    combined_data['flow'].ge(FLOW_THRESHOLD)
    | combined_data['pressure'].ge(PRESSURE_THRESHOLD)
    | combined_data['flow_change'].ge(FLOW_CHANGE_THRESHOLD)
    | combined_data['pressure_change'].ge(PRESSURE_CHANGE_THRESHOLD)
)
combined_data['burst_event'] = burst_mask.astype(int)

# diff() leaves a NaN on the first row; drop every row that has a NaN in any column.
combined_data.dropna(inplace=True)

# Rows were removed above, so the feature matrix is rebuilt here to stay aligned
# with the labels.
feature_columns = [
    'flow', 'pressure', 'hour', 'day_of_week',
    'flow_lag1', 'pressure_lag1',
    'flow_roll_mean', 'pressure_roll_mean',
]
features = combined_data[feature_columns]
labels = combined_data['burst_event']


In [None]:
# Summary statistics of the 0/1 burst label; its mean is the fraction of rows labelled as burst.
labels.describe()

count    4.608531e+06
mean     1.917878e-02
std      1.371530e-01
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      0.000000e+00
max      1.000000e+00
Name: burst_event, dtype: float64

In [None]:
# labels = combined_data['burst_event']

In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Hold out 20% of the rows for evaluation. Bursts are only about 2% of the rows,
# so the split is stratified to keep the same burst ratio in both parts.
# NOTE(review): this is a random split over rows that carry lag/rolling features
# of their neighbours, so train and test rows are not independent; a
# chronological split would be a stricter evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)

# Wrap the splits in LightGBM datasets; `reference` makes the validation set
# reuse the feature binning of the training set.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set parameters (tuned for faster training)
params = {
    'objective': 'binary',          # for binary classification
    'boosting_type': 'gbdt',        # traditional gradient boosting
    'n_estimators': 100,            # number of boosting rounds
    'max_depth': 10,                # limit depth to reduce complexity
    'num_leaves': 31,               # default for LightGBM, balance with max_depth
    'learning_rate': 0.1,           # faster training
    'n_jobs': -1                    # use all available cores
}

# Train the model; the held-out set is passed only so its metric is tracked
# during training (no early stopping is configured).
model = lgb.train(params, train_data, valid_sets=[test_data])

# Evaluate the model. predict() returns the probability of the positive class;
# threshold it at 0.5 in one vectorised step instead of a Python-level loop.
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(int)
print(confusion_matrix(y_test, y_pred_binary))
print(classification_report(y_test, y_pred_binary))

# NOTE(review): 'burst_event' is defined purely by thresholds on flow, pressure
# and their one-row change, and the features contain flow, pressure and their
# lag-1 values. Near-perfect scores therefore show that the model recovered the
# labelling rule, not that it detects real bursts.




[LightGBM] [Info] Number of positive: 70987, number of negative: 3615837
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.132149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1418
[LightGBM] [Info] Number of data points in the train set: 3686824, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.019254 -> initscore=-3.930582
[LightGBM] [Info] Start training from score -3.930582
[[904182    126]
 [   196  17203]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    904308
           1       0.99      0.99      0.99     17399

    accuracy                           1.00    921707
   macro avg       1.00      0.99      1.00    921707
weighted avg       1.00      1.00      1.00    921707



In [None]:
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.metrics import classification_report, confusion_matrix
# from sklearn.preprocessing import StandardScaler

# # Standardize features
# scaler = StandardScaler()
# features_scaled = scaler.fit_transform(features)

# # Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.2, random_state=42, shuffle=False)

# # Initialize and train the model with hyperparameter tuning
# model = RandomForestClassifier()
# param_grid = {
#     'n_estimators': [100, 200],
#     'max_depth': [None, 10, 20],
# }
# grid_search = GridSearchCV(model, param_grid, cv=5)
# grid_search.fit(X_train, y_train)

# # Model evaluation
# y_pred = grid_search.predict(X_test)
# print(confusion_matrix(y_test, y_pred))
# print(classification_report(y_test, y_pred))


KeyboardInterrupt: 

In [None]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam

# Example parameters
# TIME_STEPS = 30  # Number of time steps in each sequence
TIME_STEPS = 20  # Number of consecutive rows in each input sequence
FEATURES = 3  # Number of features: FLOW, VELOCITY, PRESSURE

# Assuming `data` is your concatenated DataFrame with normalized values

# Prepare sequences
def create_sequences(data, time_steps=TIME_STEPS, feature_cols=None, label_col='burst_event'):
    """Turn a row-ordered frame into overlapping windows and next-row labels.

    Window ``i`` holds rows ``i .. i + time_steps - 1`` and is paired with the
    value of ``label_col`` on row ``i + time_steps``, i.e. the row right after
    the window.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose rows are already in the order the windows should follow.
    time_steps : int
        Number of consecutive rows per window (must be >= 1).
    feature_cols : sequence of str, optional
        Columns to place in the windows. ``None`` (the default) uses every
        column of ``data``, including ``label_col``.
    label_col : str
        Column holding the target value.

    Returns
    -------
    (sequences, labels)
        ``sequences`` has shape ``(len(data) - time_steps, time_steps, n_columns)``
        and ``labels`` has shape ``(len(data) - time_steps,)``. When ``data`` has
        ``time_steps`` rows or fewer, both are empty but keep those ranks.
        ``sequences`` is a read-only view onto a single copy of the values, so
        no per-window copies are made; call ``.copy()`` on it before modifying it.

    Raises
    ------
    ValueError
        If ``time_steps`` is smaller than 1.
    """
    # Local import: keeps the helper usable without touching the import cell.
    from numpy.lib.stride_tricks import sliding_window_view

    if time_steps < 1:
        raise ValueError("time_steps must be at least 1")

    source = data if feature_cols is None else data[list(feature_cols)]
    values = source.to_numpy()
    targets = data[label_col].to_numpy()

    n_sequences = len(data) - time_steps
    if n_sequences <= 0:
        # Not enough rows for even one (window, next-row label) pair.
        return np.empty((0, time_steps, values.shape[1]), dtype=values.dtype), targets[:0]

    # sliding_window_view appends the window axis last: (n_windows, n_columns, time_steps).
    # The final window has no following row to take a label from, hence the slice.
    windows = sliding_window_view(values, time_steps, axis=0)[:n_sequences]
    return windows.transpose(0, 2, 1), targets[time_steps:]

# Columns fed to the LSTM; their count has to agree with FEATURES, which fixes
# the model's input shape.
SEQUENCE_COLUMNS = ['flow', 'Slave_Device1_CH2_VELOCITY m/s', 'pressure']
assert len(SEQUENCE_COLUMNS) == FEATURES, "FEATURES must match the number of LSTM input columns"

# Work on a copy that holds only the sensor columns and the label. Scaling
# combined_data in place would overwrite the raw readings that the
# threshold-based labelling cell depends on, making that cell wrong on a re-run.
sequence_data = combined_data[SEQUENCE_COLUMNS + ['burst_event']].copy()

# Scale the sensor columns to [0, 1]
# NOTE(review): the scaler is fitted on all rows, including those that end up in
# the test split; fit it on training rows only for a stricter evaluation.
scaler = MinMaxScaler()
sequence_data[SEQUENCE_COLUMNS] = scaler.fit_transform(sequence_data[SEQUENCE_COLUMNS])

# Split data into sequences
X, y = create_sequences(sequence_data)

# create_sequences windows every column of the frame it receives, so the label
# travels along as the last column. Keep only the first FEATURES columns so the
# inputs match input_shape below and the model never sees the label column.
# float32 is what Keras computes in and halves the memory of the windows.
X = np.asarray(X[:, :, :FEATURES], dtype='float32')
y = np.asarray(y, dtype='float32')

# Train-test split, stratified because bursts are a small minority of the labels.
# NOTE(review): neighbouring windows overlap, so a random split puts nearly
# identical windows on both sides; a chronological split avoids that.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# LSTM Model
BATCH_SIZE = 64  # mini-batch size used by model.fit

# Simplified LSTM model: one small LSTM layer that returns only its final
# state, followed by a single sigmoid unit giving the burst probability.
model = Sequential([
    LSTM(25, input_shape=(TIME_STEPS, FEATURES)),
    Dense(1, activation='sigmoid')
])

# NOTE(review): with so few positive labels, accuracy alone says little;
# consider tracking precision/recall as well.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train with reduced epochs for testing; 20% of the training windows are held
# out by Keras for validation.
history = model.fit(X_train, y_train, epochs=5, batch_size=BATCH_SIZE, validation_split=0.2)



In [None]:
# Compute the loss and accuracy of the trained model on the held-out test split
# and report both.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}', f'Test Accuracy: {accuracy}', sep='\n')


In [None]:
# Predict burst probabilities for the held-out test split.
y_pred = model.predict(X_test)
# Convert probabilities to binary predictions: 1 when above 0.5, otherwise 0.
y_pred_classes = np.where(y_pred > 0.5, 1, 0)
