# 3.1 SWAT Data Preprocessing
First, We will perform pre processing steps on SWAT dataset.

## Importing important Libraries:

In [1]:
import pandas as pd                    # For data manipulation and preprocessing
import seaborn as sns                  # For data visualization
import numpy as np                     # For numerical operations
import matplotlib.pyplot as plt       # For plotting graphs
from sklearn.model_selection import train_test_split  # For splitting the dataset into training and testing sets
from sklearn.feature_selection import SelectKBest, f_classif  # For feature selection
from sklearn.feature_selection import RFE    # For recursive feature elimination
from sklearn.ensemble import RandomForestClassifier  # For building a random forest classifier model
from sklearn.linear_model import LogisticRegression  # For building a logistic regression classifier model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix  # For evaluating model performance

%matplotlib inline
sns.set_theme()

### Loading data

In [2]:
swat_data = pd.read_csv('SWaT_Dataset_Normal_v1_modified.csv')


### Describing data

In [3]:
# Train data
print('swat training data:')
swat_data.info()
swat_data.describe()


swat training data:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495000 entries, 0 to 494999
Data columns (total 53 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0    Timestamp     495000 non-null  object 
 1   FIT101         495000 non-null  float64
 2   LIT101         495000 non-null  float64
 3   MV101          495000 non-null  int64  
 4   P101           495000 non-null  int64  
 5   P102           495000 non-null  int64  
 6   AIT201         495000 non-null  float64
 7   AIT202         495000 non-null  float64
 8   AIT203         495000 non-null  float64
 9   FIT201         495000 non-null  float64
 10  MV201          495000 non-null  int64  
 11  P201           495000 non-null  int64  
 12  P202           495000 non-null  int64  
 13  P203           495000 non-null  int64  
 14  P204           495000 non-null  int64  
 15  P205           495000 non-null  int64  
 16  P206           495000 non-null  int64  
 17  DPIT301  

Unnamed: 0,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,...,FIT504,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603
count,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,...,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0
mean,1.850517,587.532773,1.720564,1.749149,1.0,263.783544,8.388162,348.379334,1.834095,1.746497,...,0.307254,1.996893,1.0,251.780601,1.187011,190.928428,0.014427,1.0,1.007943,1.0
std,1.132519,121.666482,0.457612,0.433503,0.0,4.787117,0.090233,49.45001,1.059288,0.443279,...,0.017413,0.055654,0.0,13.600787,0.204781,10.61423,0.148934,0.0,0.088771,0.0
min,0.0,120.6237,0.0,1.0,1.0,251.6662,8.258652,312.2789,0.0,0.0,...,0.0,1.0,1.0,8.891951,0.0,3.108177,0.0,1.0,1.0,1.0
25%,0.0,508.441,1.0,1.0,1.0,260.7344,8.349654,327.352,0.622016,2.0,...,0.306633,2.0,1.0,250.0,1.057252,189.022,0.0,1.0,1.0,1.0
50%,2.491432,525.6337,2.0,2.0,1.0,265.2845,8.366636,330.9664,2.443085,2.0,...,0.308362,2.0,1.0,253.0441,1.121328,191.986,6.4e-05,1.0,1.0,1.0
75%,2.592,676.7175,2.0,2.0,1.0,266.9828,8.407652,335.3499,2.451799,2.0,...,0.310284,2.0,1.0,255.0468,1.217441,193.8605,6.4e-05,1.0,1.0,1.0
max,2.745092,817.5565,2.0,2.0,1.0,272.5263,8.988273,567.4699,2.487938,2.0,...,0.31701,2.0,1.0,264.6437,3.668343,200.6376,1.746131,1.0,2.0,1.0


### Missing values check

In [4]:
# SWAT Data
missing_values_train = swat_data.isnull().sum()
print("\nMissing Values in Train Data:")
print(missing_values_train)


Missing Values in Train Data:
 Timestamp       0
FIT101           0
LIT101           0
MV101            0
P101             0
P102             0
AIT201           0
AIT202           0
AIT203           0
FIT201           0
MV201            0
P201             0
P202             0
P203             0
P204             0
P205             0
P206             0
DPIT301          0
FIT301           0
LIT301           0
MV301            0
MV302            0
MV303            0
MV304            0
P301             0
P302             0
AIT401           0
AIT402           0
FIT401           0
LIT401           0
P401             0
P402             0
P403             0
P404             0
UV401            0
AIT501           0
AIT502           0
AIT503           0
AIT504           0
FIT501           0
FIT502           0
FIT503           0
FIT504           0
P501             0
P502             0
PIT501           0
PIT502           0
PIT503           0
FIT601           0
P601             0
P602             0


### Duplicate value check

In [5]:
swat_data.duplicated().sum()

np.int64(0)

### Convert categorical variables:

#### One-hot encode the categorical features

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Categorical column names
categorical_col_names = ['Normal/Attack']

# Initialize OneHotEncoder
encoder = OneHotEncoder()

# swat_data 
one_hot_encoded_train = encoder.fit_transform(swat_data.loc[:,categorical_col_names])

one_hot_df_train = pd.DataFrame(one_hot_encoded_train.toarray(), columns=encoder.get_feature_names_out(categorical_col_names))
df_train_ecdoded = pd.concat([swat_data, one_hot_df_train], axis=1)
df_train_ecdoded = df_train_ecdoded.drop(columns=categorical_col_names, axis=1)
df_train = df_train_ecdoded

In [7]:
df_train

Unnamed: 0,Timestamp,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,...,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack_Normal
0,22/12/2015 4:30:00 PM,0.000000,124.3135,1,1,1,251.9226,8.313446,312.7916,0.000000,...,1,1,9.100231,0.000000,3.3485,0.000256,1,1,1,1.0
1,22/12/2015 4:30:01 PM,0.000000,124.3920,1,1,1,251.9226,8.313446,312.7916,0.000000,...,1,1,9.100231,0.000000,3.3485,0.000256,1,1,1,1.0
2,22/12/2015 4:30:02 PM,0.000000,124.4705,1,1,1,251.9226,8.313446,312.7916,0.000000,...,1,1,9.100231,0.000000,3.3485,0.000256,1,1,1,1.0
3,22/12/2015 4:30:03 PM,0.000000,124.6668,1,1,1,251.9226,8.313446,312.7916,0.000000,...,1,1,9.100231,0.000000,3.3485,0.000256,1,1,1,1.0
4,22/12/2015 4:30:04 PM,0.000000,124.5098,1,1,1,251.9226,8.313446,312.7916,0.000000,...,1,1,9.100231,0.000000,3.3485,0.000256,1,1,1,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494995,28/12/2015 9:59:55 AM,2.460366,523.0430,2,2,1,262.0161,8.396437,328.5055,2.442316,...,2,1,250.817100,1.778105,189.8552,0.000128,1,1,1,1.0
494996,28/12/2015 9:59:56 AM,2.448836,522.9645,2,2,1,262.0161,8.396437,328.5055,2.442316,...,2,1,250.817100,1.778105,189.5027,0.000128,1,1,1,1.0
494997,28/12/2015 9:59:57 AM,2.434744,522.8860,2,2,1,262.0161,8.396437,328.6337,2.444879,...,2,1,250.817100,1.778105,189.5027,0.000128,1,1,1,1.0
494998,28/12/2015 9:59:58 AM,2.428338,522.9252,2,2,1,262.0161,8.396437,328.6337,2.445391,...,2,1,250.817100,1.649953,189.5027,0.000128,1,1,1,1.0


### Separate features and target variable

#### -> Dropping Timestamp , and setting Attacktype as Target variable

In [8]:
# Remove leading whitespace from column names
df_train.columns = df_train.columns.str.strip()

# Drop the 'Timestamp' column
X_train_data = df_train.drop(columns=['Timestamp'])
y_train_data = df_train['Normal/Attack_Normal']



In [9]:
X_train_data.info()
X_train_data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495000 entries, 0 to 494999
Data columns (total 52 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   FIT101                495000 non-null  float64
 1   LIT101                495000 non-null  float64
 2   MV101                 495000 non-null  int64  
 3   P101                  495000 non-null  int64  
 4   P102                  495000 non-null  int64  
 5   AIT201                495000 non-null  float64
 6   AIT202                495000 non-null  float64
 7   AIT203                495000 non-null  float64
 8   FIT201                495000 non-null  float64
 9   MV201                 495000 non-null  int64  
 10  P201                  495000 non-null  int64  
 11  P202                  495000 non-null  int64  
 12  P203                  495000 non-null  int64  
 13  P204                  495000 non-null  int64  
 14  P205                  495000 non-null  int64  
 15  

Unnamed: 0,FIT101,LIT101,MV101,P101,P102,AIT201,AIT202,AIT203,FIT201,MV201,...,P501,P502,PIT501,PIT502,PIT503,FIT601,P601,P602,P603,Normal/Attack_Normal
count,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,...,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0,495000.0
mean,1.850517,587.532773,1.720564,1.749149,1.0,263.783544,8.388162,348.379334,1.834095,1.746497,...,1.996893,1.0,251.780601,1.187011,190.928428,0.014427,1.0,1.007943,1.0,1.0
std,1.132519,121.666482,0.457612,0.433503,0.0,4.787117,0.090233,49.45001,1.059288,0.443279,...,0.055654,0.0,13.600787,0.204781,10.61423,0.148934,0.0,0.088771,0.0,0.0
min,0.0,120.6237,0.0,1.0,1.0,251.6662,8.258652,312.2789,0.0,0.0,...,1.0,1.0,8.891951,0.0,3.108177,0.0,1.0,1.0,1.0,1.0
25%,0.0,508.441,1.0,1.0,1.0,260.7344,8.349654,327.352,0.622016,2.0,...,2.0,1.0,250.0,1.057252,189.022,0.0,1.0,1.0,1.0,1.0
50%,2.491432,525.6337,2.0,2.0,1.0,265.2845,8.366636,330.9664,2.443085,2.0,...,2.0,1.0,253.0441,1.121328,191.986,6.4e-05,1.0,1.0,1.0,1.0
75%,2.592,676.7175,2.0,2.0,1.0,266.9828,8.407652,335.3499,2.451799,2.0,...,2.0,1.0,255.0468,1.217441,193.8605,6.4e-05,1.0,1.0,1.0,1.0
max,2.745092,817.5565,2.0,2.0,1.0,272.5263,8.988273,567.4699,2.487938,2.0,...,2.0,1.0,264.6437,3.668343,200.6376,1.746131,1.0,2.0,1.0,1.0


## Imputing missing values with the mean values

In [10]:
# Replace infinity values with NaN
X_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Impute missing values with the mean of each column
X_train_data.fillna(X_train_data.mean(), inplace=True)

# Check if NaN values have been replaced
nan_counts_after_imputation = X_train_data.isnull().sum()
print("NaN Counts after imputation:")
print(nan_counts_after_imputation)

NaN Counts after imputation:
FIT101                  0
LIT101                  0
MV101                   0
P101                    0
P102                    0
AIT201                  0
AIT202                  0
AIT203                  0
FIT201                  0
MV201                   0
P201                    0
P202                    0
P203                    0
P204                    0
P205                    0
P206                    0
DPIT301                 0
FIT301                  0
LIT301                  0
MV301                   0
MV302                   0
MV303                   0
MV304                   0
P301                    0
P302                    0
AIT401                  0
AIT402                  0
FIT401                  0
LIT401                  0
P401                    0
P402                    0
P403                    0
P404                    0
UV401                   0
AIT501                  0
AIT502                  0
AIT503                  0
AIT504   

# Feature Selection: 

- I have peformed many techniques below to select features for training the model, like feature importance , Mulitcolinearity Check, RFE, and correaltion analysis

###  check for multicollinearity

###  Recursive Feature Elimination with Random Forest


In [11]:
model = RandomForestClassifier()
rfe = RFE(model, n_features_to_select=15)
rfe.fit(X_train_data, y_train_data)

### Selected features from RFE

In [12]:
selected_features_rfe = X_train_data.columns[rfe.support_]
print("Selected features from RFE:")
print(selected_features_rfe)

Selected features from RFE:
Index(['AIT504', 'FIT501', 'FIT502', 'FIT503', 'FIT504', 'P501', 'P502',
       'PIT501', 'PIT502', 'PIT503', 'FIT601', 'P601', 'P602', 'P603',
       'Normal/Attack_Normal'],
      dtype='object')


## So we are considering 'selected_features' as the final list of features from Feature selection process:



In [13]:

selected_features = ['AIT504', 'FIT501', 'FIT502', 'FIT503', 'FIT504', 'P501', 'P502',
       'PIT501', 'PIT502', 'PIT503', 'FIT601', 'P601', 'P602', 'P603']

### Correlation analysis on New Features and Existing Features

In [14]:
# Calculate correlation between all features and the target variable
correlation_with_target = X_train_data[selected_features + ['Normal/Attack_Normal']].corr()['Normal/Attack_Normal'].sort_index(ascending=False)

# Display correlation values
print(correlation_with_target)

PIT503                 NaN
PIT502                 NaN
PIT501                 NaN
P603                   NaN
P602                   NaN
P601                   NaN
P502                   NaN
P501                   NaN
Normal/Attack_Normal   NaN
FIT601                 NaN
FIT504                 NaN
FIT503                 NaN
FIT502                 NaN
FIT501                 NaN
AIT504                 NaN
Name: Normal/Attack_Normal, dtype: float64


### Final Feature List for training model

In [15]:
# Concatenate dataframes along columns
df_final = X_train_data[selected_features]

# Display the combined dataframe
print(df_final)

           AIT504    FIT501    FIT502    FIT503    FIT504  P501  P502  \
0       123.31450  0.001538  0.001409  0.001664  0.000000     1     1   
1       123.31450  0.001538  0.001409  0.001664  0.000000     1     1   
2       123.31450  0.001538  0.001409  0.001664  0.000000     1     1   
3       123.31450  0.001538  0.001409  0.001664  0.000000     1     1   
4       123.31450  0.001538  0.001409  0.001664  0.000000     1     1   
...           ...       ...       ...       ...       ...   ...   ...   
494995   12.03538  1.726352  1.292430  0.735269  0.308619     2     1   
494996   12.03538  1.724942  1.281158  0.735269  0.308619     2     1   
494997   12.03538  1.723789  1.272576  0.735269  0.308619     2     1   
494998   12.03538  1.723789  1.272576  0.735269  0.308619     2     1   
494999   12.03538  1.723789  1.279621  0.735269  0.307786     2     1   

            PIT501    PIT502    PIT503    FIT601  P601  P602  P603  
0         9.100231  0.000000    3.3485  0.000256     1

# Feature Scaling

## Using MinMaxScaler()

### First, I performed individually on Training Data

In [16]:
from sklearn.preprocessing import MinMaxScaler

# Select the columns from the DataFrame
X_final = df_final[selected_features]

# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Fit and transform the features
X_scaled_train = scaler.fit_transform(X_final)

# Use X_scaled_standard for further analysis or modeling


In [17]:
print('Scaled Trained Data: X_scaled_train')
print(X_scaled_train[:5])  # Print the first 5 rows of the scaled features

Scaled Trained Data: X_scaled_train
[[5.27549303e-01 1.45942525e-04 5.64883354e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 8.14383482e-04
  0.00000000e+00 1.21664406e-03 1.46783661e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.27549303e-01 1.45942525e-04 5.64883354e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 8.14383482e-04
  0.00000000e+00 1.21664406e-03 1.46783661e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.27549303e-01 1.45942525e-04 5.64883354e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 8.14383482e-04
  0.00000000e+00 1.21664406e-03 1.46783661e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.27549303e-01 1.45942525e-04 5.64883354e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00 0.00000000e+00 8.14383482e-04
  0.00000000e+00 1.21664406e-03 1.46783661e-04 0.00000000e+00
  0.00000000e+00 0.00000000e+00]
 [5.27549303e-01 1.45942525e-04 5.64883354e-04 0.00000000e+00
  0.00000000e+00 0.0000000

### Note: After Scaling the Train and Test data, We can start working on model development, We will use Scale Train Data for training new models.

# 3.2 Model Development

## Model Selection

### Splitting the data 

In [18]:
from sklearn.model_selection import train_test_split

# Define features (X) and target variable (y)
X = X_scaled_train
y = df_train['Normal/Attack_Normal']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


##  Random forest classification Model

In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Instantiate the model
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 1.0


## To predict whether a system has been under attack or not based on your dataset with attack types, you can approach it as a binary classification problem where the target variable is binary: 1 for "under attack" and 0 for "not under attack" (normal).

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Data Preparation
# Assume X_train, X_test, y_train, y_test are prepared

# Model Training
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Model Evaluation
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Feature Importance
feature_importance = model.feature_importances_
# Analyze feature importance to understand which features are important for classification

# Deployment and Monitoring
# Deploy the trained model into production and monitor its performance over time


              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     99000

    accuracy                           1.00     99000
   macro avg       1.00      1.00      1.00     99000
weighted avg       1.00      1.00      1.00     99000



# Saving our trained model

In [21]:
# After training your model
from sklearn.ensemble import RandomForestClassifier
import joblib

# Assume `model` is your trained RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Save the trained model to a file
joblib.dump(model, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [22]:
pip show scikit-learn

Name: scikit-learn
Version: 1.5.2
Summary: A set of python modules for machine learning and data mining
Home-page: https://scikit-learn.org
Author: 
Author-email: 
License: BSD 3-Clause License
        
        Copyright (c) 2007-2024 The scikit-learn developers.
        All rights reserved.
        
        Redistribution and use in source and binary forms, with or without
        modification, are permitted provided that the following conditions are met:
        
        * Redistributions of source code must retain the above copyright notice, this
          list of conditions and the following disclaimer.
        
        * Redistributions in binary form must reproduce the above copyright notice,
          this list of conditions and the following disclaimer in the documentation
          and/or other materials provided with the distribution.
        
        * Neither the name of the copyright holder nor the names of its
          contributors may be used to endorse or promote produ

In [23]:
# Short script for this ML model:

# import pandas as pd
# import numpy as np
# import joblib
# from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
# from sklearn.metrics import accuracy_score, classification_report

# # Load data
# swat_data = pd.read_csv('SWaT_Dataset_Normal_v1_modified.csv')

# # One-hot encode the target variable
# categorical_col_names = ['Normal/Attack']
# encoder = OneHotEncoder()
# one_hot_encoded_train = encoder.fit_transform(swat_data.loc[:, categorical_col_names])
# one_hot_df_train = pd.DataFrame(one_hot_encoded_train.toarray(), columns=encoder.get_feature_names_out(categorical_col_names))
# df_train_encoded = pd.concat([swat_data, one_hot_df_train], axis=1)
# df_train_encoded = df_train_encoded.drop(columns=categorical_col_names, axis=1)
# df_train = df_train_encoded

# # Prepare features and target variable
# selected_features = ['AIT504', 'FIT501', 'FIT502', 'FIT503', 'FIT504', 'P501', 'P502', 'PIT501', 'PIT502', 'PIT503', 'FIT601', 'P601', 'P602', 'P603']
# X_train_data = df_train[selected_features]
# y_train_data = df_train['Normal/Attack_Normal']

# # Handle missing values
# X_train_data.replace([np.inf, -np.inf], np.nan, inplace=True)
# X_train_data.fillna(X_train_data.mean(), inplace=True)

# # Scale the features
# scaler = MinMaxScaler()
# X_scaled_train = scaler.fit_transform(X_train_data)

# # Train/test split
# X_train, X_test, y_train, y_test = train_test_split(X_scaled_train, y_train_data, test_size=0.2, random_state=42)

# # Train RandomForest model
# rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
# rf_classifier.fit(X_train, y_train)

# # Evaluate model
# y_pred = rf_classifier.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))

# # Save the trained model
# joblib.dump((rf_classifier, scaler, selected_features), 'random_forest_model.pkl')
