In [1]:
# Basic Data Handling
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Date & Time Handling
from datetime import datetime

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score

# Machine Learning Models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor


# Evaluation Metrics
from sklearn.metrics import mean_squared_error, r2_score



In [2]:
# Load the dataset
df = pd.read_csv("C:\\Users\\T L S\\Desktop\\BAKSH\\cybersecurity_intrusion_data.csv")  # Replace with your actual file name

# Display the first 5 rows
print("First 5 Rows:")
print(df.head())

# Get a quick statistical summary of numeric columns
print("\nStatistical Summary:")
print(df.describe())

# Optional: Check data types and missing values
print("\nInfo:")
print(df.info())


First 5 Rows:
  session_id  network_packet_size protocol_type  login_attempts  \
0  SID_00001                  599           TCP               4   
1  SID_00002                  472           TCP               3   
2  SID_00003                  629           TCP               3   
3  SID_00004                  804           UDP               4   
4  SID_00005                  453           TCP               5   

   session_duration encryption_used  ip_reputation_score  failed_logins  \
0        492.983263             DES             0.606818              1   
1       1557.996461             DES             0.301569              0   
2         75.044262             DES             0.739164              2   
3        601.248835             DES             0.123267              0   
4        532.540888             AES             0.054874              1   

  browser_type  unusual_time_access  attack_detected  
0         Edge                    0                1  
1      Firefox        

In [3]:
# Check for null values in each column
print("Null Values Per Column:")
print(df.isnull().sum())

Null Values Per Column:
session_id                0
network_packet_size       0
protocol_type             0
login_attempts            0
session_duration          0
encryption_used        1966
ip_reputation_score       0
failed_logins             0
browser_type              0
unusual_time_access       0
attack_detected           0
dtype: int64


In [4]:
# Drop unwanted columns
df.drop(['session_id'], axis=1)

# Confirm they are removed
print("Remaining Columns:")
print(df.columns)


Remaining Columns:
Index(['session_id', 'network_packet_size', 'protocol_type', 'login_attempts',
       'session_duration', 'encryption_used', 'ip_reputation_score',
       'failed_logins', 'browser_type', 'unusual_time_access',
       'attack_detected'],
      dtype='object')


In [8]:
from sklearn.preprocessing import LabelEncoder

# List of categorical columns you want to encode
categorical_cols = ['protocol_type', 'encryption_used', 'browser_type']

# Initialize the label encoder
le = LabelEncoder()

# Apply label encoding to each column
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])


In [9]:
from sklearn.preprocessing import MinMaxScaler
# Step 2: Apply MinMaxScaler only to numerical columns
numerical_cols = ['network_packet_size', 'login_attempts', 'session_duration', 
                  'ip_reputation_score', 'failed_logins', 'unusual_time_access']

scaler = MinMaxScaler()
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])

In [13]:
from sklearn.model_selection import train_test_split

# Step 1: Select X (features) and y (target)
X = df.drop(['session_id', 'attack_detected'], axis=1)  # Drop session_id and target
y = df['attack_detected']                                # Target variable

# (Now you would apply LabelEncoder and MinMaxScaler here if needed, as we discussed earlier)

# Step 2: Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Initialize the model
model = RandomForestClassifier(random_state=42)

# Step 2: Train (fit) the model
model.fit(X_train, y_train)

# Step 3: Predict on the test set
y_pred = model.predict(X_test)

# Step 4: Evaluate
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 5: Print Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


Confusion Matrix:
[[1050    5]
 [ 217  636]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      1.00      0.90      1055
           1       0.99      0.75      0.85       853

    accuracy                           0.88      1908
   macro avg       0.91      0.87      0.88      1908
weighted avg       0.90      0.88      0.88      1908


Accuracy: 88.36%


In [23]:
import joblib

# Save the trained model
joblib.dump(model, 'intrusion.pkl')

print("✅ Model saved successfully as 'random_forest_model.pkl'")


✅ Model saved successfully as 'random_forest_model.pkl'


In [17]:
import xgboost as xgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Initialize XGBoost model
model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: Predict
y_pred = model.predict(X_test)

# Step 4: Evaluate
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"\nAccuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[[1036   19]
 [ 211  642]]
              precision    recall  f1-score   support

           0       0.83      0.98      0.90      1055
           1       0.97      0.75      0.85       853

    accuracy                           0.88      1908
   macro avg       0.90      0.87      0.87      1908
weighted avg       0.89      0.88      0.88      1908


Accuracy: 87.95%


In [18]:
from catboost import CatBoostClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Initialize CatBoost model
model = CatBoostClassifier(
    iterations=1000,         # number of boosting rounds
    learning_rate=0.05,      # learning rate
    depth=6,                 # depth of trees
    loss_function='Logloss', # for binary classification
    eval_metric='Accuracy',  # evaluation metric
    random_seed=42,
    verbose=100              # print updates every 100 iterations
)

# Step 2: Train model
model.fit(X_train, y_train)

# Step 3: Predict
y_pred = model.predict(X_test)

# Step 4: Evaluate
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 5: Print Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


0:	learn: 0.8759995	total: 165ms	remaining: 2m 44s
100:	learn: 0.8974964	total: 1.98s	remaining: 17.6s
200:	learn: 0.8982829	total: 2.92s	remaining: 11.6s
300:	learn: 0.9016909	total: 3.86s	remaining: 8.96s
400:	learn: 0.9077205	total: 4.85s	remaining: 7.25s
500:	learn: 0.9124394	total: 5.8s	remaining: 5.77s
600:	learn: 0.9186001	total: 6.71s	remaining: 4.45s
700:	learn: 0.9234500	total: 8.09s	remaining: 3.45s
800:	learn: 0.9296107	total: 8.92s	remaining: 2.21s
900:	learn: 0.9353782	total: 10.3s	remaining: 1.13s
999:	learn: 0.9398348	total: 11.2s	remaining: 0us
Confusion Matrix:
[[1042   13]
 [ 212  641]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.99      0.90      1055
           1       0.98      0.75      0.85       853

    accuracy                           0.88      1908
   macro avg       0.91      0.87      0.88      1908
weighted avg       0.90      0.88      0.88      1908


Accuracy: 88.21%


In [19]:
import lightgbm as lgb
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Step 1: Initialize LightGBM model
model = lgb.LGBMClassifier(
    n_estimators=1000,        # number of boosting rounds
    learning_rate=0.05,       # learning rate
    max_depth=6,              # maximum tree depth
    random_state=42
)

# Step 2: Train the model
model.fit(X_train, y_train)

# Step 3: Predict
y_pred = model.predict(X_test)

# Step 4: Evaluate
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Step 5: Print Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy * 100:.2f}%")


[LightGBM] [Info] Number of positive: 3411, number of negative: 4218
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 797
[LightGBM] [Info] Number of data points in the train set: 7629, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.447110 -> initscore=-0.212356
[LightGBM] [Info] Start training from score -0.212356
Confusion Matrix:
[[1024   31]
 [ 211  642]]

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1055
           1       0.95      0.75      0.84       853

    accuracy                           0.87      1908
   macro avg       0.89      0.86      0.87      1908
weighted avg       0.88      0.87      0.87      1908


Accuracy: 87.32%
