In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Step 1: Load dataset
# NOTE(review): absolute Colab-style path; the cell only runs where this file exists.
file_path = r"/content/combined_dataset (1).csv"
data = pd.read_csv(file_path)


# Step 2: Identify target column (taken to be the last column of the CSV)
target_column = data.columns[-1]

# Step 3: Preprocessing
# Drop every row that contains a NaN and continue on an independent copy.
data_cleaned = data.dropna().copy()

# Encode target variable: cast the labels to str, then let LabelEncoder map
# each distinct string to an integer code.
label_encoder = LabelEncoder()
labels_as_text = data_cleaned[target_column].astype(str)
data_cleaned.loc[:, target_column] = labels_as_text
data_cleaned.loc[:, target_column] = label_encoder.fit_transform(labels_as_text)

# Remove duplicate rows (features and encoded label are compared together)
data_cleaned = data_cleaned.drop_duplicates()

# Separate features and target
y = data_cleaned[target_column]
X = data_cleaned.drop(columns=[target_column])

# Ensure numeric conversion, one step at a time:
#   1. values that cannot be parsed as numbers become NaN,
#   2. +/-inf become NaN,
#   3. every NaN produced above is filled with 0.
X = X.apply(pd.to_numeric, errors='coerce')
X = X.replace([np.inf, -np.inf], np.nan)
X = X.fillna(0)

# Step 4: Train-Test Split
# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Remove training rows whose feature values also appear in the test set, so
# the model is never evaluated on a sample it was trained on.
# index=False is required: by default hash_pandas_object mixes the index label
# into each row hash, and train/test index labels are disjoint after the
# split, so identical feature rows would never compare equal.
train_hashes = pd.util.hash_pandas_object(X_train, index=False).values
test_hashes = pd.util.hash_pandas_object(X_test, index=False).values
duplicates_in_test = np.isin(test_hashes, train_hashes).sum()
if duplicates_in_test > 0:
    # Positional boolean mask over the training rows; X_train and y_train come
    # from the same split and are therefore row-aligned.
    keep_train_rows = ~np.isin(train_hashes, test_hashes)
    X_train, y_train = X_train.loc[keep_train_rows], y_train.loc[keep_train_rows]

# Fix label types: make both label vectors plain integers before modelling.
y_train, y_test = y_train.astype(int), y_test.astype(int)

# Standardize the features: fit the scaler on the training rows only, then
# apply that same fitted transform to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled_all = scaler.transform(X_train)
X_test_scaled_all = scaler.transform(X_test)

# Step 5: Identify Top 10 Predictive Features
# Fit a single decision tree on the full standardized training set and rank
# the columns by the tree's feature importances, highest first.
clf_tree = DecisionTreeClassifier(random_state=42).fit(X_train_scaled_all, y_train)
importances = pd.Series(clf_tree.feature_importances_, index=X.columns)
importances = importances.sort_values(ascending=False)

# Use top 10 features
top_features = importances.index[:10]
print("Top 10 most predictive features:\n", top_features)

# Filter data to top 10 features (applied identically to train and test)
X_train, X_test = X_train[top_features], X_test[top_features]

# Re-standardize for reduced feature set
# A fresh scaler is fitted on the training rows of the selected columns and
# then applied unchanged to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Add slight Gaussian noise (mean 0, std 0.01) to the training features only;
# the test features are left untouched.
# The noise is drawn from a dedicated seeded generator rather than the global
# np.random state, so the models fitted below and their metrics are identical
# on every Restart & Run All.
rng = np.random.default_rng(42)
X_train_scaled += rng.normal(0, 0.01, X_train_scaled.shape)

# Step 6: Train RandomForest Model
# 50 trees, depth capped at 6, fixed seed; fit on the training features and
# predict the held-out test rows in one chained call (fit returns the model).
clf = RandomForestClassifier(n_estimators=50, max_depth=6, random_state=42)
y_pred = clf.fit(X_train_scaled, y_train).predict(X_test_scaled)

# Step 7: Evaluate RandomForest Model
# Compute all three summaries first, then print them in the original order.
accuracy = accuracy_score(y_test, y_pred)
rf_confusion = confusion_matrix(y_test, y_pred)
rf_report = classification_report(y_test, y_pred)
print(f"Accuracy : {accuracy * 100:.2f}%")
print("Confusion Matrix:\n", rf_confusion)
print("Classification Report:\n", rf_report)

# Step 8: Try Logistic Regression
# Baseline linear model trained and scored on the same features as the forest.
# Note the accuracy is printed as a fraction here, not as a percentage.
clf_lr = LogisticRegression(max_iter=500)
y_pred_lr = clf_lr.fit(X_train_scaled, y_train).predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy :", lr_accuracy)

Top 10 most predictive features:
 Index([' Bwd Packet Length Std', ' Subflow Bwd Packets', ' act_data_pkt_fwd',
       ' Destination Port', ' Avg Fwd Segment Size', ' Idle Min',
       ' Average Packet Size', 'Flow Bytes/s', 'Subflow Fwd Packets',
       ' Flow IAT Mean'],
      dtype='object')
Accuracy : 99.86%
Confusion Matrix:
 [[4457    4]
 [   6 2561]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      4461
           1       1.00      1.00      1.00      2567

    accuracy                           1.00      7028
   macro avg       1.00      1.00      1.00      7028
weighted avg       1.00      1.00      1.00      7028

Logistic Regression Accuracy : 0.9970119521912351


In [8]:
# Preview the first rows of `data`, the raw frame as read from the CSV
# (before any cleaning or encoding).
data.head()

Unnamed: 0,Destination Port,Flow Duration,Total Fwd Packets,Total Backward Packets,Total Length of Fwd Packets,Total Length of Bwd Packets,Fwd Packet Length Max,Fwd Packet Length Min,Fwd Packet Length Mean,Fwd Packet Length Std,...,min_seg_size_forward,Active Mean,Active Std,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,Label
0,54865,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
1,55054,109,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
2,55055,52,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
3,46236,34,1,1,6,6,6,6,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN
4,54863,3,2,0,12,0,6,6,6.0,0.0,...,20,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BENIGN


In [9]:
# Summarize the raw frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35497 entries, 0 to 35496
Data columns (total 79 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0    Destination Port             35497 non-null  int64  
 1    Flow Duration                35497 non-null  int64  
 2    Total Fwd Packets            35497 non-null  int64  
 3    Total Backward Packets       35497 non-null  int64  
 4   Total Length of Fwd Packets   35497 non-null  int64  
 5    Total Length of Bwd Packets  35497 non-null  int64  
 6    Fwd Packet Length Max        35497 non-null  int64  
 7    Fwd Packet Length Min        35497 non-null  int64  
 8    Fwd Packet Length Mean       35497 non-null  float64
 9    Fwd Packet Length Std        35497 non-null  float64
 10  Bwd Packet Length Max         35497 non-null  int64  
 11   Bwd Packet Length Min        35497 non-null  int64  
 12   Bwd Packet Length Mean       35497 non-null  float64
 13   