In [1]:
# ✅ STEP 1: Upload your .csv file
from google.colab import files

# Print the sample file's Colab path, then open the upload widget.
expected_csv_path = "/content/sample_network_traffic.csv"
print(expected_csv_path)

# `uploaded` is keyed by file name; STEP 2 looks up the first key.
uploaded = files.upload()


/content/sample_network_traffic.csv


Saving sample_network_traffic.csv to sample_network_traffic (1).csv


In [3]:
# ✅ STEP 2: Load the uploaded CSV into a DataFrame
import pandas as pd

# Guard against an empty upload (e.g. the dialog was dismissed without picking
# a file): fail with a clear message instead of an IndexError on the lookup.
if not uploaded:
    raise ValueError("No file was uploaded. Re-run STEP 1 and choose a .csv file.")

# Automatically get the uploaded file name (the first key of the upload mapping)
file_name = next(iter(uploaded))

# Read the CSV file
df = pd.read_csv(file_name)

# Show the shape and column names of the data
print("✅ File loaded successfully!")
print("📄 Shape of data (rows, columns):", df.shape)
print("📌 Column names:", df.columns.tolist())

# Display the top 5 rows
df.head()


✅ File loaded successfully!
📄 Shape of data (rows, columns): (5, 7)
📌 Column names: ['Timestamp', 'Source_IP', 'Destination_IP', 'Protocol', 'Packet_Size', 'Flag', 'Label']


Unnamed: 0,Timestamp,Source_IP,Destination_IP,Protocol,Packet_Size,Flag,Label
0,2025-07-11 10:00:00,192.168.1.2,10.0.0.5,TCP,512,SYN,Normal
1,2025-07-11 10:00:01,192.168.1.2,10.0.0.7,UDP,128,-,Normal
2,2025-07-11 10:00:03,172.16.0.3,10.0.0.9,TCP,1024,ACK,Malicious
3,2025-07-11 10:00:05,10.0.0.4,192.168.1.3,ICMP,64,-,Normal
4,2025-07-11 10:00:07,192.168.1.4,10.0.0.8,TCP,256,FIN,Malicious


In [4]:
# ✅ STEP 3: Train AI model to classify traffic types
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Separate features (X) and label (y)
# Exclude non-numeric columns before scaling and training
X = df.drop(columns=['Label', 'Timestamp', 'Source_IP', 'Destination_IP', 'Flag', 'Protocol'])
y = df['Label']                 # the target column

# Split into training and testing sets (80% train, 20% test) BEFORE scaling,
# so the scaler never sees the held-out rows.
# No stratify=y because the dataset is too small.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale (normalize) the features: fit on the training rows only, then apply the
# same transformation to the test rows. Fitting on the full dataset would leak
# test-set statistics into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Kept so that any code referring to `X_scaled` still works; it is the full
# feature matrix transformed with the train-fitted scaler.
X_scaled = scaler.transform(X)

# Train a RandomForest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on test set
y_pred = clf.predict(X_test)

# Show classification performance
print("\n✅ --- Traffic Classification Report ---")
print(classification_report(y_test, y_pred))


✅ --- Traffic Classification Report ---
              precision    recall  f1-score   support

      Normal       1.00      1.00      1.00         1

    accuracy                           1.00         1
   macro avg       1.00      1.00      1.00         1
weighted avg       1.00      1.00      1.00         1



In [None]:
# ✅ STEP 4: Detect anomalies in network traffic
from sklearn.ensemble import IsolationForest
import pandas as pd

# Train on clean traffic only when it can be identified. The uploaded sample
# labels clean rows 'Normal' while the simulated data in STEP 6 uses 'Benign',
# so both spellings are accepted. Checking only 'Benign' would always fall
# through to the fallback and train the detector on malicious rows as well.
benign_labels = ['Benign', 'Normal']
benign_mask = y_train.isin(benign_labels).to_numpy()

if benign_mask.any():
    X_benign = X_train[benign_mask]
else:
    X_benign = X_train  # fallback if no clean-traffic label is available

# Train the Isolation Forest model
anomaly_model = IsolationForest(contamination=0.01, random_state=42)
anomaly_model.fit(X_benign)

# Predict on test data
y_anomaly_pred = anomaly_model.predict(X_test)

# Convert predictions: -1 = Anomaly, 1 = Normal
y_anomaly_label = ['Anomaly' if val == -1 else 'Normal' for val in y_anomaly_pred]

# Show a sample of results (first 10 test rows at most)
print("\n🔍 --- Sample Anomaly Detection Output ---")
sample = pd.DataFrame({'Actual': y_test.values[:10], 'Predicted': y_anomaly_label[:10]})
print(sample)



🔍 --- Sample Anomaly Detection Output ---
   Actual Predicted
0  Normal    Normal


In [None]:
# ✅ STEP 5: Optimize model using Grid Search
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import time

# Define possible model settings (hyperparameters)
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5]
}

# Create the Grid Search object
grid = GridSearchCV(RandomForestClassifier(random_state=42),
                    param_grid,
                    cv=2,  # Reduced to 2-fold cross-validation due to small dataset size
                    scoring='f1_macro',
                    verbose=1)

# Measure training time with a monotonic clock; time.time() follows the wall
# clock and can jump if the system time is adjusted mid-run.
start = time.perf_counter()
grid.fit(X_train, y_train)
end = time.perf_counter()

# Show results
print(f"\n⏱️ Grid Search Training Time: {end - start:.2f} seconds")
print("✅ Best Parameters Found:", grid.best_params_)

# Use the best model to make predictions
best_model = grid.best_estimator_
y_best_pred = best_model.predict(X_test)

# Show updated performance report.
# zero_division=0 keeps the reported numbers the same (undefined metrics are
# shown as 0) but stops the flood of UndefinedMetricWarning messages that
# appears when a class has no true or no predicted samples in the test set.
print("\n📊 Optimized Classification Report:")
print(classification_report(y_test, y_best_pred, zero_division=0))

Fitting 2 folds for each of 8 candidates, totalling 16 fits

⏱️ Grid Search Training Time: 1.81 seconds
✅ Best Parameters Found: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 50}

📊 Optimized Classification Report:
              precision    recall  f1-score   support

   Malicious       0.00      0.00      0.00       0.0
      Normal       0.00      0.00      0.00       1.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# ✅ STEP 6: Simulate privacy-preserving traffic data
import numpy as np

# Seed the global NumPy generator so the simulated flows are reproducible.
np.random.seed(42)

# Draw every column up front. The draws happen in the same order as the
# columns appear in the frame, because that order determines the values.
n_flows = 1000
flow_duration = np.random.normal(500, 150, n_flows)
packet_count = np.random.randint(5, 50, n_flows)
avg_packet_size = np.random.normal(300, 50, n_flows)
tcp_flag_count = np.random.randint(0, 5, n_flows)
flow_label = np.random.choice(['Benign', 'Suspicious'], size=n_flows, p=[0.8, 0.2])

# Assemble the flow-level metadata table.
privacy_df = pd.DataFrame({
    'FlowDuration': flow_duration,
    'PacketCount': packet_count,
    'AvgPacketSize': avg_packet_size,
    'TCP_Flag_Count': tcp_flag_count,
    'Label': flow_label,
})

privacy_df.head()


Unnamed: 0,FlowDuration,PacketCount,AvgPacketSize,TCP_Flag_Count,Label
0,574.507123,48,251.181236,0,Benign
1,479.260355,44,255.377208,0,Benign
2,597.153281,15,317.751123,2,Benign
3,728.454478,7,288.944345,3,Benign
4,464.876994,10,257.985928,2,Benign


In [None]:
# Train and evaluate a classifier on the simulated flow-level metadata.
from sklearn.preprocessing import StandardScaler

# Split metadata into features and label
X_priv = privacy_df.drop(columns=['Label'])
y_priv = privacy_df['Label']

# Train-test split (stratified so both sets keep the same label proportions)
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    X_priv, y_priv, test_size=0.2, stratify=y_priv, random_state=42
)

# Scale the features with a scaler dedicated to this dataset. Refitting the
# shared `scaler` from STEP 3 would silently overwrite its fitted state, so any
# later use of it on the uploaded traffic data would apply the wrong statistics.
scaler_priv = StandardScaler()
Xp_train_scaled = scaler_priv.fit_transform(Xp_train)
Xp_test_scaled = scaler_priv.transform(Xp_test)

# Train RandomForest classifier
clf_priv = RandomForestClassifier(random_state=42)
clf_priv.fit(Xp_train_scaled, yp_train)

# Predict and report
# NOTE: the simulated labels are drawn independently of the feature columns,
# so this report is not expected to show real predictive skill.
yp_pred = clf_priv.predict(Xp_test_scaled)
print("\n🔒 Privacy-Preserving Model Report:")
print(classification_report(yp_test, yp_pred))



🔒 Privacy-Preserving Model Report:
              precision    recall  f1-score   support

      Benign       0.81      0.98      0.88       162
  Suspicious       0.00      0.00      0.00        38

    accuracy                           0.79       200
   macro avg       0.40      0.49      0.44       200
weighted avg       0.65      0.79      0.71       200

