In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE
import joblib

# 1. Load dataset
data = pd.read_csv("web_attacks_balanced.csv")

# 2. Define the target column and selected features
target_column = "Label"
# Arbitrarily selected 4 features for demonstration.
# You should replace these with features identified through proper feature selection.
selected_features = [
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Total Length of Fwd Packets',
]

# Fail fast, before any training work, if the CSV lacks a required column.
if target_column not in data.columns:
    raise ValueError(f"Target column '{target_column}' not found in dataset.")

# Collect every missing feature so a single run reports all of them, instead
# of stopping at the first one and forcing a fix-and-rerun cycle per column.
missing_features = [
    feature for feature in selected_features if feature not in data.columns
]
if missing_features:
    raise ValueError(
        f"Selected feature(s) {missing_features} not found in dataset."
    )

# 3. Prepare data with only selected features
X = data[selected_features]
y = data[target_column]

# 4. Split train-test (80% train / 20% test, fixed seed for reproducibility)
# Stratify on the label so both splits keep the class proportions of the full
# dataset; without it a rare class can be under-represented or missing in one
# split. Stratification needs at least two rows per class, so fall back to a
# plain random split when that does not hold rather than raising.
stratify_labels = y if y.value_counts().min() >= 2 else None
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

# 5. Rebalance the training split with SMOTE
# Only the training data is resampled; the test split is left untouched.
# NOTE: SMOTE requires numerical inputs, so the selected features must be numeric.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X=X_train, y=y_train)

# 6. Fit a random forest on the resampled training data
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)

# 7. Persist the fitted model to disk
# The file name marks this as the 4-feature variant of the model.
joblib.dump(
    model,
    'random_forest_model_4_features.joblib',
)
print("Model saved as 'random_forest_model_4_features.joblib'")

# 8. Score the model on the held-out test split
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("\nAccuracy:", accuracy)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


: 