# Automated Threat Detection using Machine Learning

## Project Overview
This project demonstrates a complete machine learning workflow for building a robust Network Intrusion Detection System (NIDS). The system is designed to classify network traffic as either normal or one of several specific cyberattack categories (e.g., DoS, Probe, R2L, U2R). This notebook is prepared for public sharing on GitHub and has had all personal credentials removed.

### Step 1: Install and Import Libraries

In [None]:
# Install necessary libraries
!pip install -U scikit-learn pandas numpy matplotlib seaborn

# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn imports for preprocessing, modeling, and evaluation
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Set visualization style
sns.set_style('whitegrid')

### Step 2: Load Local Datasets

This cell loads the data from local CSV files. To run this notebook, ensure `Train_data.csv` and `Test_data.csv` are in the same directory as the notebook.

In [None]:
# Load the training and test sets from CSV files in the working directory.
try:
    df_train_raw = pd.read_csv('Train_data.csv')
    df_test_raw = pd.read_csv('Test_data.csv')
except FileNotFoundError:
    print("Error: Make sure 'Train_data.csv' and 'Test_data.csv' are in the same folder as the notebook.")
    # Re-raise so the cell stops here with the real cause, instead of
    # failing below with a NameError on the undefined DataFrame.
    raise
else:
    # Only report success when both files were actually read.
    print("Training and Test data loaded successfully!")

# Preview the first rows of the training data
df_train_raw.head()

### Step 3: Exploratory Data Analysis (EDA)

In [None]:
# Print a structural summary of the training frame
print("Training Data Info:")
df_train_raw.info()

# Count missing cells per column, then report the grand total
missing_per_column = df_train_raw.isnull().sum()
print(f"\nMissing values in training data: {missing_per_column.sum()}")

#### Creating Multiclass Labels
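For demonstration, we simulate multiclass labels: every row whose binary `class` is 'anomaly' is randomly assigned one of four attack categories (DoS, Probe, R2L, U2R) with fixed probabilities. Because this assignment is independent of the traffic features, the per-category metrics reported below illustrate the workflow only and do not measure real attack-type detection performance.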

In [None]:
# Lookup from integer code to category label; code 0 is the non-attack label
attack_map = dict(enumerate(['Normal', 'DoS', 'Probe', 'R2L', 'U2R']))

# Index labels of every row flagged as an anomaly in the binary 'class' column
anomaly_indices = df_train_raw.index[df_train_raw['class'] == 'anomaly']

# Draw one attack code per anomalous row; the fixed seed keeps the draw reproducible
np.random.seed(42)
simulated_attacks = np.random.choice(
    [1, 2, 3, 4],
    size=len(anomaly_indices),
    p=[0.75, 0.15, 0.08, 0.02],
)

# Start with every row labelled 'Normal', then overwrite the anomalous rows
df_train_raw['attack_category'] = attack_map[0]
df_train_raw.loc[anomaly_indices, 'attack_category'] = list(map(attack_map.get, simulated_attacks))

# Horizontal bar chart of category counts, most frequent category first
category_order = df_train_raw['attack_category'].value_counts().index
plt.figure(figsize=(10, 7))
ax = sns.countplot(y='attack_category', data=df_train_raw, order=category_order)
ax.set_title('Distribution of Network Traffic by Attack Category')
ax.set_xlabel('Count')
ax.set_ylabel('Attack Category')
plt.show()

# The binary 'class' column is superseded by 'attack_category'; remove it
df_train = df_train_raw.drop(columns='class')

### Step 4: Data Preprocessing

In [None]:
# Separate features (X) and target (y)
X = df_train.drop('attack_category', axis=1)
y = df_train['attack_category']

# Identify numerical and categorical features.
# Categorical features are taken as "everything that is not numeric" rather
# than only object-dtype columns: ColumnTransformer drops unlisted columns by
# default (remainder='drop'), so a bool/category/string-dtype column would
# otherwise be silently excluded from the model.
numerical_features = X.select_dtypes(include=np.number).columns
categorical_features = X.select_dtypes(exclude=np.number).columns

# Create the preprocessing pipelines
numerical_transformer = StandardScaler()
# handle_unknown='ignore' lets the encoder accept categories at predict time
# that were not present in the data it was fitted on
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Create a column transformer to apply different transformations.
# The order here ('num' then 'cat') determines the output column order.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features),
    ])

### Step 5: Model Training & Evaluation

In [None]:
# Hold out 20% of the rows for validation, stratified on the target labels
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Chain preprocessing and the classifier so both are fitted on the training split only
classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit the full pipeline
print("Training the Random Forest model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete!")

# --- Model Evaluation ---
print("\n--- Model Evaluation Report ---")
y_pred = model_pipeline.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {val_accuracy:.4f}\n")
print("Classification Report:")
val_report = classification_report(y_val, y_pred)
print(val_report)

### Step 6: Feature Importance Analysis

In [None]:
# Rank the model's input features by Random Forest importance and plot the top 20.
# Any failure is reported as a message rather than stopping the notebook.
try:
    # Column names produced by the fitted one-hot encoder
    fitted_encoder = model_pipeline.named_steps['preprocessor'].named_transformers_['cat']
    ohe_feature_names = fitted_encoder.get_feature_names_out(categorical_features)

    # Numeric names first, then one-hot names, in the same order as the
    # 'num' and 'cat' entries of the column transformer
    all_feature_names = np.concatenate([numerical_features, ohe_feature_names])

    fitted_forest = model_pipeline.named_steps['classifier']
    importance_table = {
        'Feature': all_feature_names,
        'Importance': fitted_forest.feature_importances_,
    }
    importance_df = pd.DataFrame(importance_table).sort_values(by='Importance', ascending=False)

    # Plot the top 20 most important features
    top_features = importance_df.head(20)
    plt.figure(figsize=(12, 8))
    sns.barplot(x='Importance', y='Feature', data=top_features)
    plt.title('Top 20 Most Important Features for Intrusion Detection')
    plt.show()
except Exception as e:
    print(f"Could not plot feature importances: {e}")