In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Locations of the UNSW-NB15 training and testing CSV splits
train_path = 'datasets/UNSW_NB15_training-set.csv'
test_path = 'datasets/UNSW_NB15_testing-set.csv'

# Read both splits into DataFrames (training first, then testing)
train_df, test_df = pd.read_csv(train_path), pd.read_csv(test_path)

# Dataset overview: dimensions and column names of the training split
print("Dataset Shape:", train_df.shape)
print("\nDataset Columns:", train_df.columns)

# Data types and null values.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("\nData Types and Null Values:")
train_df.info()

# Class distribution (attack vs normal)
print("\nAttack Label Distribution:")
print(train_df['label'].value_counts())

# Bar chart of how many training rows carry each value of `label`
plt.figure(figsize=(6, 4))
# countplot draws on the current axes and returns it, so the title and axis
# labels are applied directly to that Axes object.
sns.countplot(data=train_df, x='label').set(
    title='Distribution of Normal vs Attack Labels',
    xlabel='Label (0: Normal, 1: Attack)',
    ylabel='Count',
)
plt.show()

# Horizontal bar chart of `attack_cat` frequencies in the training split;
# bars follow value_counts() order, i.e. most frequent category first.
plt.figure(figsize=(12, 5))
sns.countplot(
    data=train_df,
    y='attack_cat',
    order=train_df['attack_cat'].value_counts().index,
).set(
    title='Distribution of Attack Categories',
    xlabel='Count',
    ylabel='Attack Category',
)
plt.show()

# Heatmap of pairwise correlations between the numeric columns of the
# training split (non-numeric columns are dropped by select_dtypes).
numeric_features = train_df.select_dtypes(include=[np.number])

plt.figure(figsize=(15, 12))
# heatmap returns the Axes it drew the matrix on; the title is set there.
sns.heatmap(
    numeric_features.corr(),
    cmap='coolwarm',
    linewidths=0.1,
).set_title('Correlation Heatmap of Numeric Features')
plt.show()