In [None]:
import os
import requests

In [None]:
# Local directory that holds the downloaded dataset files.
DATA_DIR = "data"
# exist_ok=True creates the directory only when it is missing, without a
# separate check-then-create step that could race with another process.
os.makedirs(DATA_DIR, exist_ok=True)

# File name on disk -> URL the file is downloaded from.
URLS = {
    'KDDTrain+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTrain+.txt',
    'KDDTest+.txt': 'https://raw.githubusercontent.com/defcom17/NSL_KDD/master/KDDTest+.txt',
}

def download_file(url, filename, timeout=30):
    """Download ``url`` into ``DATA_DIR/filename`` unless that file already exists.

    The body is streamed into a temporary ``<filename>.part`` file and only
    renamed to its final name once the whole download succeeded. A failed or
    interrupted download therefore never leaves a truncated file behind that
    a later run would mistake for a complete one.

    Args:
        url: Address to fetch.
        filename: Name of the file to create inside ``DATA_DIR``.
        timeout: Seconds to wait for the server before giving up, passed to
            ``requests.get``.

    Returns:
        None. Progress and failures are reported with ``print``; errors are
        not re-raised.
    """
    local_path = os.path.join(DATA_DIR, filename)

    if os.path.exists(local_path):
        print(f"Correct file found: {filename} already exists")
        return
    print(f"Downloading {filename}...")

    partial_path = local_path + ".part"
    try:
        # The context manager releases the connection even if writing fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8000):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Publish the file under its final name only after a complete download.
        os.replace(partial_path, local_path)
        print(f"Success saved to {local_path}")
    except Exception as e:
        # Remove the incomplete temporary file so the next run starts clean.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        print(f"Failed to download {filename} :{e}")
        
                    

In [None]:
# Fetch every file listed in URLS; download_file skips files already on disk.
for filename in URLS:
    url = URLS[filename]
    download_file(url, filename)
print("\nAll files should now be in 'data' folder.")

In [None]:
!pip install pandas

In [None]:
import pandas as pd
import os

# Define the path to your data
DATA_DIR = 'data'
TRAIN_PATH = os.path.join(DATA_DIR, 'KDDTrain+.txt')

# Column names applied to the NSL-KDD files: 43 names in file order, ending
# with the "class" label and "difficulty_level".
COLUMNS = [
    "duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
    "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "class", "difficulty_level"
]

# Read the dataset
# header=None tells pandas there is no header row in the file
# names=COLUMNS assigns the column names we defined above
df = pd.read_csv(TRAIN_PATH, header=None, names=COLUMNS)

# Display the first few rows (head() instead of the whole frame)
df.head()

In [None]:
# Locate both dataset splits inside the data directory.
train_path, test_path = (
    os.path.join(DATA_DIR, split_file)
    for split_file in ('KDDTrain+.txt', 'KDDTest+.txt')
)

# Load each split with our column names. header=None is spelled out here;
# pandas already treats the file as header-less whenever names= is given.
train_df, test_df = (
    pd.read_csv(split_path, header=None, names=COLUMNS)
    for split_path in (train_path, test_path)
)

print("Data Loaded Successfully!")

In [None]:
# A cell renders only its last bare expression, so each preview is passed to
# display() explicitly; otherwise the first two would be silently discarded.
display(train_df.head())  # first 5 rows of the training data
display(train_df.tail())  # last 5 rows of the training data

display(test_df.tail())   # last 5 rows of the test data

In [None]:
# Summary statistics of the training frame (describe() covers numeric columns by default).
train_df.describe()

In [None]:
# Print a section banner, then the training frame's column dtypes and non-null counts.
print("\n---Data Info---")
train_df.info()

In [None]:
# isnull().sum() gives one count per column; the second .sum() collapses those
# into a single total, so both lines report one number.
print(f"Missing Values in the Training Data: {train_df.isnull().sum().sum()}")
print(f"Missing Values in the Testing Data: {test_df.isnull().sum().sum()}")

In [None]:
# duplicated() flags each repeated row with True; .sum() turns those flags into
# a count, so both lines report one number.
print(f"Duplicate data in the training dataset {train_df.duplicated().sum()}")
print(f"Duplicate data in the test dataset {test_df.duplicated().sum()}")

In [None]:
# Inspect the 'class' column of the training frame.
train_df['class']

In [None]:
!pip install matplotlib seaborn


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Distinct values of the 'class' column, in order of first appearance.
unique_labels = train_df['class'].unique()
print(f"There are {len(unique_labels)} unique labels including 'normal'.")
print(unique_labels)

In [None]:
# Number of training records per 'class' value, most frequent first.
train_df['class'].value_counts()