In [11]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


# Column names based on NSL-KDD documentation, listed in file order.
# 43 entries in total; the final two are the class label and the difficulty
# column, everything before them is a feature column.
col_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds", "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label", "difficulty",
]

# Read the train and test splits. The names are supplied from col_names and
# header=None states explicitly that the first line of each file is data.
df_train = pd.read_csv("data/KDDTrain+.txt", header=None, names=col_names)
df_test = pd.read_csv("data/KDDTest+.txt", header=None, names=col_names)

# Sanity check kept from the original author: it was added together with the
# 'difficulty' column name, after every row had been marked as an attack
# because the 43rd column was being read as the label.
raw_labels = df_train['label']
print("Sample labels:", raw_labels.unique()[:5])

# Quick look at the size, the first rows and the per-label row counts.
print("Training set shape:", df_train.shape)
print(df_train.head(10))
print("\nClass distribution (raw labels):")
print(raw_labels.value_counts())

# Work on copies so the raw frames stay untouched and this step can be re-run.
df_train_enc = df_train.copy()
df_test_enc = df_test.copy()

# Integer-encode the categorical *feature* columns (not the class label).
# Each encoder is fitted on the training split only and stored in `encoders`,
# keyed by column name.
# NOTE(review): integer codes impose an arbitrary order on nominal values and
# scikit-learn documents LabelEncoder as a target encoder; one-hot encoding may
# suit a linear model better - confirm before changing, it alters the features.
categorical_cols = ['protocol_type', 'service', 'flag']
encoders = {}

for col in categorical_cols:
    enc = LabelEncoder()
    df_train_enc[col] = enc.fit_transform(df_train[col])
    # Encode the test split with a single dictionary lookup instead of calling
    # enc.transform() once per row. A value's code is its position in
    # enc.classes_, exactly what transform() returns. Values not present in
    # the training classes come back as NaN from map() and are assigned -1.
    class_to_code = {category: code for code, category in enumerate(enc.classes_)}
    df_test_enc[col] = df_test[col].map(class_to_code).fillna(-1).astype('int64')
    encoders[col] = enc

# Confirm all categorical columns are numeric now:
print(df_train_enc[categorical_cols].dtypes)

# Binary target: 0 when the raw label is 'normal', 1 for every other label.
df_train_enc['attack'] = df_train_enc['label'].ne('normal').astype('int64')
df_test_enc['attack'] = df_test_enc['label'].ne('normal').astype('int64')


print(df_train_enc['attack'].value_counts())


# Min-max scale every numeric column except the 'attack' target so that all
# columns share a comparable range. The scaler is fitted on the training split
# only; the test split is transformed with those same fitted parameters.
# NOTE(review): the numeric selection also picks up 'difficulty'. Scaling it is
# harmless, but it is dataset metadata and should not be used as a model input.
features = df_train_enc.select_dtypes(include=['number']).columns.drop('attack')
scaler = MinMaxScaler()
scaler.fit(df_train_enc[features])

df_train_enc[features] = scaler.transform(df_train_enc[features])
df_test_enc[features] = scaler.transform(df_test_enc[features])

# The scaled training frame is shown before and after the status message.
print(df_train_enc.head())
print("Finished feature scaling")
print(df_train_enc.head())


# Implementing logistic regression (binary classifier): build the inputs.
#
# x holds the numeric feature columns, y the 0/1 'attack' target.
# 'difficulty' is dropped together with the target: it is numeric, so
# select_dtypes() would otherwise keep it as a feature. Per the NSL-KDD
# documentation it is derived from how well benchmark classifiers predicted
# each record's label, so it leaks target information and is not available
# for real traffic.
x = df_train_enc.select_dtypes(include=['number']).drop(columns=['attack', 'difficulty'])
y = df_train_enc['attack']

print("X shape:", x.shape)
print("Y shape:", y.shape)

# Train on 80%, validate on 20%, splitting data using scikit-learn.
# random_state seeds the shuffle so the split is reproducible.
# stratify=y keeps the class proportions of y in both parts; per the original
# author's note, without it the split sometimes took data only from the
# attack class (1).
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

print("Training samples:", x_train.shape)
print("Validation samples:", x_val.shape)

# `model` is a scikit-learn estimator object; fit() learns its parameters from
# the training part of the split. max_iter=1000 sets the solver's iteration cap.
model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)

print("Model training complete")

# Predict on the held-out validation part: 0 = normal, 1 = attack.
y_pred = model.predict(x_val)

# Compute the validation metrics first, then report them.
val_accuracy = accuracy_score(y_val, y_pred)
val_confusion = confusion_matrix(y_val, y_pred)
val_report = classification_report(y_val, y_pred)

print("Accuracy:", val_accuracy)
print("\nConfusion Matrix:")
print(val_confusion)
print("\nClassification Report:")
print(val_report)

# First-run results (recorded while the model was still given the difficulty column as a training feature):
#Accuracy: 0.976066679896805

#Confusion Matrix:
#[[13119   350]
# [  253 11473]]



Sample labels: ['normal' 'neptune' 'warezclient' 'ipsweep' 'portsweep']
Training set shape: (125973, 43)
   duration protocol_type     service flag  src_bytes  dst_bytes  land  \
0         0           tcp    ftp_data   SF        491          0     0   
1         0           udp       other   SF        146          0     0   
2         0           tcp     private   S0          0          0     0   
3         0           tcp        http   SF        232       8153     0   
4         0           tcp        http   SF        199        420     0   
5         0           tcp     private  REJ          0          0     0   
6         0           tcp     private   S0          0          0     0   
7         0           tcp     private   S0          0          0     0   
8         0           tcp  remote_job   S0          0          0     0   
9         0           tcp     private   S0          0          0     0   

   wrong_fragment  urgent  hot  ...  dst_host_same_srv_rate  \
0               0