In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
import joblib

# Read the three source CSV files; they are bound, in order, to df1, df2 and df3.
df1, df2, df3 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/Users/sarahsteadham/Downloads/CyberProject/UNSW2018IOTBOTNET.csv",
        "/Users/sarahsteadham/Downloads/CyberProject/UNSWNB15.csv",
        "/Users/sarahsteadham/Downloads/CyberProject/train_test_network.csv",
    )
)

# Standardize column names: every dataset should expose the target as 'label'
# and the attack category as 'type'.
df1.rename(columns={'attack': 'label', 'category': 'type'}, inplace=True)
df2.rename(columns={'attack_cat': 'type'}, inplace=True)
# df3 is left untouched; it is expected to already use 'label' and 'type'
# (TODO confirm against the CSV header).

# Show the column set of each dataset after standardization
for dataset_number, dataset in enumerate((df1, df2, df3), start=1):
    print(f"Dataset {dataset_number} Columns after renaming:", dataset.columns)

# Show the class distribution of the target in each dataset
# (raises KeyError if a dataset has no 'label' column)
for dataset_number, dataset in enumerate((df1, df2, df3), start=1):
    print(f"Dataset {dataset_number} 'label' column values:\n", dataset['label'].value_counts())

# Combine datasets row-wise into a single frame with a fresh 0..n-1 index
df_combined = pd.concat([df1, df2, df3], ignore_index=True)

# Fail fast if nothing was loaded
if df_combined.empty:
    raise ValueError("The combined DataFrame is empty. Please check the input datasets.")

# Debug: Check if 'label' is in the combined DataFrame and its distribution
print("Combined DataFrame Columns:", df_combined.columns)
print("Combined DataFrame 'label' column values:\n", df_combined['label'].value_counts())

# Check for unintended dropping of rows
print("Combined DataFrame shape:", df_combined.shape)
print("Number of missing values per column:\n", df_combined.isnull().sum())

# Ensure 'label' is numerical
print("Data types of columns:\n", df_combined.dtypes)
label_dtype = df_combined['label'].dtype
if not (pd.api.types.is_integer_dtype(label_dtype) or pd.api.types.is_float_dtype(label_dtype)):
    # pd.to_numeric still raises on text that is not a number, but unlike a
    # direct astype(int) it tolerates missing labels (they stay NaN), so the
    # missing-value handling further down gets a chance to deal with them.
    numeric_label = pd.to_numeric(df_combined['label'])
    if numeric_label.notna().all():
        # No gaps: safe to store the labels as plain integers.
        numeric_label = numeric_label.astype(int)
    df_combined['label'] = numeric_label
    print("Converted 'label' to numerical type.")

# Print the first few rows of the combined DataFrame for inspection
print("Combined DataFrame Head:\n", df_combined.head())

# Data Preprocessing
# Drop columns treated as irrelevant for modelling (silently skipped if absent)
df_combined = df_combined.drop(columns=['pkSeqID', 'saddr', 'sport', 'daddr', 'dport'], errors='ignore')

# Handle missing values
# Rows without a target cannot be used for supervised training. Drop them
# BEFORE any imputation so that a missing label is never replaced by a
# fabricated one (the target is deliberately not in the imputation list).
df_combined = df_combined.dropna(subset=['label'])

# Fill gaps in the key feature columns with that column's most frequent value
critical_columns = ['proto', 'service', 'state', 'src_ip', 'dst_ip']
for col in critical_columns:
    if col not in df_combined.columns:
        continue
    most_frequent = df_combined[col].mode()
    if most_frequent.empty:
        # Column holds no observed value at all; there is nothing to fill with.
        continue
    df_combined[col] = df_combined[col].fillna(most_frequent.iloc[0])

# Print the columns after handling missing values
print("Columns after handling missing values:\n", df_combined.columns)

# Encode every object-dtype column as integer codes, printing the frame shape
# before and after so an accidental change in size is easy to spot.
print("Before encoding:", df_combined.shape)
categorical_columns = df_combined.select_dtypes(include=['object']).columns
print("Categorical columns to encode:", categorical_columns)

# Keep each fitted encoder, keyed by column name, so the same mapping can be
# re-applied to new data or inverted later.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    # Cast to str first: LabelEncoder raises TypeError when a column mixes
    # strings with numbers. After the cast every value, including a missing
    # one (which becomes the text 'nan'), is an ordinary string category.
    df_combined[col] = le.fit_transform(df_combined[col].astype(str))
    label_encoders[col] = le

print("After encoding:", df_combined.shape)

# Print the columns after encoding
print("Columns after encoding categorical variables:\n", df_combined.columns)

# Feature Selection
target_column = 'label'

# Check if the target column exists in the combined DataFrame
if target_column not in df_combined.columns:
    raise ValueError("The target column does not exist in the combined DataFrame.")

# Print the target column values to ensure they are not empty
print("Target column values:\n", df_combined[target_column].value_counts())

# 'type' holds the attack category (it is what 'category' / 'attack_cat' were
# renamed to) and 'subcategory' refines it. Both describe the very thing the
# model is asked to predict, so keeping them as inputs would leak the target.
# NOTE(review): 'id' looks like a row identifier similar to the dropped
# 'pkSeqID' - confirm whether it should be excluded as well.
leakage_columns = ['type', 'subcategory']

# Index.difference returns the remaining column names in sorted order and
# ignores names that are not present.
feature_columns = df_combined.columns.difference([target_column, *leakage_columns])

# Print the feature and target columns
print("Feature columns:\n", feature_columns)
print("Target column:\n", target_column)

X = df_combined[feature_columns]
y = df_combined[target_column]

# Check if X and y are not empty
if X.empty or y.empty:
    raise ValueError("Feature columns (X) or target column (y) are empty. Please check the data preprocessing steps.")

# Print the shapes of X and y
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

# Train-Test Split
# Split BEFORE scaling so the scaler's mean/variance are estimated from the
# training rows only; fitting it on all rows would leak test-set statistics
# into training and make the evaluation optimistic.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling the features
# NOTE(review): X may still contain NaN in columns that only some datasets
# provide. StandardScaler ignores NaN when fitting and passes it through;
# RandomForestClassifier accepts NaN only from scikit-learn 1.4 on - confirm
# the installed version or impute before this step.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full feature matrix scaled with the training statistics, kept available
# under the name X_scaled for any later code that refers to it.
X_scaled = scaler.transform(X)

# Model Training: fit a 100-tree random forest with a fixed seed so that
# repeated runs build the same model.
clf = RandomForestClassifier(random_state=42, n_estimators=100).fit(X_train, y_train)

# Model Evaluation: per-class precision / recall / F1 on the held-out split
y_pred = clf.predict(X_test)
evaluation_report = classification_report(y_test, y_pred)
print(evaluation_report)

# Save the model (if needed)
# Only the classifier itself is written to disk; the fitted scaler is not
# part of this file.
joblib.dump(clf, 'network_anomaly_model.pkl')

Dataset 1 Columns after renaming: Index(['pkSeqID', 'proto', 'saddr', 'sport', 'daddr', 'dport', 'seq', 'stddev',
       'N_IN_Conn_P_SrcIP', 'min', 'state_number', 'mean', 'N_IN_Conn_P_DstIP',
       'drate', 'srate', 'max', 'label', 'type', 'subcategory'],
      dtype='object')
Dataset 2 Columns after renaming: Index(['id', 'dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'type', 'label'],
      dtype='object')
Dataset 3 Columns after renaming: Index(['src_ip', 'src_port', 'dst_ip', 'dst_port', 'proto', 's