In [4]:
import pandas as pd 
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler  
# Import necessary metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
# Import machine learning models
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

In [6]:
df = pd.read_csv(r"C:\Users\hp\Desktop\TFG\Datasets\CIC_IDS_Merged.csv")
df1 = pd.read_csv(r"C:\Users\hp\Desktop\TFG\Datasets\UNSW_NB15.csv")

  df1 = pd.read_csv(r"C:\Users\hp\Desktop\TFG\Datasets\UNSW_NB15.csv")


In [5]:
# Normalize the 'attack_cat' column directly
def normalize_label(label):
    if pd.isnull(label):
        return 'benign'  # Assign NaN values to 'benign'
    
    # Remove extra spaces and make the label lowercase
    label = str(label).strip().lower()
    
    # Dictionary for replacing and normalizing labels
    replacements = {
        'backdoors': 'backdoor',  # Merge 'backdoors' into 'backdoor'
        'shellcode': 'shellcode',
        'fuzzers': 'fuzzers',
        'reconnaissance': 'reconnaissance',
        'analysis': 'analysis',
        'worms': 'worms',
        'exploits': 'exploits',
        'dos': 'dos',  # Normalize DoS to dos
    }
    
    # Apply the replacement rules
    return replacements.get(label, label)

# Apply the normalization function directly to the 'attack_cat' column
df1['attack_cat'] = df1['attack_cat'].apply(normalize_label)


In [7]:

# Rename columns to match UNSW naming conventions
df = df.rename(columns={
    'Flow Duration': 'dur',
    'Dst Port': 'dsport',
    'Tot Fwd Pkts': 'Spkts',
    'Tot Bwd Pkts': 'Dpkts',
    'TotLen Fwd Pkts': 'sbytes',
    'TotLen Bwd Pkts': 'dbytes',
    'Fwd Pkt Len Mean': 'smeansz',
    'Bwd Pkt Len Mean': 'dmeansz',
    'Init Fwd Win Byts': 'swin',
    'Init Bwd Win Byts': 'dwin',
    'Fwd IAT Std': 'Sjit',
    'Bwd IAT Std': 'Djit',  # Align label name with UNSW
})

# Drop non-overlapping features
df = df.drop(columns=[
    'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags', 'Bwd URG Flags',
    'Fwd Header Len', 'Bwd Header Len', 'Active Mean', 'Idle Mean',  # CIC-specific
    'Label'  # Original CIC label (keep only 'is_attack' renamed to 'Label')
], errors='ignore')

# Ensure data type alignment
df['dbytes'] = df['dbytes'].astype(float)

In [9]:
# Convertir todas las features temporales de UNSW a microsegundos
df1['dur'] = df1['dur'] * 1_000_000      
df1['Sjit'] = df1['Sjit'] * 1000          
df1['Djit'] = df1['Djit'] * 1000          
df1['Sintpkt'] = df1['Sintpkt'] * 1000    
df1['Dintpkt'] = df1['Dintpkt'] * 1000   

In [17]:
df1 = df1.rename(columns={
    'dur': 'dur',
    'dsport': 'dsport',
    'Spkts': 'Spkts',
    'Dpkts': 'Dpkts',
    'sbytes': 'sbytes',
    'dbytes': 'dbytes',
    'smeansz': 'smeansz',
    'dmeansz': 'dmeansz',
    'swin': 'swin',
    'dwin': 'dwin',
    'Sjit': 'Sjit',
    'Djit': 'Djit',
    'Label':'is_attack'
})


# Drop non-overlapping features
df1 = df1.drop(columns=[
    'srcip','proto', 'dstip', 'sport', 'state', 'ct_state_ttl', 'ct_flw_http_mthd',
    'is_sm_ips_ports', 'ct_ftp_cmd', 'attack_cat'  # UNSW-specific
], errors='ignore')

# Ensure data type alignment
df1['is_attack'] = df1['is_attack'].astype(int)
# Calcular tasas y manejar divisiones por cero
df1['Flow Byts/s'] = np.where(
    df1['dur'] != 0,  # Evitar división por cero
    (df1['sbytes'] + df1['dbytes']) / (df1['dur'] / 1_000_000),  # B/s
    np.nan  # Asignar NaN si dur=0
)

df1['Flow Pkts/s'] = np.where(
    df1['dur'] != 0,
    (df1['Spkts'] + df1['Dpkts']) / (df1['dur'] / 1_000_000),  # Pkts/s
    np.nan
)

# Calcular Fwd IAT Mean y manejar Spkts + Dpkts = 0
df1['total_pkts'] = df1['Spkts'] + df1['Dpkts']
df1['Fwd IAT Mean'] = np.where(
    df1['total_pkts'] != 0,
    (df1['Sintpkt'] * df1['Spkts'] + df1['Dintpkt'] * df1['Dpkts']) / df1['total_pkts'],
    np.nan
)
common_features = [
    'dur', 'dsport', 'Spkts', 'Dpkts', 'sbytes', 'dbytes',
    'smeansz', 'dmeansz', 'swin', 'dwin', 'Sjit', 'Djit',
    'Flow Byts/s','Flow Pkts/s','Fwd IAT Mean','is_attack'
]
df = df[common_features]
df1 = df1[common_features]

In [25]:
df1 = df1.sample(frac=1, random_state=42).reset_index(drop=True) #Shuffle UNSW NB15 for randomness

In [35]:
# Convert dsport to numeric, coercing errors to NaN
df1['dsport'] = pd.to_numeric(df1['dsport'], errors='coerce')
# For CICIDS (df)
df['dur'] = df['dur'].astype('float64')  # Match UNSW's dtype
# For UNSW (df1)
df1['dbytes'] = df1['dbytes'].astype('float64')   # Match CICIDS' dtype
df1['smeansz'] = df1['smeansz'].astype('float64')
df1['dmeansz'] = df1['dmeansz'].astype('float64')

In [49]:
# Drop rows with NaN in dsport
df1 = df1.dropna(subset=['dsport'])

In [57]:
# For UNSW (df1)
df1['dsport'] = df1['dsport'].astype('int64')
# For UNSW (df1)
df1['is_attack'] = df1['is_attack'].astype('int64')

In [59]:
# Merge datasets
merged_df = pd.concat([df, df1], ignore_index=True)
# Save to CSV
merged_df.to_csv(r"C:\Users\hp\Desktop\TFG\Datasets\mergedDS.csv", index=False)

In [96]:
def map_unsw_attack(attack):
    if attack == "benign":
        return "Normal"
    elif attack == "dos":
        return "DoS"
    elif attack in ["reconnaissance", "fuzzers"]:
        return "Reconnaissance"
    elif attack == "exploits":
        return "Exploits"
    elif attack in ["backdoor", "worms", "shellcode"]:
        return "Infiltration"
    else:
        return attack  # Leaves 'generic' and 'analysis' unchanged

In [294]:
X = merged_df.drop(columns=["is_attack"])
y = merged_df["is_attack"]

In [296]:
# Split into train/test sets (stratify to handle imbalance)
X_train, X_test, y_train, y_test = train_test_split(
    X,  # or X if not scaled
    y,
    test_size=0.2,
    stratify=y,
    random_state=42
)