In [None]:
import pandas as pd
import numpy as np

# DDoS-side feature columns that are back-filled first, using values drawn
# from the benign rows of the DDoS file.
_DDOS_COLUMNS_TO_FILL = (
    'Avg Bwd Segment Size',
    'Bwd Packet Length Min',
    'Fwd Packet Length Max',
    'Fwd Packet Length Std',
    'Subflow Fwd Packets',
    'Total Fwd Packets',
)

# Bookkeeping columns that are never back-filled.
_SYSTEM_COLUMNS = ('Label', 'Timestamp')


def _first_chronological(frame, limit, label):
    """Return the first ``limit`` rows of ``frame`` ordered by 'Timestamp', with 'Label' set to ``label``."""
    ordered = frame.sort_values('Timestamp').reset_index(drop=True)
    # head() already caps at the frame length, so no explicit min() is needed.
    return ordered.head(limit).assign(Label=label)


def _fill_missing_from_pool(target, columns, pool, rng):
    """Fill NaN cells of ``target[columns]`` in place with values drawn (with replacement) from ``pool``."""
    for col in columns:
        if col not in pool.columns:
            continue
        candidates = pool[col].dropna().tolist()
        missing_mask = target[col].isnull()
        missing_count = int(missing_mask.sum())
        # Nothing to draw from, or nothing to fill: leave the column untouched.
        if candidates and missing_count:
            target.loc[missing_mask, col] = rng.choice(
                candidates,
                size=missing_count,
                replace=True,
            )


def create_unified_dataset(
    ransomware_path='final_train_ransom_benign_f_s.csv',
    ddos_path='final_train_ddos_benign_f_s.csv',
    output_filename='unified_dataset_final_complete.csv',
    ransomware_attack_limit=117500,
    ransomware_benign_limit=50000,
    random_state=None,
):
    """
    Build a unified multi-class Ransomware/DDoS dataset and save it as CSV.

    The two inputs are the CICANDMAL2017 and cicddos2019 datasets after their
    separate feature selection. Both files must contain a 'Label' column
    (0 = benign, 1 = attack) and a 'Timestamp' column.

    Output labels: 0 = benign (from either file), 1 = ransomware attack,
    2 = DDoS attack. All DDoS rows are kept; ransomware rows are limited to
    the first ``ransomware_attack_limit`` attack rows and the first
    ``ransomware_benign_limit`` benign rows by 'Timestamp'.

    The result has the sorted union of both files' columns. Cells that are
    missing because a row's source file lacks the column are filled with
    values drawn at random (with replacement) from benign rows: first the
    DDoS columns listed in ``_DDOS_COLUMNS_TO_FILL`` from benign DDoS rows,
    then every remaining incomplete feature column from benign ransomware
    rows.

    Args:
        ransomware_path: CSV path of the ransomware/benign dataset.
        ddos_path: CSV path of the DDoS/benign dataset.
        output_filename: Path the unified CSV is written to.
        ransomware_attack_limit: Max number of ransomware attack rows kept.
        ransomware_benign_limit: Max number of ransomware benign rows kept.
        random_state: ``None`` draws from NumPy's global random state (so a
            prior ``np.random.seed`` call controls the result); an int seeds
            a private ``np.random.RandomState``; any other object is used
            as-is and must provide a ``choice`` method.

    Returns:
        A ``(dataframe, output_filename)`` tuple.
    """
    # 1. Load both datasets.
    ransomware_df = pd.read_csv(ransomware_path)
    ddos_df = pd.read_csv(ddos_path)

    # 2. Split by class. These frames are never mutated below, so they also
    #    serve as the benign value pools for the back-fill steps.
    ransomware_benign = ransomware_df[ransomware_df['Label'] == 0]
    ransomware_attack = ransomware_df[ransomware_df['Label'] == 1]
    ddos_benign = ddos_df[ddos_df['Label'] == 0]
    ddos_attack = ddos_df[ddos_df['Label'] == 1]

    # 3. Sampling and relabelling.
    # NOTE(review): 'Timestamp' is sorted exactly as loaded (read_csv is not
    # asked to parse dates). If it is stored as text, the order is
    # lexicographic and only chronological for ISO-like formats -- confirm.
    parts = [
        ddos_benign.assign(Label=0),
        _first_chronological(ransomware_benign, ransomware_benign_limit, 0),
        _first_chronological(ransomware_attack, ransomware_attack_limit, 1),
        ddos_attack.assign(Label=2),
    ]

    # 4. Give every part the same schema: the sorted union of all columns.
    #    Columns a part does not have are created as NaN by reindex().
    all_columns = sorted(set(ransomware_df.columns) | set(ddos_df.columns))

    # 5. Merge and order by timestamp.
    unified = pd.concat(
        [part.reindex(columns=all_columns) for part in parts],
        ignore_index=True,
    )
    unified = unified.sort_values('Timestamp').reset_index(drop=True)

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    # 6. Back-fill the DDoS columns from benign DDoS traffic.
    _fill_missing_from_pool(unified, _DDOS_COLUMNS_TO_FILL, ddos_benign, rng)

    # 7. Back-fill whatever feature columns are still incomplete from benign
    #    ransomware traffic.
    has_gaps = unified.isnull().any()
    remaining_columns = [
        col for col in has_gaps[has_gaps].index.tolist()
        if col not in _SYSTEM_COLUMNS
    ]
    _fill_missing_from_pool(unified, remaining_columns, ransomware_benign, rng)

    # 8. Save.
    unified.to_csv(output_filename, index=False)

    return unified, output_filename

if __name__ == "__main__":
    # Seed NumPy's global random state before building the dataset so the
    # random draws made during the build are repeatable between runs.
    np.random.seed(42)
    final_dataset, output_file = create_unified_dataset()

    # Final statistics: size, per-label row counts, and where the CSV went.
    summary_lines = (
        f"Dataset créé: {final_dataset.shape[0]:,} lignes × {final_dataset.shape[1]} colonnes",
        f"Distribution: {dict(final_dataset['Label'].value_counts().sort_index())}",
        f"Fichier sauvegardé: {output_file}",
    )
    for summary_line in summary_lines:
        print(summary_line)