# 🧪 IoT Device Identification - Data Exploration & Preprocessing

Ce notebook explore et prétraite le dataset IPFIX pour l'identification des appareils IoT.

## 📋 Objectifs
1. Charger et explorer les données brutes
2. Analyser la distribution des classes
3. Nettoyer et normaliser les features
4. Créer des séquences temporelles
5. Sauvegarder les données traitées sur Google Drive

## 🔧 Setup - Montage Google Drive

In [None]:
import os

from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

# Root folder of the project on the mounted drive.
PROJECT_DIR = '/content/drive/MyDrive/PFE_IoT'

# Create the project root and its two output folders; existing folders are left alone.
for _folder in (
    PROJECT_DIR,
    f'{PROJECT_DIR}/data/processed',
    f'{PROJECT_DIR}/results',
):
    os.makedirs(_folder, exist_ok=True)

print(f"‚úÖ Projet configur√© dans: {PROJECT_DIR}")

## 📦 Installation des dépendances

In [None]:
!pip install -q pandas numpy scikit-learn tqdm matplotlib seaborn

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import pickle
import json
from pathlib import Path

# Let pandas show up to 50 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 50)
# Use the 'seaborn-v0_8-darkgrid' matplotlib style for the figures of this notebook.
plt.style.use('seaborn-v0_8-darkgrid')
print("‚úÖ D√©pendances install√©es")

## 📁 Configuration des chemins

⚠️ **IMPORTANT** : Uploadez vos fichiers CSV (home1.csv, home2.csv, etc.) dans le dossier spécifié.

In [None]:
# Adjust RAW_DATA_DIR to wherever the raw CSV files are stored.
RAW_DATA_DIR = f'{PROJECT_DIR}/data/raw/IPFIX ML Instances'
PROCESSED_DATA_DIR = f'{PROJECT_DIR}/data/processed'
RESULTS_DIR = f'{PROJECT_DIR}/results'

# Sanity check: report the CSV files found (name and size), or explain what is missing.
if not os.path.exists(RAW_DATA_DIR):
    print(f"‚ùå Dossier non trouv√©: {RAW_DATA_DIR}")
    print("Cr√©ez le dossier et uploadez les fichiers CSV.")
else:
    files = [f for f in os.listdir(RAW_DATA_DIR) if f.endswith('.csv')]
    print(f"‚úÖ Trouv√© {len(files)} fichiers CSV")
    for f in sorted(files):
        size_bytes = os.path.getsize(os.path.join(RAW_DATA_DIR, f))
        size_mb = size_bytes / (1024*1024)
        print(f"   - {f}: {size_mb:.1f} MB")

## 📊 Exploration des données

In [None]:
# Load the first rows of one file to get a feel for the schema.
sample_file = f'{RAW_DATA_DIR}/home1.csv'

if not os.path.exists(sample_file):
    print("Fichier sample non trouv√©")
else:
    print("Chargement de home1.csv (√©chantillon)...")
    # Only the first 10,000 rows are read for this exploration.
    df_sample = pd.read_csv(sample_file, nrows=10_000)

    print(f"\nüìê Dimensions: {df_sample.shape}")

    print(f"\nüìã Colonnes ({len(df_sample.columns)}):")
    print(df_sample.columns.tolist())

    print(f"\nüìä Types:")
    print(df_sample.dtypes)

    print(f"\nüîç Aper√ßu:")
    display(df_sample.head())

In [None]:
# Per-device flow counts in the sample; skipped when the sample was not loaded.
if 'df_sample' in locals():
    device_counts = df_sample['device'].value_counts()
    print("\nüì± Distribution des appareils (√©chantillon):")
    print(device_counts)

    # Bar chart of the counts, saved to the results folder and shown inline.
    plt.figure(figsize=(12, 6))
    device_counts.plot(kind='bar')
    plt.xlabel('Appareil')
    plt.ylabel('Nombre de flux')
    plt.title('Distribution des appareils IoT')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig(
        f'{RESULTS_DIR}/device_distribution_sample.png',
        dpi=150,
    )
    plt.show()

## 🔄 Prétraitement

In [None]:
# Preprocessing configuration.
# Candidate feature columns; the feature-selection cell keeps those that exist
# in the cleaned DataFrame.
FEATURES_TO_KEEP = [
    'duration', 'ipProto',
    'outPacketCount', 'outByteCount', 'inPacketCount', 'inByteCount',
    'outSmallPktCount', 'outLargePktCount', 'outNonEmptyPktCount', 'outDataByteCount',
    'outAvgIAT', 'outFirstNonEmptyPktSize', 'outMaxPktSize', 'outStdevPayloadSize',
    'outStdevIAT', 'outAvgPacketSize',
    'inSmallPktCount', 'inLargePktCount', 'inNonEmptyPktCount', 'inDataByteCount',
    'inAvgIAT', 'inFirstNonEmptyPktSize', 'inMaxPktSize', 'inStdevPayloadSize',
    'inStdevIAT', 'inAvgPacketSize',
    'http', 'https', 'smb', 'dns', 'ntp', 'tcp', 'udp', 'ssdp', 'lan', 'wan',
    'deviceInitiated'
]

# Columns removed by clean_data() when they are present.
FEATURES_TO_DROP = ['start', 'srcMac', 'destMac', 'srcIP', 'destIP', 'srcPort', 'destPort']
# Column holding the class label.
LABEL_COLUMN = 'device'
# Default number of consecutive rows per window in create_sequences().
SEQUENCE_LENGTH = 10
# Default step between the start rows of two consecutive windows.
STRIDE = 5
# clean_data() discards the classes that have fewer rows than this.
MIN_SAMPLES_PER_CLASS = 500
# Fractions used by the train/val/test split cell.
TEST_SIZE = 0.2
VAL_SIZE = 0.1

print("‚úÖ Configuration charg√©e")

In [None]:
def load_all_data(data_dir, max_files=None):
    """Load every ``home*.csv`` file of a directory into one DataFrame.

    Files are read in lexicographic name order and concatenated with a fresh
    index. A ``source_file`` column holding the file name without its
    extension is added to every row.

    Args:
        data_dir: Directory containing the CSV files.
        max_files: When truthy, only the first ``max_files`` files (in sorted
            order) are loaded; ``None`` or ``0`` loads them all.

    Returns:
        The concatenation of all loaded files.

    Raises:
        ValueError: If ``data_dir`` contains no ``home*.csv`` file.
    """
    csv_files = sorted(
        f for f in os.listdir(data_dir)
        if f.startswith('home') and f.endswith('.csv')
    )

    if max_files:
        csv_files = csv_files[:max_files]

    # Fail early with an actionable message: pd.concat on an empty list only
    # reports "No objects to concatenate".
    if not csv_files:
        raise ValueError(f"No 'home*.csv' file found in {data_dir}")

    dfs = []
    for f in tqdm(csv_files, desc="Chargement"):
        df = pd.read_csv(os.path.join(data_dir, f))
        # splitext removes the trailing extension only, whereas
        # str.replace('.csv', '') would also strip '.csv' inside the name.
        df['source_file'] = os.path.splitext(f)[0]
        dfs.append(df)
        print(f"  {f}: {len(df):,} lignes")

    return pd.concat(dfs, ignore_index=True)

def clean_data(df):
    """Drop unusable rows and columns, then discard under-represented classes.

    Steps, in order: remove exact duplicate rows, remove rows whose label is
    missing, remove the FEATURES_TO_DROP columns that are present, and keep
    only the classes with at least MIN_SAMPLES_PER_CLASS rows.

    Args:
        df: Raw flows; must contain the LABEL_COLUMN column.

    Returns:
        A ``(cleaned_df, valid_classes)`` tuple, ``valid_classes`` being the
        index of the labels that were kept.
    """
    print(f"Avant nettoyage: {len(df):,} lignes")

    # Duplicates first, then rows without a label.
    cleaned = df.drop_duplicates().dropna(subset=[LABEL_COLUMN])

    # Remove the unwanted columns in one call, ignoring the absent ones.
    droppable = [col for col in FEATURES_TO_DROP if col in cleaned.columns]
    if droppable:
        cleaned = cleaned.drop(columns=droppable)

    # Keep only the classes that reach the minimum row count.
    per_class = cleaned[LABEL_COLUMN].value_counts()
    valid_classes = per_class[per_class >= MIN_SAMPLES_PER_CLASS].index
    keep_rows = cleaned[LABEL_COLUMN].isin(valid_classes)
    cleaned = cleaned[keep_rows]

    print(f"Apr√®s nettoyage: {len(cleaned):,} lignes")
    print(f"Classes conserv√©es: {len(valid_classes)}")

    return cleaned, valid_classes

print("‚úÖ Fonctions d√©finies")

In [None]:
# Data loading. Set MAX_FILES to a small number (e.g. 3) for a quicker test run;
# None loads every file.
MAX_FILES = None

print("=" * 60, "CHARGEMENT DES DONN√âES", "=" * 60, sep="\n")

df = load_all_data(RAW_DATA_DIR, max_files=MAX_FILES)
print(f"\n‚úÖ Total: {len(df):,} flux")

In [None]:
# Cleaning step: banner, then clean the concatenated flows.
print("\n" + "=" * 60, "NETTOYAGE", "=" * 60, sep="\n")

df_clean, valid_classes = clean_data(df)

In [None]:
# Class distribution after cleaning: computed once, plotted, then printed.
class_distribution = df_clean[LABEL_COLUMN].value_counts()

plt.figure(figsize=(14, 6))
class_distribution.plot(kind='bar')
plt.xlabel('Appareil')
plt.ylabel('Nombre de flux')
plt.title("Distribution finale des classes d'appareils IoT")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(
    f'{RESULTS_DIR}/device_distribution_final.png',
    dpi=150,
)
plt.show()

print("\nüìä Distribution:")
print(class_distribution)

In [None]:
def create_sequences(X, y, source_groups, seq_length=SEQUENCE_LENGTH, stride=STRIDE):
    """Build fixed-length sliding windows over the rows of each source group.

    Rows are windowed separately for every distinct value of
    ``source_groups``, so no window spans two groups. Each window is labelled
    with the label of its last row.

    NOTE(review): rows are grouped by ``source_groups`` only, so a window may
    contain rows with different labels; only the last row's label is kept.
    Confirm this is the intended labelling.

    Args:
        X: NumPy array whose first axis indexes the rows.
        y: NumPy array of labels aligned with ``X``.
        source_groups: NumPy array aligned with ``X`` giving each row's group.
        seq_length: Number of consecutive rows per window.
        stride: Step between the start rows of two consecutive windows.

    Returns:
        ``(X_seq, y_seq)``: the stacked windows, shaped
        ``(n_windows, seq_length, *X.shape[1:])``, and one label per window.
        Both are empty (with those shapes) when no group has ``seq_length`` rows.
    """
    X_seq, y_seq = [], []

    for group_id in tqdm(np.unique(source_groups), desc="Cr√©ation s√©quences"):
        mask = source_groups == group_id
        X_group = X[mask]
        y_group = y[mask]

        # The last valid start index is len - seq_length; range() excludes its
        # stop, hence the +1 so the window ending on the final row is kept.
        # Groups shorter than seq_length give an empty range.
        last_start = len(X_group) - seq_length
        for start in range(0, last_start + 1, stride):
            end = start + seq_length
            X_seq.append(X_group[start:end])
            y_seq.append(y_group[end - 1])

    if not X_seq:
        # Keep a well-formed result instead of a shapeless empty array.
        empty_X = np.empty((0, seq_length) + tuple(X.shape[1:]), dtype=X.dtype)
        empty_y = np.empty((0,), dtype=y.dtype)
        return empty_X, empty_y

    return np.array(X_seq), np.array(y_seq)

print("‚úÖ Fonction de s√©quen√ßage d√©finie")

In [None]:
# Feature selection, label encoding and scaling.
print("\n" + "=" * 60, "S√âLECTION FEATURES & ENCODAGE", "=" * 60, sep="\n")

# Keep the configured features that exist in the cleaned frame, in configured order.
features = [c for c in FEATURES_TO_KEEP if c in df_clean.columns]
print(f"Features s√©lectionn√©es: {len(features)}")

# Raw arrays: feature matrix, labels, and the source file of every row.
X = df_clean[features].values
y = df_clean[LABEL_COLUMN].values
source_groups = df_clean['source_file'].values

# Map the device labels to integer class ids.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)
num_classes = len(label_encoder.classes_)
print(f"Classes: {num_classes}")
print(f"Labels: {label_encoder.classes_[:5]}...")

# Standardise every feature column.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/val/test split performed further down - confirm this is intended.
scaler = StandardScaler().fit(X)
X_normalized = scaler.transform(X)
print(f"Shape X: {X_normalized.shape}")

In [None]:
# Sequence creation: window the scaled rows per source file.
print("\n" + "=" * 60, "CR√âATION DES S√âQUENCES", "=" * 60, sep="\n")

X_seq, y_seq = create_sequences(
    X_normalized,
    y_encoded,
    source_groups,
)
print(f"\n‚úÖ S√©quences cr√©√©es: {X_seq.shape}")
print(f"   Input shape: (samples, timesteps, features) = {X_seq.shape}")
print(f"   Labels shape: {y_seq.shape}")

In [None]:
# Train/validation/test split.
print("\n" + "="*60)
print("DIVISION TRAIN/VAL/TEST")
print("="*60)

# NOTE(review): windows overlap when STRIDE < SEQUENCE_LENGTH, and they are
# shuffled individually here, so windows sharing rows can land in different
# subsets - confirm this is acceptable for the evaluation.

# 1) Hold out the test set: TEST_SIZE of all sequences.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    X_seq, y_seq, test_size=TEST_SIZE, random_state=42, stratify=y_seq
)

# 2) Carve the validation set out of the remainder. VAL_SIZE is a fraction of
#    the whole dataset, so it is rescaled to the (1 - TEST_SIZE) part left.
#    With the configured 0.2 / 0.1 this gives a 70 / 10 / 20 split.
val_ratio = VAL_SIZE / (1 - TEST_SIZE)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=val_ratio, random_state=42, stratify=y_trainval
)

print(f"Train: {len(X_train):,} ({len(X_train)/len(X_seq)*100:.1f}%)")
print(f"Val:   {len(X_val):,} ({len(X_val)/len(X_seq)*100:.1f}%)")
print(f"Test:  {len(X_test):,} ({len(X_test)/len(X_seq)*100:.1f}%)")

In [None]:
# Save everything to the processed-data folder on Google Drive.
print("\n" + "=" * 60, "SAUVEGARDE SUR GOOGLE DRIVE", "=" * 60, sep="\n")

# The six split arrays, each written as <name>.npy.
for split_name, split_array in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_val', y_val),
    ('y_test', y_test),
):
    np.save(f'{PROCESSED_DATA_DIR}/{split_name}.npy', split_array)

# Fitted preprocessing objects and the settings needed to reuse them.
preprocessor = {
    'scaler': scaler,
    'label_encoder': label_encoder,
    'feature_names': features,
    'num_classes': num_classes,
    'sequence_length': SEQUENCE_LENGTH,
}
with open(f'{PROCESSED_DATA_DIR}/preprocessor.pkl', 'wb') as f:
    pickle.dump(preprocessor, f)

# Human-readable metadata describing the saved dataset.
metadata = dict(
    n_train=len(X_train),
    n_val=len(X_val),
    n_test=len(X_test),
    n_features=X_train.shape[2],
    sequence_length=SEQUENCE_LENGTH,
    num_classes=num_classes,
    classes=list(label_encoder.classes_),
)
with open(f'{PROCESSED_DATA_DIR}/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"‚úÖ Donn√©es sauvegard√©es dans: {PROCESSED_DATA_DIR}")
print(f"\nüìÑ Fichiers cr√©√©s:")
# List every entry of the output folder with its size in MB.
for f in os.listdir(PROCESSED_DATA_DIR):
    size_bytes = os.path.getsize(os.path.join(PROCESSED_DATA_DIR, f))
    size = size_bytes / (1024*1024)
    print(f"   - {f}: {size:.2f} MB")

## 📈 Résumé du prétraitement

In [None]:
# Final recap: dataset figures and the files written by this notebook.
print("\n" + "=" * 60, "R√âSUM√â", "=" * 60, sep="\n")

print(f"""
üìä Dataset:
   - Flux totaux: {len(df_clean):,}
   - S√©quences cr√©√©es: {len(X_seq):,}
   - Features: {X_train.shape[2]}
   - Longueur s√©quence: {SEQUENCE_LENGTH}
   - Classes: {num_classes}

üìÅ Fichiers sauvegard√©s:
   - X_train.npy, X_val.npy, X_test.npy
   - y_train.npy, y_val.npy, y_test.npy
   - preprocessor.pkl (scaler + label_encoder)
   - metadata.json

‚û°Ô∏è Prochaine √©tape: Ex√©cuter 02_LSTM_Training.ipynb
""")