In [8]:
import pickle

# Load the pickled training data and take a first look at its layout.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../data/05_model_input/gnn_training/training_data_main.pkl', 'rb') as pkl_file:
    data = pickle.load(pkl_file)

print("Structure générale:")
print(f"Nombre de batches: {len(data['batches'])}")
print(f"Total subgraphs: {data['stats']['total_subgraphs']}")

# Spot-check three positions in the batch list; positions past the end are skipped.
for position in (0, 100, 1000):
    if position >= len(data['batches']):
        continue

    picked = data['batches'][position]
    print(f"\nBatch {position} (packet_id={picked['packet_id']}):")
    print(f"  Subgraphs: {picked['count']}")

    # Report the size of the batch's first subgraph, when it has one.
    if picked['subgraphs']:
        first_sg = picked['subgraphs'][0]
        print(f"  Premier subgraph: {first_sg['x'].shape[0]} noeuds, {first_sg['edge_index'].shape[1]} aretes")

Structure générale:
Nombre de batches: 54791
Total subgraphs: 54791

Batch 0 (packet_id=0):
  Subgraphs: 1
  Premier subgraph: 6 noeuds, 5 aretes

Batch 100 (packet_id=136):
  Subgraphs: 1
  Premier subgraph: 6 noeuds, 5 aretes

Batch 1000 (packet_id=1446):
  Subgraphs: 1
  Premier subgraph: 5 noeuds, 4 aretes


In [9]:
# Check how packet ids progress along the batch list.
packet_ids = [entry['packet_id'] for entry in data['batches']]
print("Packet_IDs progression:")

lowest, highest = min(packet_ids), max(packet_ids)
print(f"Min: {lowest}, Max: {highest}")

# True when the ids are already in non-decreasing order.
already_sorted = packet_ids == sorted(packet_ids)
print(f"Ordre croissant: {already_sorted}")

# First and last ten ids, to eyeball gaps in the sequence.
print(f"Premiers: {packet_ids[:10]}")
print(f"Derniers: {packet_ids[-10:]}")

Packet_IDs progression:
Min: 0, Max: 80999
Ordre croissant: True
Premiers: [0, 1, 2, 3, 4, 6, 7, 8, 9, 10]
Derniers: [80985, 80986, 80988, 80990, 80992, 80993, 80995, 80996, 80998, 80999]


In [10]:
# Size distributions: subgraphs per batch (all batches) and nodes per subgraph.
subgraph_counts = [entry['count'] for entry in data['batches']]

# Node counts are collected from the first 100 batches only (a sample).
node_counts = [
    graph['x'].shape[0]
    for entry in data['batches'][:100]
    for graph in entry['subgraphs']
]

sg_min, sg_max, sg_mean = min(subgraph_counts), max(subgraph_counts), sum(subgraph_counts) / len(subgraph_counts)
print(f"Subgraphs par batch: min={sg_min}, max={sg_max}, moy={sg_mean:.1f}")

node_min, node_max, node_mean = min(node_counts), max(node_counts), sum(node_counts) / len(node_counts)
print(f"Nœuds par subgraph: min={node_min}, max={node_max}, moy={node_mean:.1f}")

Subgraphs par batch: min=1, max=1, moy=1.0
Nœuds par subgraph: min=5, max=30, moy=7.9


In [11]:
import torch
import torch.nn.functional as F

# Smoke test on the first 10 batches: print tensor shapes and compute a trivial loss.
print("Test mini-entraînement:")

for sample_batch in data['batches'][:10]:
    pid = sample_batch['packet_id']
    graphs = sample_batch['subgraphs']

    print(f"Packet_ID {pid}: {len(graphs)} subgraphs")

    # Exercise every subgraph of the batch.
    for sg_idx, graph in enumerate(graphs):
        # Node features, connectivity, and edge features as stored in the pickle.
        feats, edges, edge_feats = graph['x'], graph['edge_index'], graph['edge_attr']

        print(f"  Subgraph {sg_idx}: nodes={feats.shape}, edges={edges.shape}, edge_attr={edge_feats.shape}")

        # Basic check: can a simple loss be computed on the node features?
        try:
            # Dummy "reconstruction" is a copy of the input, so the MSE is 0 by construction.
            loss = F.mse_loss(feats.clone(), feats)
            print(f"    Test loss: {loss.item():.4f} - OK")
        except Exception as err:
            # Report the failure and keep going with the remaining subgraphs.
            print(f"    ERREUR: {err}")

Test mini-entraînement:
Packet_ID 0: 1 subgraphs
  Subgraph 0: nodes=torch.Size([6, 64]), edges=torch.Size([2, 5]), edge_attr=torch.Size([5, 64])
    Test loss: 0.0000 - OK
Packet_ID 1: 1 subgraphs
  Subgraph 0: nodes=torch.Size([5, 64]), edges=torch.Size([2, 4]), edge_attr=torch.Size([4, 64])
    Test loss: 0.0000 - OK
Packet_ID 2: 1 subgraphs
  Subgraph 0: nodes=torch.Size([10, 64]), edges=torch.Size([2, 597]), edge_attr=torch.Size([597, 64])
    Test loss: 0.0000 - OK
Packet_ID 3: 1 subgraphs
  Subgraph 0: nodes=torch.Size([12, 64]), edges=torch.Size([2, 14]), edge_attr=torch.Size([14, 64])
    Test loss: 0.0000 - OK
Packet_ID 4: 1 subgraphs
  Subgraph 0: nodes=torch.Size([5, 64]), edges=torch.Size([2, 4]), edge_attr=torch.Size([4, 64])
    Test loss: 0.0000 - OK
Packet_ID 6: 1 subgraphs
  Subgraph 0: nodes=torch.Size([6, 64]), edges=torch.Size([2, 5]), edge_attr=torch.Size([5, 64])
    Test loss: 0.0000 - OK
Packet_ID 7: 1 subgraphs
  Subgraph 0: nodes=torch.Size([5, 64]), edges=to

In [12]:
import pickle
import torch
from torch_geometric.data import Data, DataLoader

# Load the already-transformed training data from the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open('../data/05_model_input/gnn_training/training_data_main.pkl', 'rb') as pkl_file:
    training_data = pickle.load(pkl_file)

def extract_pytorch_graphs(training_data):
    """Flatten every stored subgraph into a list of ``Data`` objects.

    Args:
        training_data: Mapping with a ``'batches'`` list. Each batch is a
            mapping with ``'packet_id'`` and ``'subgraphs'``; each subgraph is
            a mapping providing ``'x'``, ``'edge_index'``, ``'edge_attr'`` and
            ``'synthetic_edge'``.

    Returns:
        list: One ``Data`` object per subgraph, in batch order and then in
        subgraph order within each batch. Batches with no subgraphs contribute
        nothing.

    Raises:
        KeyError: If one of the keys listed above is missing.
    """
    # Stored values are handed to Data as-is; no conversion is performed here.
    return [
        Data(
            x=stored['x'],
            edge_index=stored['edge_index'],
            edge_attr=stored['edge_attr'],
            packet_id=packet['packet_id'],
            synthetic_edge=stored['synthetic_edge'],
        )
        for packet in training_data['batches']
        for stored in packet['subgraphs']
    ]

# Build the flat dataset right away from the loaded pickle content.
main_dataset = extract_pytorch_graphs(training_data)
print(f"Dataset extrait: {len(main_dataset)} graphiques PyTorch")

# Format check: print the tensor shapes of the first graph.
sample = main_dataset[0]
print(f"Sample: x={sample.x.shape}, edge_index={sample.edge_index.shape}")

Dataset extrait: 54791 graphiques PyTorch
Sample: x=torch.Size([6, 64]), edge_index=torch.Size([2, 5])


In [15]:
import pickle

# Load the pickle file.
# NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
with open("../data/05_model_input/gnn_training/training_data_main.pkl", 'rb') as pkl_file:
    training_data = pickle.load(pkl_file)

print("=== ANALYSE DES DONNÉES (DICTIONNAIRE) ===")

# Report the top-level type first.
print(f"Type: {type(training_data)}")

if not isinstance(training_data, dict):
    print("Ce n'est pas un dictionnaire...")
    print(f"Type réel: {type(training_data)}")
else:
    top_level_keys = list(training_data)
    print(f"Clés du dictionnaire: {top_level_keys}")
    print(f"Nombre de clés: {len(top_level_keys)}")

    # Describe the first three entries only.
    for key in top_level_keys[:3]:
        value = training_data[key]
        print(f"\nClé '{key}':")
        print(f"  Type: {type(value)}")

        if isinstance(value, list):
            print(f"  Taille liste: {len(value)}")
            if value:
                head = value[0]
                print(f"  Premier élément de la liste: {type(head)}")
                # Shapes are printed only for elements exposing an `x` attribute.
                if hasattr(head, 'x'):
                    print(f"    Subgraph - Nodes: {head.x.shape}, Edges: {head.edge_index.shape}")
        elif hasattr(value, 'x'):
            # The value itself exposes an `x` attribute.
            print(f"  Subgraph direct - Nodes: {value.x.shape}, Edges: {value.edge_index.shape}")
        else:
            # Anything else is printed verbatim.
            print(f"  Contenu: {value}")

=== ANALYSE DES DONNÉES (DICTIONNAIRE) ===
Type: <class 'dict'>
Clés du dictionnaire: ['batches', 'stats', 'status']
Nombre de clés: 3

Clé 'batches':
  Type: <class 'list'>
  Taille liste: 54791
  Premier élément de la liste: <class 'dict'>

Clé 'stats':
  Type: <class 'dict'>
  Contenu: {'total_batches': 54791, 'total_subgraphs': 54791, 'avg_subgraphs_per_batch': 1.0}

Clé 'status':
  Type: <class 'str'>
  Contenu: READY


In [16]:
# Inspect the first batch of the loaded training data.
batches = training_data['batches']
first_batch = batches[0]
print(f"Premier batch: {type(first_batch)}")
print(f"Clés du premier batch: {first_batch.keys()}")

# For each field, print its type and, for non-empty lists, the type of the first element.
for field_name, field_value in first_batch.items():
    print(f"  {field_name}: {type(field_value)}")
    if isinstance(field_value, list) and field_value:
        print(f"    Premier élément: {type(field_value[0])}")

Premier batch: <class 'dict'>
Clés du premier batch: dict_keys(['packet_id', 'subgraphs', 'count'])
  packet_id: <class 'int'>
  subgraphs: <class 'list'>
    Premier élément: <class 'dict'>
  count: <class 'int'>


In [17]:
# Inspect the first subgraph of the first batch.
batches = training_data['batches']
first_batch = batches[0]
first_subgraph = first_batch['subgraphs'][0]
is_mapping = isinstance(first_subgraph, dict)

print("=== ANALYSE DU SUBGRAPH ===")
print(f"Type du subgraph: {type(first_subgraph)}")
print(f"Clés: {first_subgraph.keys() if is_mapping else 'Pas un dict'}")

# For a dict, list each entry's type plus its shape (or its length for lists and tuples).
if is_mapping:
    for name, content in first_subgraph.items():
        print(f"  {name}: {type(content)}")
        if hasattr(content, 'shape'):
            print(f"    Shape: {content.shape}")
        elif isinstance(content, (list, tuple)):
            print(f"    Longueur: {len(content)}")

=== ANALYSE DU SUBGRAPH ===
Type du subgraph: <class 'dict'>
Clés: dict_keys(['x', 'edge_index', 'edge_attr', 'num_nodes', 'num_edges', 'synthetic_edge'])
  x: <class 'torch.Tensor'>
    Shape: torch.Size([6, 64])
  edge_index: <class 'torch.Tensor'>
    Shape: torch.Size([2, 5])
  edge_attr: <class 'torch.Tensor'>
    Shape: torch.Size([5, 64])
  num_nodes: <class 'int'>
  num_edges: <class 'int'>
  synthetic_edge: <class 'bool'>
