# Setup Training Data for 10-Minute GraphSAGE Training

Hướng dẫn tạo dữ liệu lớn hơn để train model trong ~10 phút.

**Cấu trúc dữ liệu mục tiêu:**
- 2000 samples training (thay vì 240)
- Validate/test proportional
- Kéo dài training ~10 phút với GPU

In [None]:
import os
import sys
import json
import pandas as pd
import numpy as np
import networkx as nx
from pathlib import Path
from tqdm import tqdm
import torch

# Add the project's src/ directory to the import path.
# NOTE: the root is resolved relative to the current working directory, so
# this cell presumably expects to be run from a direct sub-directory of the
# project (e.g. notebooks/) -- confirm against how the notebook is launched.
project_root = Path('../').resolve()
sys.path.insert(0, str(project_root / 'src'))

print(f"✓ Project root: {project_root}")
print(f"✓ Python version: {sys.version}")

## 1. Define Project Structure

In [None]:
# Directory layout used by the rest of the notebook.
DATA_DIR = project_root / 'data' / 'graph_medium'
MODELS_DIR = project_root / 'models'
LOGS_DIR = project_root / 'logs'

# Create each directory (and any missing parents) if it does not exist yet.
for dir_path in [DATA_DIR, MODELS_DIR, LOGS_DIR]:
    dir_path.mkdir(parents=True, exist_ok=True)
    print(f"✓ Created/verified: {dir_path}")

print("\n📁 Project Structure:")
print(f"   DATA_DIR   = {DATA_DIR}")
print(f"   MODELS_DIR = {MODELS_DIR}")
print(f"   LOGS_DIR   = {LOGS_DIR}")

## 2. Load Graph Data

In [None]:
# Locate the graph CSVs; run the generator script first if either is missing.
nodes_file = DATA_DIR / 'graph_nodes.csv'
edges_file = DATA_DIR / 'graph_edges.csv'

missing_files = [path for path in (nodes_file, edges_file) if not path.exists()]
if missing_files:
    for path in missing_files:
        print(f"⚠️  {path} không tồn tại. Tôi sẽ sinh dữ liệu graph...")
    # Generate a fresh graph with the project's generator script.
    # NOTE(review): the script is invoked without arguments; it presumably
    # writes into DATA_DIR -- confirm against generate_graph.py.
    import subprocess
    result = subprocess.run(
        [sys.executable, str(project_root / 'src' / 'data' / 'generate_graph.py')],
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    if result.returncode != 0:
        print(f"Error: {result.stderr}")

    # Fail with a clear message instead of an opaque read_csv error below.
    for path in (nodes_file, edges_file):
        if not path.exists():
            raise FileNotFoundError(
                f"Graph file is still missing after running the generator: {path}"
            )

# Load the node and edge tables.
nodes_df = pd.read_csv(nodes_file)
edges_df = pd.read_csv(edges_file)

print("\n📊 Graph Statistics:")
print(f"   Nodes: {len(nodes_df)}")
print(f"   Edges: {len(edges_df)}")
print(f"\n   Nodes sample:\n{nodes_df.head()}")
print(f"\n   Edges sample:\n{edges_df.head()}")

## 3. Build NetworkX Graph & Generate Path Samples

In [None]:
# Build an undirected NetworkX graph so shortest-path costs can be computed.
print("🔨 Building NetworkX graph...")
G = nx.Graph()
num_nodes = len(nodes_df)
# Nodes are identified by their position 0..num_nodes-1.
# NOTE(review): assumes the IDs in edges_df use the same 0-based numbering
# as the row order of nodes_df -- confirm against the generator.
G.add_nodes_from(range(num_nodes))

# itertuples avoids building a Series per row, which iterrows does.
edge_columns = edges_df[['source', 'target', 'weight']]
for src, dst, weight in edge_columns.itertuples(index=False, name=None):
    src, dst, weight = int(src), int(dst), float(weight)
    # Skip edges with an endpoint outside 0..num_nodes-1. The lower bound
    # matters: a negative ID would otherwise add an extra node to the graph.
    if 0 <= src < num_nodes and 0 <= dst < num_nodes:
        G.add_edge(src, dst, weight=weight)

print(f"   ✓ Graph: {G.number_of_nodes()} nodes, {G.number_of_edges()} edges")

# Generate (source, target, shortest-path cost) training samples.
NUM_SAMPLES = 2000
# Cap on random draws so the loop terminates even when the graph is sparse
# or disconnected and most pairs have no path.
MAX_ATTEMPTS = NUM_SAMPLES * 20
print(f"\n🎯 Generating {NUM_SAMPLES} path samples for 10-minute training...")
np.random.seed(42)

if num_nodes < 2:
    raise ValueError(
        f"Need at least 2 nodes to sample source-target pairs, got {num_nodes}"
    )

sources = []
targets = []
costs = []

# Keep drawing random pairs until NUM_SAMPLES valid ones are collected.
# Rejected draws (identical endpoints, no connecting path) do not count
# towards the total, so the dataset reaches the requested size when possible.
attempts = 0
with tqdm(total=NUM_SAMPLES, desc="Generating paths") as progress:
    while len(costs) < NUM_SAMPLES and attempts < MAX_ATTEMPTS:
        attempts += 1
        src = np.random.randint(0, num_nodes)
        dst = np.random.randint(0, num_nodes)

        if src == dst:
            continue

        try:
            # Weighted shortest-path length, computed by NetworkX.
            cost = nx.shortest_path_length(G, source=src, target=dst, weight='weight')
        except nx.NetworkXNoPath:
            continue

        sources.append(src)
        targets.append(dst)
        costs.append(cost)
        progress.update(1)

if not costs:
    raise RuntimeError(
        f"No connected source-target pair found in {attempts} attempts; "
        "cannot build a path dataset"
    )
if len(costs) < NUM_SAMPLES:
    print(f"   ⚠️  Only {len(costs)} of {NUM_SAMPLES} requested paths found "
          f"after {attempts} attempts")

print(f"   ✓ Generated {len(sources)} valid paths")
print(f"   Cost statistics: min={min(costs):.2f}, max={max(costs):.2f}, mean={np.mean(costs):.2f}")

## 4. Split Data into Train/Val/Test

In [None]:
# Collect the generated samples into a DataFrame.
paths_df = pd.DataFrame({
    'source': sources,
    'target': targets,
    'cost': costs
})

# Sequential split: first 80% train, next 10% validation, remainder test.
# The sizes depend on how many samples were actually generated; the test set
# absorbs any rounding remainder.
n_total = len(paths_df)
n_train = int(0.8 * n_total)
n_val = int(0.1 * n_total)

train_df = paths_df.iloc[:n_train]
val_df = paths_df.iloc[n_train:n_train + n_val]
test_df = paths_df.iloc[n_train + n_val:]

# Write each split to CSV.
train_file = DATA_DIR / 'paths_train.csv'
val_file = DATA_DIR / 'paths_val.csv'
test_file = DATA_DIR / 'paths_test.csv'

train_df.to_csv(train_file, index=False)
val_df.to_csv(val_file, index=False)
test_df.to_csv(test_file, index=False)

print("💾 Dataset Split:")
print(f"   Training:   {len(train_df):4d} samples → {train_file}")
print(f"   Validation: {len(val_df):4d} samples → {val_file}")
print(f"   Test:       {len(test_df):4d} samples → {test_file}")
print(f"   Total:      {n_total:4d} samples")

## 5. Setup Training Configuration for 10-Minute Training

In [None]:
# Training configuration intended to give a run of roughly 10 minutes.
config = {
    "model": {
        "hidden_dim": 64,
        "num_layers": 2,
        "dropout": 0.3,
        "aggregator": "mean"
    },
    "training": {
        "num_epochs": 100,           # 10 minutes / 100 epochs ≈ 6 seconds/epoch
        "batch_size": 32,
        "learning_rate": 0.001,
        "weight_decay": 1e-5,
        "optimizer": "adam"
    },
    "data": {
        "num_train_samples": len(train_df),
        "num_val_samples": len(val_df),
        "num_test_samples": len(test_df),
        "num_nodes": num_nodes,
        "num_edges": G.number_of_edges()
    },
    # Use the GPU when one is available, otherwise fall back to the CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Save the configuration next to the model artifacts.
config_file = MODELS_DIR / 'config.json'
with open(config_file, 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print("⚙️  Training Configuration:")
print(f"   Device: {config['device']}")
print(f"   Epochs: {config['training']['num_epochs']}")
print(f"   Batch size: {config['training']['batch_size']}")
print(f"   Learning rate: {config['training']['learning_rate']}")
print("   Estimated time: ~10 minutes")
print(f"\n   Config saved → {config_file}")

## 6. Project Structure Verification

In [None]:
import os

def print_tree(directory, prefix="", max_depth=3, current_depth=0):
    """Print the contents of ``directory`` as an indented tree.

    Entries are listed in sorted order. Names starting with a dot are
    omitted. ``__pycache__`` directories are listed but not descended into.
    A directory that cannot be listed because of missing permissions is
    skipped silently.

    Args:
        directory: Path of the directory to list.
        prefix: Text printed before each entry; carries the indentation of
            the parent levels during recursion.
        max_depth: Number of directory levels to print.
        current_depth: Depth of ``directory`` relative to the starting
            point; used by the recursion.
    """
    if current_depth >= max_depth:
        return

    try:
        items = sorted(
            name for name in os.listdir(directory) if not name.startswith('.')
        )
    except PermissionError:
        return

    last_index = len(items) - 1
    for i, item in enumerate(items):
        path = os.path.join(directory, item)
        is_last = i == last_index
        current_prefix = "└── " if is_last else "├── "
        print(f"{prefix}{current_prefix}{item}")

        # Dot-directories such as .git and .venv were filtered out above, so
        # __pycache__ is the only listed directory that is not expanded.
        if os.path.isdir(path) and item != '__pycache__':
            extension = "    " if is_last else "│   "
            print_tree(path, prefix + extension, max_depth, current_depth + 1)

# Show the resulting project layout, limited to print_tree's default depth.
print("📁 Project Structure Ready for Training:\n")
print_tree(str(project_root))

## 7. Next Steps - Start Training

Chạy lệnh sau để bắt đầu training trong 10 phút:

```bash
# Từ thư mục project root
python train_model.py --data-dir data/graph_medium --models-dir models --num-epochs 100 --batch-size 32

# Hoặc dùng script
./train.sh  # Trên Linux/Mac
train.bat   # Trên Windows
```

**Các file sẽ được sinh ra:**
- `models/best_model.pt` - Trọng số model (tự động lưu tốt nhất)
- `models/config.json` - Cấu hình training
- `models/results.json` - Kết quả (loss, metrics)
- `logs/training_*.log` - Chi tiết training