# FePt/MgO Interface Learning Tutorial

This tutorial demonstrates a multi-stage active learning strategy to simulate a FePt (metal) and MgO (insulator) interface using the `nnp_pipelines` codebase.

We will follow a 4-step workflow:
1. **Learn FePt (Metallic Phase)**: Generate initial data and train a base FePt potential.
2. **Learn MgO (Ionic Phase)**: Generate initial data and train a separate MgO potential.
3. **Interface Construction & Knowledge Transfer**: Build interface structures, label them, merge datasets, and train an "Interface Potential".
4. **Active Learning Loop**: Initialize the active learning orchestrator with the interface potential.


In [None]:
import yaml
import shutil
import logging
import pandas as pd
import os
from pathlib import Path
from ase.io import read, write

from shared.core.config import Config
from shared.autostructure.interface import InterfaceBuilder
from orchestrator.workflows.seed_generation import SeedGenerator
from orchestrator.src.wrappers.dft_wrapper import DftWorker
from orchestrator.src.wrappers.pace_wrapper import PaceWorker
from orchestrator.workflows.orchestrator import ActiveLearningOrchestrator
from orchestrator.src.factory import ComponentFactory
from orchestrator.src.services.al_service import ActiveLearningService

# Emit INFO-level (and above) records and give the tutorial its own named logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("InterfaceTutorial")

# Create the working directories used by the later cells (no-op if present)
for _dirname in ("data", "output"):
    Path(_dirname).mkdir(exist_ok=True)


## Step 1: Learn FePt (Metallic Phase)

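First, we define two helper functions: `create_config`, which derives a per-phase config file from the base `config.yaml`, and `run_seed_generation`, which runs the `SeedGenerator` and renames its outputs so that later phases do not overwrite them. Then, we run seed generation for Fe and Pt using the "metallic" crystal type.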


In [None]:
def create_config(filename, elements, crystal_type):
    """
    Derive a per-phase config file from the base ``config.yaml``.

    Loads ``config.yaml`` from the current working directory, overrides the
    element-dependent MD, seed-generation and Pacemaker settings, and writes
    the result to ``filename``.

    Args:
        filename: Path of the YAML file to write.
        elements: Chemical symbols for this phase, e.g. ``["Fe", "Pt"]``.
            Any iterable is accepted; it is materialised once.
        crystal_type: Value stored under ``seed_generation.crystal_type``.

    Returns:
        ``filename``, unchanged.

    Raises:
        FileNotFoundError: If the base ``config.yaml`` does not exist.
        KeyError: If the base config lacks the ``md_params``,
            ``seed_generation`` or ``ace_model.pacemaker_config.potential``
            section.
    """
    # Load base config
    base_path = Path("config.yaml")
    if not base_path.exists():
        raise FileNotFoundError("Base config.yaml not found!")

    with open(base_path, "r") as f:
        base_config = yaml.safe_load(f)

    # Materialise once so a one-shot iterable is not exhausted by the first
    # comprehension below.
    elements = list(elements)

    # Update MD parameters.
    # Each section gets its own copy of the element list: storing the same
    # list object twice makes yaml.dump emit an anchor/alias pair
    # (&id001 / *id001) instead of two plain lists.
    md_params = base_config["md_params"]
    md_params["elements"] = list(elements)

    # Set atomic masses (approximate values)
    mass_map = {
        "Fe": 55.845, "Pt": 195.084,
        "Mg": 24.305, "O": 15.999,
    }
    unknown = [el for el in elements if el not in mass_map]
    if unknown:
        # Keep the historical 1.0 fallback, but make it visible: a mass of
        # 1.0 is a placeholder, not a physical value.
        logging.getLogger("InterfaceTutorial").warning(
            "No atomic mass known for %s; falling back to 1.0", unknown
        )
    md_params["masses"] = {el: mass_map.get(el, 1.0) for el in elements}

    # Update Seed Generation parameters
    seed_params = base_config["seed_generation"]
    seed_params["crystal_type"] = crystal_type
    seed_params["n_random_structures"] = 10  # Reduced for tutorial speed
    seed_params["n_samples_for_dft"] = 5     # Reduced for tutorial speed
    seed_params["exploration_temperatures"] = [300.0]

    # Update Pacemaker parameters
    potential = base_config["ace_model"]["pacemaker_config"]["potential"]
    potential["elements"] = list(elements)
    potential["embeddings"] = {
        el: {"npot": "FinnisSinclair", "fs_parameters": [1, 1, 1, 0.5], "ndensity": 1}
        for el in elements
    }

    # Save to file
    with open(filename, "w") as f:
        yaml.dump(base_config, f)

    return filename

def run_seed_generation(elements, crystal_type, phase_name):
    """
    Run seed generation for one phase and rename its outputs per phase.

    Writes ``config_{phase_name}.yaml`` via ``create_config``, runs
    ``SeedGenerator``, then moves ``data/seed/seed_potential.yace`` to
    ``data/{phase_name}_potential.yace`` and the newest seed dataset written
    during this run to ``data/{phase_name}_dataset.pckl.gzip``.

    An exception raised by ``generator.run()`` is logged and swallowed so the
    tutorial can continue when the workers are offline.

    Args:
        elements: Chemical symbols for this phase, e.g. ``["Fe", "Pt"]``.
        crystal_type: Crystal type forwarded to ``create_config``.
        phase_name: Short tag used in the config and output file names.

    Returns:
        Tuple ``(potential_path, dataset_path)``. ``potential_path`` is the
        target path whether or not a potential was actually produced;
        ``dataset_path`` is ``None`` when this run produced no dataset.
    """
    logger.info(f"--- Starting Phase: {phase_name} ---")
    config_file = f"config_{phase_name}.yaml"
    create_config(config_file, elements, crystal_type)

    # Initialize Config object
    config = Config.from_yaml(config_file)

    # We assume 'constant.yaml' serves as the meta_config or base reference
    meta_config_path = Path("constant.yaml")
    if not meta_config_path.exists():
        # Create a dummy if it doesn't exist for the tutorial to run without errors
        with open(meta_config_path, 'w') as f:
            yaml.dump({'lj_params': {'epsilon': 1.0, 'sigma': 2.0}}, f)

    # Snapshot the seed datasets that already exist. Only the newest dataset
    # is moved away after each phase, so older files can linger in data/;
    # without this snapshot a failed run would adopt such a leftover and
    # label it with the wrong phase name.
    data_dir = Path("data")
    dataset_pattern = "seed_dataset_*.pckl.gzip"
    preexisting = {
        path: path.stat().st_mtime_ns for path in data_dir.glob(dataset_pattern)
    }

    generator = SeedGenerator(config, Path(config_file), meta_config_path)

    try:
        # Run generation
        generator.run()
    except Exception as e:
        # Note: In a real run without Docker/Workers set up, this will fail.
        # We catch it here to allow the tutorial flow to proceed conceptually.
        logger.error(f"Seed generation failed for {phase_name}: {e}")

    # Rename outputs to avoid overwriting
    output_pot = Path(f"data/{phase_name}_potential.yace")
    source_pot = Path("data/seed/seed_potential.yace")

    if source_pot.exists():
        shutil.move(source_pot, output_pot)
        logger.info(f"Saved potential to {output_pot}")
    else:
        logger.warning(f"No potential generated for {phase_name} (expected if workers are offline).")

    # Find and rename the dataset: keep only files that are new, or that were
    # rewritten (modification time changed), since the snapshot above.
    found = list(data_dir.glob(dataset_pattern))
    fresh = [
        path for path in found
        if preexisting.get(path) != path.stat().st_mtime_ns
    ]
    if len(fresh) < len(found):
        logger.warning(
            f"Ignoring {len(found) - len(fresh)} seed dataset(s) that predate the {phase_name} run."
        )

    target_dataset = None
    if fresh:
        # Sort by modification time to get the latest
        latest_dataset = max(fresh, key=os.path.getmtime)
        target_dataset = data_dir / f"{phase_name}_dataset.pckl.gzip"
        shutil.move(latest_dataset, target_dataset)
        logger.info(f"Saved dataset to {target_dataset}")

    return output_pot, target_dataset


In [None]:
# Seed the FePt phase: Fe and Pt with the "metallic" crystal type
fept_pot, fept_data = run_seed_generation(
    elements=["Fe", "Pt"],
    crystal_type="metallic",
    phase_name="fept",
)


## Step 2: Learn MgO (Ionic Phase)

Now we repeat the process for Mg and O using the "ionic" crystal type.


In [None]:
# Seed the MgO phase: Mg and O with the "ionic" crystal type
mgo_pot, mgo_data = run_seed_generation(
    elements=["Mg", "O"],
    crystal_type="ionic",
    phase_name="mgo",
)


## Step 3: Interface Construction & Knowledge Transfer

We use the `InterfaceBuilder` to generate diverse interface structures between FePt and MgO. Then we label these structures using DFT, merge the datasets, and train a unified interface potential.


In [None]:
logger.info("--- Starting Interface Construction ---")

# Reset this cell's outputs up front. Both names are read by Step 4, so they
# must exist on every path through the cell, and a re-run of the notebook
# must not leave values from an earlier execution behind.
interface_structures = []
interface_pot_path = None

if not fept_data or not mgo_data:
    # Step 1 or Step 2 produced no dataset, so there is nothing to build an
    # interface from. Nothing is created here; Step 4 falls back to its own
    # placeholders. (In a real scenario, you would debug why generation failed.)
    logger.warning("Skipping interface construction due to missing data from previous steps.")
else:
    # Load the per-phase seed datasets written in Steps 1 and 2
    df_fept = pd.read_pickle(fept_data, compression="gzip")
    df_mgo = pd.read_pickle(mgo_data, compression="gzip")

    # Pick a sample structure from each (the last row of each dataset)
    struct_fept = df_fept.iloc[-1]["ase_atoms"]
    struct_mgo = df_mgo.iloc[-1]["ase_atoms"]

    # Initialize InterfaceBuilder
    builder = InterfaceBuilder(struct_fept, struct_mgo)

    # Generate structures (Strain, Twist, Intermix)
    interface_structures = builder.generate_all()
    logger.info(f"Generated {len(interface_structures)} interface structures.")

    # Save candidates for labeling. The bare file name is kept separately
    # because the DFT worker is handed names relative to its data directory.
    candidates_name = "interface_candidates.xyz"
    interface_file = f"data/{candidates_name}"
    write(interface_file, interface_structures)

    # --- Labeling ---
    interface_elements = ["Fe", "Pt", "Mg", "O"]
    interface_config_file = "config_interface.yaml"
    create_config(interface_config_file, interface_elements, "metallic")

    dft_worker = DftWorker(Path("data"))
    labeled_interface_file = "interface_labeled.xyz"
    labeled_interface_path = Path("data") / labeled_interface_file

    logger.info("Labeling interface structures...")
    # dft_worker.label calls the DFT container
    try:
        dft_worker.label(interface_config_file, "constant.yaml", candidates_name, labeled_interface_file)
    except Exception as e:
        logger.error(f"Labeling failed: {e}")

    # --- Data Merging ---
    if labeled_interface_path.exists():
        labeled_interface_atoms = read(labeled_interface_path, index=":")
        # NOTE(review): df_interface carries only an "ase_atoms" column. If
        # the seed datasets hold further label columns (e.g. energy/forces),
        # the interface rows will be NaN there after concat — confirm that
        # PaceWorker.train derives the labels from the atoms objects.
        df_interface = pd.DataFrame({"ase_atoms": labeled_interface_atoms})

        logger.info("Merging datasets (FePt + MgO + Interface)...")
        combined_df = pd.concat([df_fept, df_mgo, df_interface], ignore_index=True)

        combined_dataset_path = Path("data/combined_interface.pckl.gzip")
        combined_df.to_pickle(combined_dataset_path, compression="gzip")

        # --- Train Interface Potential ---
        logger.info("Training Interface Potential...")
        pace_worker = PaceWorker(Path("data"))

        try:
            interface_pot_name = pace_worker.train(
                interface_config_file,
                "constant.yaml",
                combined_dataset_path.name,
                iteration=1
            )
            interface_pot_path = Path("data") / interface_pot_name
            logger.info(f"Interface Potential trained: {interface_pot_path}")
        except Exception as e:
            logger.error(f"Training failed: {e}")
            interface_pot_path = None
    else:
        logger.warning("No labeled interface data found. Skipping merging/training.")


## Step 4: Active Learning Loop

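Finally, we configure the `ActiveLearningOrchestrator` to start with the "Interface Potential" created in Step 3. If Step 3 produced no potential or no interface structures, the cell falls back to placeholder file names so that initialization can still be attempted. The cell only initializes the orchestrator; uncomment `orchestrator.run()` to actually start the loop.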


In [None]:
logger.info("--- Starting Active Learning Loop ---")

# Write a four-element config for the AL run, then read it back as a plain
# dict so the AL-specific entries can be patched in below.
al_config_file = "config_al.yaml"
create_config(al_config_file, ["Fe", "Pt", "Mg", "O"], "metallic")

with open(al_config_file, "r") as f:
    al_config = yaml.safe_load(f)

# Initial potential: the Step 3 result when it exists on disk, else a placeholder name
trained_potential = locals().get("interface_pot_path")
if trained_potential and trained_potential.exists():
    al_config["al_params"]["initial_potential"] = trained_potential.name
else:
    logger.warning("Using default/dummy potential path for AL initialization.")
    al_config["al_params"]["initial_potential"] = "dummy_potential.yace"

# Initial structure: the first generated interface when available, else a
# placeholder file so the factory doesn't crash during init
initial_struct_name = "initial_interface.data"
initial_struct_path = Path("data") / initial_struct_name
if locals().get("interface_structures"):
    # NOTE(review): no species order is passed to the LAMMPS writer here —
    # confirm that its atom-type numbering matches the element order the
    # pipeline uses for the potential.
    write(initial_struct_path, interface_structures[0], format="lammps-data")
else:
    initial_struct_path.write_text("Dummy LAMMPS data file")
al_config["md_params"]["initial_structure"] = initial_struct_name

# Persist the patched config
with open(al_config_file, "w") as f:
    yaml.dump(al_config, f)

# Build the components; any failure is logged rather than raised
try:
    config_al = Config.from_yaml(al_config_file)
    component_factory = ComponentFactory(config_al)

    # AL service and explorer feed the orchestrator
    al_service = ActiveLearningService(
        config_al,
        Path(al_config_file),
        Path("constant.yaml"),
    )
    explorer = component_factory.create_explorer()

    orchestrator = ActiveLearningOrchestrator(config_al, al_service, explorer)

    logger.info("ActiveLearningOrchestrator initialized successfully.")
    # To run the loop:
    # orchestrator.run()
except Exception as e:
    logger.error(f"Failed to initialize AL Orchestrator: {e}")
