# Colab settings

In [None]:
# Mount Google Drive at /content/drive. The `google.colab` import means this
# cell is expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os

# Project folder on the mounted Drive; later cells use paths relative to the
# working directory selected here (llvm_pass/, data/, ml_core/, ...).
drive_path = '/content/drive/MyDrive/Inferencja'
if os.path.exists(drive_path):
    # IPython magic: changes the kernel's working directory; {drive_path} is
    # interpolated from the Python variable above.
    %cd {drive_path}
    print(f"Successfully changed directory to {drive_path}")
else:
    # Only reports the problem: nothing is raised, so execution continues in
    # whatever directory the kernel is currently in.
    print(f"Error: The directory {drive_path} does not exist.")

# Imports and Utility Functions

In [None]:
import os
import subprocess
import sys

def run(cmd):
    """Execute ``cmd`` through the shell and stop on failure.

    The command line is echoed before it runs. A zero exit status returns
    ``None``; any other status prints an error line and terminates via
    ``sys.exit`` with that same status.

    Args:
        cmd: Shell command line, passed to ``subprocess.call`` with
            ``shell=True``.

    Raises:
        SystemExit: When the command exits with a non-zero status.
    """
    print(f"==> Running: {cmd}")
    exit_status = subprocess.call(cmd, shell=True)

    # Guard clause: success needs no further handling.
    if exit_status == 0:
        return

    print(f"Error running command: {cmd}")
    sys.exit(exit_status)


# Environment Setup (LLVM 17 + Python Dependencies)

In [None]:
def setup_environment():
    """Install LLVM 17, the Python ML dependencies and the native build tools.

    Probes for ``llvm-config-17``. When the probe fails, the apt.llvm.org
    installer script is downloaded and run for version 17, and the unversioned
    ``clang``, ``clang++``, ``llvm-config`` and ``opt`` names in ``/usr/bin``
    are symlinked to the ``-17`` binaries. The pip packages and
    ``cmake`` / ``build-essential`` are installed on every call.

    All shell commands go through ``run``, so the first failing command ends
    the setup.
    """
    try:
        subprocess.check_call(
            "llvm-config-17 --version",
            shell=True,
            stdout=subprocess.DEVNULL,
            # A missing binary makes the shell print "not found"; that is the
            # expected negative probe result, not something to show the user.
            stderr=subprocess.DEVNULL,
        )
        llvm_present = True
    except (subprocess.CalledProcessError, OSError):
        # Only a failed probe means "not installed". A bare `except:` here
        # would also swallow KeyboardInterrupt/SystemExit and start an install.
        llvm_present = False

    if llvm_present:
        print("LLVM 17 already installed.")
    else:
        print("Installing LLVM 17...")
        # -O overwrites any llvm.sh left by a previous run; without it wget
        # saves the fresh copy as llvm.sh.1 and the stale script gets executed.
        run("wget -O llvm.sh https://apt.llvm.org/llvm.sh")
        run("chmod +x llvm.sh")
        run("sudo ./llvm.sh 17 all")
        # Symlinks: expose the versioned tools under their unversioned names.
        run("sudo ln -sf /usr/bin/clang-17 /usr/bin/clang")
        run("sudo ln -sf /usr/bin/clang++-17 /usr/bin/clang++")
        run("sudo ln -sf /usr/bin/llvm-config-17 /usr/bin/llvm-config")
        run("sudo ln -sf /usr/bin/opt-17 /usr/bin/opt")

    # Install through `python3 -m pip` so the packages land in the same
    # interpreter that the later cells start with `python3 ...`.
    run("python3 -m pip install torch-geometric pytorch-lightning hydra-core wandb transformers datasets scikit-learn")
    run("sudo apt-get install -y cmake build-essential")

# Run the setup as soon as this cell executes.
setup_environment()

# Build the LLVM GraphExtractor Pass

In [None]:
def build_pass():
    """Build the LLVM pass found in ``./llvm_pass`` from a clean state.

    Must be called with the project root as the working directory. Any
    existing ``llvm_pass/build`` directory is deleted first, then
    ``build.sh`` is made executable and run from inside ``llvm_pass``.

    The original working directory is restored even when a build step fails.
    This matters in a notebook: the kernel survives the ``SystemExit`` raised
    by ``run``, and a leftover ``llvm_pass`` cwd would break every relative
    path used by later cells.

    Raises:
        SystemExit: With status 1 when ``llvm_pass`` is not a directory, or
            with the failing command's status when a build step fails.
    """
    import shutil

    project_root = os.getcwd()
    pass_dir = os.path.join(project_root, "llvm_pass")

    # isdir (not exists) so a stray file named llvm_pass gets the same
    # friendly message instead of an uncaught error from os.chdir.
    if not os.path.isdir(pass_dir):
        print("Error: llvm_pass directory not found. Are you in the project root?")
        sys.exit(1)

    os.chdir(pass_dir)
    try:
        # Clean build
        if os.path.exists("build"):
            shutil.rmtree("build")

        run("chmod +x build.sh")
        run("./build.sh")
    finally:
        os.chdir(project_root)

# Build the pass as soon as this cell executes (expects the project root as cwd).
build_pass()

# Data Pipeline (Download + Extract Graphs)

In [None]:
def run_pipeline():
    """Download POJ-104 and regenerate the processed validation graphs.

    Steps, all relative to the current working directory:

    1. Run ``data_pipeline/download_poj104.py``.
    2. Delete ``data/processed/val`` if it exists, so the output is rebuilt
       from scratch.
    3. Run ``data_pipeline/compile_and_extract.py`` on ``data/raw/poj104/val``
       with ``--jobs 4 --optimize`` and ``PYTHONPATH`` set to the working
       directory.

    Commands go through ``run``, so the first failing step ends the pipeline.
    """
    import shlex
    import shutil

    # Download POJ-104
    run("python3 data_pipeline/download_poj104.py")

    # Compile & Extract (Regenerate Data)
    print("Generating TOON graphs from C++ sources...")
    input_dir = "data/raw/poj104/val"
    output_dir = "data/processed/val"

    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)

    # Quote the path: the command is run by a shell, and a working directory
    # containing spaces (e.g. a Drive mounted as "My Drive") would otherwise be
    # split into a truncated PYTHONPATH plus a bogus command name.
    env_vars = f"PYTHONPATH={shlex.quote(os.getcwd())}"
    run(f"{env_vars} python3 data_pipeline/compile_and_extract.py --input {input_dir} --output {output_dir} --jobs 4 --optimize")

# Run the data pipeline as soon as this cell executes (expects the project root as cwd).
run_pipeline()

# Train the Hybrid GNN + BERT Model (GPU)

In [None]:
# Hybrid run: use_bert=True, batch size 32, 5 epochs, graphs read from data/processed/val/graphs.
# The key=value arguments look like Hydra overrides (a leading "+" adds a key that the
# base config does not define) — confirm against ml_core/train.py.
!PYTHONPATH=. python3 ml_core/train.py model.data_dir=data/processed/val/graphs model.use_bert=True model.batch_size=32 train.epochs=5 +trainer.accelerator=gpu

# Generate t-SNE visualisation for GNN + BERT

In [None]:
# The checkpoint glob is quoted, so the shell passes it through unexpanded;
# visualize.py presumably resolves it itself — confirm.
# NOTE(review): assumes the hybrid training run wrote its checkpoints under ./lightning_logs.
!PYTHONPATH=. python3 ml_core/visualize.py --ckpt "lightning_logs/*/checkpoints/*.ckpt" --data data/processed/val/graphs --output tsne_result.png

# Train the Baseline (No CodeBERT)

In [None]:
# Baseline run: use_bert=False, batch size 64, 5 epochs, same graph directory as the hybrid run.
# hydra.run.dir=outputs/baseline sets the run directory; the baseline t-SNE cell
# looks for checkpoints under outputs/baseline/lightning_logs.
!PYTHONPATH=. python3 ml_core/train.py model.data_dir=data/processed/val/graphs   model.use_bert=False   model.batch_size=64   train.epochs=5   hydra.run.dir=outputs/baseline +trainer.accelerator=gpu

# Generate t-SNE visualisation for Baseline

In [None]:
# Same visualisation script as for the hybrid model, pointed at the baseline
# checkpoints under outputs/baseline; writes tsne_baseline.png.
!PYTHONPATH=. python3 ml_core/visualize.py --ckpt "outputs/baseline/lightning_logs/*/checkpoints/*.ckpt" --data data/processed/val/graphs  --output tsne_baseline.png

# Comparison

In [None]:
# Uses the same checkpoint pattern as the earlier hybrid t-SNE cell but writes
# tsne_hybrid.png, presumably to sit next to tsne_baseline.png for comparison.
!PYTHONPATH=. python3 ml_core/visualize.py  --ckpt "lightning_logs/*/checkpoints/*.ckpt" --data data/processed/val/graphs  --output tsne_hybrid.png