# NewAIBench Experiment Runner Tutorial

Notebook này sẽ hướng dẫn bạn từng bước để chạy experiment với NewAIBench framework, từ việc pull git repository đến khi hoàn thành experiment.

## Tổng quan
- Clone repository và cập nhật code mới nhất
- Cài đặt dependencies
- Tạo custom YAML configuration
- Chạy experiment và phân tích kết quả

⚠️ **Lưu ý**: Notebook này được thiết kế để chạy trong environment đã có Python và Git.

In [None]:
import os
import sys
import subprocess
import yaml
import json
from pathlib import Path
import datetime
from IPython.display import Markdown, display

# Set up the working directory: every later cell uses paths relative to it.
# NOTE(review): this absolute path is machine-specific; os.chdir raises if the
# directory does not exist, so adjust it to your own checkout.
WORK_DIR = "/home/hkduy/NewAI/new_bench"
os.chdir(WORK_DIR)
print(f"Working directory: {os.getcwd()}")

## 1. Clone Repository và Cập nhật Code

Đầu tiên, chúng ta sẽ clone repository (nếu chưa có) hoặc pull code mới nhất từ remote repository.

In [None]:
# Check whether the working directory is already a git checkout. If it is,
# pull the latest changes, but only when the working tree is clean so that
# local edits are never clobbered by the pull.
if os.path.exists('.git'):
    print("‚úÖ Git repository ƒë√£ t·ªìn t·∫°i")
    try:
        # --porcelain prints one line per changed/untracked path and nothing
        # at all when the tree is clean, so empty stdout means "safe to pull".
        result = subprocess.run(['git', 'status', '--porcelain'],
                                capture_output=True, text=True, check=True)
        if result.stdout.strip():
            print("‚ö†Ô∏è C√≥ changes ch∆∞a commit:")
            print(result.stdout)
            print("\nB·∫°n c√≥ th·ªÉ stash changes tr∆∞·ªõc khi pull:")
            print("git stash")
        else:
            print("Working directory clean, pulling latest changes...")
            pull_result = subprocess.run(['git', 'pull'],
                                         capture_output=True, text=True, check=True)
            print(f"‚úÖ Git pull completed: {pull_result.stdout}")
    except subprocess.CalledProcessError as e:
        print(f"‚ùå Git operation failed: {e}")
        # stderr was captured above, so git's own explanation is lost unless
        # it is echoed here (the exception text only carries the exit code).
        if e.stderr:
            print(f"Error output: {e.stderr}")
    except FileNotFoundError as e:
        # Raised by subprocess when the git executable itself is missing.
        print(f"‚ùå Git operation failed: {e}")
else:
    print("‚ùå Kh√¥ng t√¨m th·∫•y git repository")
    print("N·∫øu b·∫°n c·∫ßn clone repository, uncomment v√† ch·∫°y l·ªánh d∆∞·ªõi ƒë√¢y:")
    print("# git clone <repository_url> .")

In [None]:
# Show the currently checked-out branch and the most recent commit.
# Both git calls share the same subprocess options: capture text output and
# raise CalledProcessError on a non-zero exit status.
_git_run_opts = {'capture_output': True, 'text': True, 'check': True}

try:
    branch_result = subprocess.run(['git', 'branch', '--show-current'], **_git_run_opts)
    current_branch = branch_result.stdout.strip()
    print(f"üîÑ Current branch: {current_branch}")

    # One-line summary of the latest commit
    commit_result = subprocess.run(['git', 'log', '-1', '--oneline'], **_git_run_opts)
    latest_commit = commit_result.stdout.strip()
    print(f"üìù Latest commit: {latest_commit}")
except subprocess.CalledProcessError as git_error:
    print(f"‚ùå Cannot get git info: {git_error}")

## 2. Cài đặt Dependencies

Cài đặt tất cả các dependencies cần thiết cho NewAIBench framework.

In [None]:
# Report which interpreter is running the notebook.
print(f"üêç Python version: {sys.version}")
print(f"üìÇ Python executable: {sys.executable}")

# Preview requirements.txt when the repository provides one.
if not os.path.exists('requirements.txt'):
    print("‚ùå requirements.txt not found")
else:
    print("‚úÖ Found requirements.txt")
    with open('requirements.txt', 'r') as f:
        requirements = f.read()
    print("\nüìã Requirements:")
    # Long files are cut to their first 500 characters followed by an ellipsis.
    if len(requirements) > 500:
        print(requirements[:500] + "...")
    else:
        print(requirements)

In [None]:
# Install dependencies: upgrade pip, install requirements.txt (if present) and
# install the package itself in editable mode (if setup.py is present).
# Every pip call captures its output so that, on failure, the handler below
# can show pip's actual error message instead of "None".
print("üîß Installing dependencies...")
try:
    # Upgrade pip first
    subprocess.run([sys.executable, '-m', 'pip', 'install', '--upgrade', 'pip'],
                   capture_output=True, text=True, check=True)
    print("‚úÖ Pip upgraded")

    # Install requirements
    if os.path.exists('requirements.txt'):
        result = subprocess.run([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'],
                                capture_output=True, text=True, check=True)
        print("‚úÖ Requirements installed successfully")
        if result.stderr:
            # pip writes warnings to stderr even on success; show the beginning only
            print(f"‚ö†Ô∏è Warnings: {result.stderr[:200]}...")

    # Install package in development mode
    if os.path.exists('setup.py'):
        subprocess.run([sys.executable, '-m', 'pip', 'install', '-e', '.'],
                       capture_output=True, text=True, check=True)
        print("‚úÖ Package installed in development mode")

except subprocess.CalledProcessError as e:
    print(f"‚ùå Installation failed: {e}")
    # CalledProcessError always has a .stderr attribute; it is None or empty
    # when nothing was captured, so test its value rather than its existence.
    print(f"Error output: {e.stderr or 'No error details'}")

In [None]:
# Verify that NewAIBench can be imported from this checkout and that the
# run_experiment.py entry script is present.
try:
    # Put ./src at the front of the import path unless it is already listed
    src_path = os.path.join(os.getcwd(), "src")
    if src_path not in sys.path:
        sys.path.insert(0, src_path)

    # Smoke-test the import of the two main experiment classes
    from newaibench.experiment import ExperimentRunner, ExperimentConfig
    print("‚úÖ NewAIBench import successful")

    # Report whether the CLI entry script exists in the working directory
    print(
        "‚úÖ run_experiment.py found"
        if os.path.exists('run_experiment.py')
        else "‚ùå run_experiment.py not found"
    )

except ImportError as import_error:
    print(f"‚ùå Import failed: {import_error}")
    print("C√≥ th·ªÉ c·∫ßn c√†i ƒë·∫∑t th√™m dependencies ho·∫∑c ki·ªÉm tra PYTHONPATH")

## 3. Tạo Custom YAML Configuration

Bây giờ chúng ta sẽ tạo một file cấu hình YAML tùy chỉnh cho experiment của bạn.

### 3.1 Xem các template có sẵn

In [None]:
# List the experiment configuration files available in the repository.
print("üìÇ Existing experiment configurations:")

examples_exp_dir = Path("examples/experiments")
exp_dir = Path("experiments")

# Each location gets its own heading; within a location YAML files are
# listed before JSON files.
for heading, config_dir in (
    ("\n1. In examples/experiments/:", examples_exp_dir),
    ("\n2. In experiments/:", exp_dir),
):
    print(heading)
    if not config_dir.exists():
        print("   (directory not found)")
        continue
    for pattern in ("*.yaml", "*.json"):
        for listed_file in config_dir.glob(pattern):
            print(f"   - {listed_file.name}")

In [None]:
# Generate a starter template with the run_experiment.py CLI tool.
print("üîß Creating a basic template...")
template_cmd = [
    sys.executable, 'run_experiment.py',
    '--create-template', 'basic',
    '--output-config', 'my_custom_experiment.yaml',
]
try:
    result = subprocess.run(template_cmd, capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as e:
    print(f"‚ùå Template creation failed: {e}")
    print(f"Error: {e.stderr}")
else:
    # Only reached when the CLI exited with status 0
    print("‚úÖ Template created successfully")
    print(f"Output: {result.stdout}")

In [None]:
# Print the content of the template file produced by the previous cell.
if not os.path.exists('my_custom_experiment.yaml'):
    print("‚ùå Template file not found")
else:
    print("üìù Generated template content:")
    with open('my_custom_experiment.yaml', 'r') as template_file:
        template_content = template_file.read()
    print(template_content)

### 3.2 Tùy chỉnh Configuration

Bây giờ chúng ta sẽ tùy chỉnh file YAML theo nhu cầu của bạn. Bạn có thể chỉnh sửa trực tiếp file hoặc tạo một file mới.

In [None]:
# Build the custom experiment configuration section by section, then write it
# to my_custom_experiment.yaml and echo the file back.

# Sparse BM25 retriever; BM25 does not need a model path, hence the empty string.
_bm25_model = {
    'name': 'bm25_model',
    'type': 'sparse',
    'model_name_or_path': '',
    'parameters': {'k1': 1.2, 'b': 0.75},
    'device': 'cpu',
    'batch_size': 32,
}

# Dataset that is already available under the working directory.
_legal_dataset = {
    'name': 'test_dataset',
    'type': 'text',
    'data_dir': './BKAI_law_data/newaibench_formatted_data/legal_data',
    'config_overrides': {'cache_enabled': True},
}

# The timestamp suffix keeps repeated runs from sharing an experiment name.
_run_stamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

custom_config = {
    'description': 'Custom experiment for testing NewAIBench',
    'models': [_bm25_model],
    'datasets': [_legal_dataset],
    'evaluation': {
        'metrics': ['ndcg', 'map', 'recall'],
        'k_values': [1, 5, 10],
        'top_k': 100,
        'save_run_file': True,
    },
    'output': {
        'output_dir': './results',
        'experiment_name': f'custom_experiment_{_run_stamp}',
        'log_level': 'INFO',
        'overwrite': True,
    },
}

# Save to the YAML file; sort_keys=False keeps the section order used above.
_config_path = Path('my_custom_experiment.yaml')
_config_path.write_text(
    yaml.dump(custom_config, default_flow_style=False, sort_keys=False)
)

print("‚úÖ Custom YAML configuration created:")
print(_config_path.read_text())

In [None]:
# Validate the configuration file by loading it through NewAIBench's loader.
print("üîç Validating configuration...")
try:
    from newaibench.experiment import load_experiment_config
    config = load_experiment_config('my_custom_experiment.yaml')
    print("‚úÖ Configuration file is valid")

    # Summary lines are evaluated lazily, one at a time, so a missing
    # attribute is reported only after the preceding lines were printed.
    print("\nüìä Configuration Summary:")
    summary_fields = (
        ("Models", lambda: len(config.models)),
        ("Datasets", lambda: len(config.datasets)),
        ("Metrics", lambda: ', '.join(config.evaluation.metrics)),
        ("Output", lambda: config.output.output_dir),
    )
    for label, read_value in summary_fields:
        print(f"   - {label}: {read_value()}")

except Exception as validation_error:
    print(f"‚ùå Configuration validation failed: {validation_error}")
    print("Vui l√≤ng ki·ªÉm tra l·∫°i file YAML")

### 3.3 Kiểm tra Dataset

Trước khi chạy experiment, hãy kiểm tra xem dataset có tồn tại và đúng format không.

In [None]:
# Check that the dataset directory exists and contains the expected files.
dataset_path = './BKAI_law_data/newaibench_formatted_data/legal_data'
print(f"üîç Checking dataset at: {dataset_path}")

if not os.path.exists(dataset_path):
    print("‚ùå Dataset directory not found")
    print("Available datasets in BKAI_law_data/:")
    # Help the user spot the right folder name by listing the parent directory
    if os.path.exists('./BKAI_law_data'):
        for item in Path('./BKAI_law_data').iterdir():
            print(f"   - {item.name}")
else:
    print("‚úÖ Dataset directory exists")
    dataset_root = Path(dataset_path)

    # Show at most the first 10 entries, then a count of what was left out
    contents = list(dataset_root.iterdir())
    print(f"üìÅ Dataset contains {len(contents)} items:")
    for item in contents[:10]:
        print(f"   - {item.name}")
    remaining = len(contents) - 10
    if remaining > 0:
        print(f"   ... and {remaining} more items")

    # Report presence (with size) or absence of each required file
    required_files = ['corpus.jsonl', 'queries.jsonl', 'qrels.jsonl']
    for req_file in required_files:
        file_path = dataset_root / req_file
        if not file_path.exists():
            print(f"‚ùå Missing {req_file}")
            continue
        print(f"‚úÖ Found {req_file} ({file_path.stat().st_size} bytes)")

## 4. Chạy Experiment

Bây giờ chúng ta sẽ chạy experiment với cấu hình đã tạo.

In [None]:
# Do a dry run first to validate the setup before the real experiment.
print("üß™ Running dry-run to validate setup...")
dry_run_cmd = [
    sys.executable, 'run_experiment.py',
    '--config', 'my_custom_experiment.yaml',
    '--dry-run',
]
try:
    result = subprocess.run(dry_run_cmd, capture_output=True, text=True, check=True)
except subprocess.CalledProcessError as e:
    print(f"‚ùå Dry-run failed: {e}")
    print(f"Error output: {e.stderr}")
    print("\nVui l√≤ng ki·ªÉm tra l·∫°i configuration v√† dataset")
else:
    # Only reached when the CLI exited with status 0
    print("‚úÖ Dry-run completed successfully")
    print("üìã Execution plan:")
    print(result.stdout)

In [None]:
# Run the actual experiment. The run is guarded by a flag so that executing
# this cell is harmless until the user opts in.
print("üöÄ Starting actual experiment...")
print("‚ö†Ô∏è Experiment n√†y c√≥ th·ªÉ m·∫•t v√†i ph√∫t ƒë·ªÉ ho√†n th√†nh")
print("\nN·∫øu b·∫°n mu·ªën ch·∫°y experiment, uncomment d√≤ng code d∆∞·ªõi ƒë√¢y:")
print("\n# Ch·∫°y experiment (uncomment ƒë·ªÉ th·ª±c thi)")
run_experiment = False  # Set to True to run the experiment right away

if run_experiment:
    import tempfile  # only needed on this path, so imported locally

    try:
        # stderr goes to a temporary file instead of a pipe: stdout is drained
        # to EOF before stderr is looked at, and an unread stderr PIPE would
        # block the child (and therefore this loop) once the OS pipe buffer
        # fills up.
        with tempfile.TemporaryFile(mode='w+') as stderr_file:
            with subprocess.Popen([sys.executable, 'run_experiment.py',
                                   '--config', 'my_custom_experiment.yaml'],
                                  stdout=subprocess.PIPE,
                                  stderr=stderr_file,
                                  text=True,
                                  bufsize=1) as process:
                print("Experiment ƒëang ch·∫°y...")

                # Real-time output: iterate stdout line by line until EOF
                for output in process.stdout:
                    print(output.strip())

                # stdout is closed, so the child is finishing; collect its status
                return_code = process.wait()

            # Rewind before reading what the child wrote to stderr
            stderr_file.seek(0)
            stderr_output = stderr_file.read()

        if return_code == 0:
            print("\n‚úÖ Experiment completed successfully!")
        else:
            print(f"\n‚ùå Experiment failed with return code {return_code}")
            if stderr_output:
                print(f"Error: {stderr_output}")

    except Exception as e:
        # Best-effort cell: report any launch/IO problem instead of raising
        print(f"‚ùå Failed to run experiment: {e}")
else:
    print("\nƒê·ªÉ ch·∫°y experiment, b·∫°n c√≥ th·ªÉ:")
    print("1. ƒê·∫∑t run_experiment = True ·ªü cell tr√™n")
    print("2. Ho·∫∑c ch·∫°y command line: python run_experiment.py --config my_custom_experiment.yaml")

### 4.1 Chạy Experiment Manually (Tùy chọn)

Nếu bạn muốn chạy experiment từ command line hoặc có kiểm soát nhiều hơn:

In [None]:
# Show the shell command that runs the experiment, plus common variations.
config_file = 'my_custom_experiment.yaml'
command = f"python run_experiment.py --config {config_file}"

print("üîß Command ƒë·ªÉ ch·∫°y experiment:")
print(f"cd {os.getcwd()}")
print(command)
print()
print("üìã C√°c options kh√°c:")
print(f"# Ch·∫°y dry-run: python run_experiment.py --config {config_file} --dry-run")
print(f"# V·ªõi log debug: python run_experiment.py --config {config_file} --log-level DEBUG")
print(f"# Overwrite results: python run_experiment.py --config {config_file} --overwrite")

# Copy the command to the clipboard when possible. This is a convenience only,
# so neither a missing package nor a missing clipboard backend may fail the cell.
try:
    import pyperclip
except ImportError:
    print("\nüí° Tip: Install pyperclip ƒë·ªÉ auto-copy command: pip install pyperclip")
else:
    try:
        pyperclip.copy(command)
    except pyperclip.PyperclipException as clipboard_error:
        # Raised when pyperclip finds no copy/paste mechanism, which is the
        # usual situation on a headless server or remote kernel.
        print(f"\nCould not copy command to clipboard: {clipboard_error}")
    else:
        print("\n‚úÖ Command copied to clipboard!")

## 5. Kiểm tra Kết quả

Sau khi experiment hoàn thành, chúng ta sẽ kiểm tra kết quả.

In [None]:
# Inspect the results directory: list every experiment folder (most recently
# modified first) with its contents and a summary of evaluation_results.json.
results_dir = './results'
print(f"üîç Checking results in: {results_dir}")

if not os.path.exists(results_dir):
    print("‚ùå Results directory not found")
    print("Experiment ch∆∞a ƒë∆∞·ª£c ch·∫°y ho·∫∑c ch∆∞a ho√†n th√†nh")
else:
    print("‚úÖ Results directory exists")

    # Every sub-directory of the results folder counts as one experiment
    experiments = list(filter(Path.is_dir, Path(results_dir).iterdir()))
    print(f"\nüìä Found {len(experiments)} experiment(s):")

    for exp_dir in sorted(experiments, key=lambda x: x.stat().st_mtime, reverse=True):
        print(f"\nüìÅ {exp_dir.name}:")
        contents = list(exp_dir.iterdir())
        for item in contents:
            if item.is_file():
                print(f"   üìÑ {item.name} ({item.stat().st_size} bytes)")
            else:
                print(f"   üìÅ {item.name}/")

        # Summarise the evaluation results when the experiment produced any
        eval_file = exp_dir / 'evaluation_results.json'
        if not eval_file.exists():
            continue
        print("   ‚úÖ Found evaluation results")
        try:
            with open(eval_file, 'r') as f:
                eval_data = json.load(f)
            print(f"   üìà {len(eval_data)} result(s) available")
        except Exception as e:
            print(f"   ‚ùå Could not read evaluation results: {e}")

In [None]:
# Load and display detailed results (if any)
def display_evaluation_results(results_dir):
    """Print the metrics of the most recently modified experiment.

    Scans the sub-directories of ``results_dir``, selects the one with the
    newest modification time and prints the content of its
    ``evaluation_results.json``. Each entry of that file is read with
    ``.get`` for the keys ``model``, ``dataset`` and ``metrics``; a metric
    value is printed either as a single number or, when it is a dict, as one
    ``@key`` line per item.

    Any exception raised while reading or printing the file is reported on
    stdout rather than propagated. Returns ``None``.
    """
    candidates = list(filter(Path.is_dir, Path(results_dir).iterdir()))
    if not candidates:
        print("‚ùå No experiment results found")
        return

    # The newest directory (by mtime) is taken to be the latest experiment
    newest = max(candidates, key=lambda entry: entry.stat().st_mtime)
    print(f"üìä Latest experiment: {newest.name}")

    report_path = newest / 'evaluation_results.json'
    if not report_path.exists():
        print("‚ùå No evaluation_results.json found")
        return

    try:
        with open(report_path, 'r') as handle:
            entries = json.load(handle)

        print("\nüìà Evaluation Results:")
        print("=" * 50)

        for entry in entries:
            model_name, dataset_name, metrics = (
                entry.get('model', 'Unknown'),
                entry.get('dataset', 'Unknown'),
                entry.get('metrics', {}),
            )

            print(f"\nü§ñ Model: {model_name}")
            print(f"üìö Dataset: {dataset_name}")
            print("üìä Metrics:")

            for metric_name, scores in metrics.items():
                print(f"   {metric_name}:")
                if not isinstance(scores, dict):
                    # Single score for this metric
                    print(f"     {scores:.4f}")
                    continue
                # One score per key of the nested dict
                for cutoff, score in scores.items():
                    print(f"     @{cutoff}: {score:.4f}")

    except Exception as read_error:
        print(f"‚ùå Error reading results: {read_error}")

# Only try to display results once the results directory exists.
if not os.path.exists(results_dir):
    print("Ch∆∞a c√≥ k·∫øt qu·∫£ ƒë·ªÉ hi·ªÉn th·ªã")
else:
    display_evaluation_results(results_dir)

## 6. Tóm tắt và Bước tiếp theo

🎉 **Chúc mừng!** Bạn đã hoàn thành tutorial sử dụng NewAIBench.

### Những gì đã làm:
1. ✅ Clone/update repository
2. ✅ Cài đặt dependencies
3. ✅ Tạo custom YAML configuration
4. ✅ Chạy experiment (hoặc chuẩn bị để chạy)
5. ✅ Kiểm tra kết quả

### Bước tiếp theo:
1. **Tùy chỉnh thêm**: Chỉnh sửa file YAML để thử nghiệm với các models và datasets khác
2. **Phân tích kết quả**: Sử dụng các tools có sẵn để phân tích kết quả chi tiết
3. **Chạy nhiều experiments**: So sánh hiệu suất của các models khác nhau
4. **Tối ưu hóa**: Điều chỉnh parameters để cải thiện kết quả

### Tài liệu tham khảo:
- 📚 **Documentation**: `docs/` directory
- 🧪 **Examples**: `examples/` directory  
- ⚙️ **Configuration**: `examples/experiments/` directory

### Lưu ý quan trọng:
- ⚠️ Luôn chạy `--dry-run` trước khi chạy experiment thực tế
- 💾 Backup kết quả quan trọng
- 🔧 Kiểm tra logs nếu có lỗi
- 📊 So sánh kết quả từ nhiều experiments khác nhau

In [None]:
# 🛠️ Utility functions for later use

def quick_experiment(model_type, dataset_path, experiment_name=None):
    """Write a minimal experiment config file and print how to run it.

    This helper only creates ``<experiment_name>.yaml`` in the current
    working directory; it does not launch the experiment itself.

    Args:
        model_type: Value stored as the model's ``type``; also used to build
            the model name and the default experiment name.
        dataset_path: Value stored as the dataset's ``data_dir``.
        experiment_name: Name of the experiment and stem of the config file.
            When empty or ``None`` a name of the form
            ``quick_<model_type>_<YYYYmmdd_HHMMSS>`` is generated.

    Returns:
        The name of the YAML file that was written.
    """
    if not experiment_name:
        # Date and time, as used for the main tutorial config, so that names
        # generated on different days cannot collide and overwrite each other.
        stamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        experiment_name = f"quick_{model_type}_{stamp}"

    config = {
        'description': f'Quick {model_type} experiment',
        'models': [{
            'name': f'{model_type}_model',
            'type': model_type,
            'model_name_or_path': '',
            'device': 'auto'
        }],
        'datasets': [{
            'name': 'dataset',
            'type': 'text',
            'data_dir': dataset_path
        }],
        'evaluation': {
            'metrics': ['ndcg', 'map'],
            'k_values': [1, 5, 10],
            'top_k': 100
        },
        'output': {
            'output_dir': './results',
            'experiment_name': experiment_name,
            'log_level': 'INFO'
        }
    }

    config_file = f'{experiment_name}.yaml'
    with open(config_file, 'w') as f:
        # sort_keys=False keeps the sections in the order written above,
        # matching how the main tutorial config is dumped.
        yaml.dump(config, f, default_flow_style=False, sort_keys=False)

    print(f"‚úÖ Created config: {config_file}")
    print(f"‚ñ∂Ô∏è To run: python run_experiment.py --config {config_file}")
    return config_file

def list_recent_experiments(n=5):
    """Print up to ``n`` experiment directories under ``./results``, newest first.

    Directories are ordered by modification time (most recent first) and each
    line shows a 1-based rank, the directory name and its modification time
    formatted as ``YYYY-mm-dd HH:MM``. Prints a notice and returns early when
    ``./results`` does not exist. Returns ``None``.
    """
    if not os.path.exists('./results'):
        print("‚ùå No results directory found")
        return

    by_recency = sorted(
        (entry for entry in Path('./results').iterdir() if entry.is_dir()),
        key=lambda entry: entry.stat().st_mtime,
        reverse=True,
    )

    print(f"üìä {min(n, len(by_recency))} most recent experiments:")
    for rank, exp in enumerate(by_recency[:n], start=1):
        modified = datetime.datetime.fromtimestamp(exp.stat().st_mtime)
        print(f"  {rank}. {exp.name} ({modified.strftime('%Y-%m-%d %H:%M')})")

print("üîß Utility functions loaded:")
print("  - quick_experiment(model_type, dataset_path, experiment_name=None)")
print("  - list_recent_experiments(n=5)")
print("\nV√≠ d·ª•:")
print("  quick_experiment('sparse', './BKAI_law_data/newaibench_formatted_data/legal_data')")
print("  list_recent_experiments()")