In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
import os
from llama import Workflow, Llama
from llama.util import find_free_port

os.environ["RANK"] = "0"
os.environ["WORLD_SIZE"] = "1"
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = str(find_free_port())

workflow = Workflow.build(
    ckpt_dir='/scratch4/jeisner1/tjbai/llama_8b',
    tokenizer_path='/scratch4/jeisner1/tjbai/llama_8b/tokenizer.model',
    max_seq_len=2*8192,
    max_batch_size=5,
    model_parallel_size=1,
    max_nodes=100,
)

llama = Llama(workflow.model, workflow.tokenizer)



> initializing model parallel with size 1
> initializing ddp with size 1
> initializing pipeline with size 1
Loaded in 45.66 seconds


In [9]:
from operator import itemgetter as get

from llama.workflows.tot import (
    load_math_problems,
    cot_prompt,
    format_problem,
    format_vote_system_prompt,
    finish_prompt
)

def baseline(
    workflow,
    problem,
    branching_factor=8,
    voters=4,
    temperature=0.7,
    top_p=1.0,
):
    workflow.reset()
    insert_time = []
    step_time = []
    ttft_time = []
    total_ttft = 0
    total_tokens = 0
    force_tokens = []
    s = time.time()

    [cot] = workflow.insert([
        {'messages': [
            {'role': 'system', 'content': cot_prompt},
            {'role': 'user', 'content': format_problem(problem)}
        ], 'parent_ids': []},
    ], time_buffer=insert_time)
    proposal_tokens, proposal_nodes = get('tokens', 'nodes')(workflow.step([
            {'header': ('assistant', None),
            'prefill': '',
            'parent_ids': [cot['id']]}
            for i in range(branching_factor)
        ],
        compact=False,
        max_gen_len=512,
        temperature=temperature,
        top_p=top_p,
        seed=42,
        time_buffer=step_time,
        ttft_buffer=ttft_time,
    ))
    total_ttft += insert_time[-1] + ttft_time[-1]
    total_tokens += sum(len(a) for a in proposal_tokens)
    force_tokens.append(max(len(p) for p in proposal_tokens))

    vote_user_prompt = f'{format_problem(problem)}\n\nHere are the proposals:'
    for i, prop in enumerate(proposal_tokens):
        vote_user_prompt += f'\n\nSolution #{i+1}:\n{workflow.tokenizer.decode(prop)}'
    [vote] = workflow.insert([
        {'messages': [
            {'role': 'system', 'content': format_vote_system_prompt(branching_factor)},
            {'role': 'system', 'content': vote_user_prompt}
        ], 'parent_ids': []}
    ], time_buffer=insert_time)
    vote_tokens = get('tokens')(workflow.step([
            {'header': ('assistant', None),
            'prefill': 'BEST CHOICE: ',
            'parent_ids': [vote['id']]}
            for _ in range(voters)
        ],
        compact=False,
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
        seed=42,
        time_buffer=step_time,
        ttft_buffer=ttft_time
    ))
    total_ttft += insert_time[-1] + ttft_time[-1]
    total_tokens += sum(len(a) for a in vote_tokens)
    force_tokens.append(max(len(p) for p in vote_tokens))

    # doesn't matter which is best, we should just simulate always
    best_proposal = workflow.tokenizer.decode(proposal_tokens[0])
    [finish] = workflow.insert([
        {'messages': [
            {"role": "system", "content": finish_prompt},
            {"role": "user", "content": f"{format_problem(problem)}\n\nHere is the proposed approach: {best_proposal}"}],
        'parent_ids': []
    }], time_buffer=insert_time)
    [final_tokens] = get('tokens')(workflow.step([
            {'header': ('assistant', None),
            'prefill': '',
            'parent_ids': [finish['id']]}
        ],
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
        time_buffer=step_time,
        ttft_buffer=ttft_time
    ))
    total_ttft += insert_time[-1] + ttft_time[-1]
    total_tokens += len(final_tokens)
    force_tokens.append(len(final_tokens))

    return {
        'wall_time': time.time() - s,
        'cuda_time': sum(insert_time) + sum(step_time),
        'ttft': total_ttft,
        'tokens': total_tokens,
        'force_tokens': force_tokens
    }

def cached(
    workflow,
    problem,
    branching_factor=8,
    voters=4,
    temperature=0.7,
    top_p=1.0,
    force_tokens=[],
):
    workflow.reset()
    assert len(force_tokens) == 3

    insert_time = []
    step_time = []
    ttft_time = []
    total_ttft = 0
    s = time.time()

    cot, vote, finish = workflow.insert([
        {'messages': [
            {'role': 'system', 'content': cot_prompt},
            {'role': 'user', 'content': format_problem(problem)}
        ], 'parent_ids': []},
        {'messages': [
            {'role': 'system', 'content': format_vote_system_prompt(branching_factor)},
            {'role': 'user', 'content': format_problem(problem)}
        ], 'parent_ids': []},
        {'messages': [
            {'role': 'system', 'content': finish_prompt},
            {'role': 'user', 'content': format_problem(problem)}
        ], 'parent_ids': []},
    ], time_buffer=insert_time)

    proposal_tokens, proposal_nodes = get('tokens', 'nodes')(workflow.step([
            {'header': ('assistant', None),
            'prefill': f'Solution #{i+1}:\n\n',
            'parent_ids': [cot['id']]}
            for i in range(branching_factor)
        ],
        compact=False,
        max_gen_len=512,
        temperature=temperature,
        top_p=top_p,
        force_tokens=force_tokens.pop(0),
        time_buffer=step_time,
        ttft_buffer=ttft_time,
    ))
    total_ttft += insert_time[-1] + ttft_time[-1]

    vote_tokens, vote_nodes = get('tokens', 'nodes')(workflow.step([
            {'header': ('assistant', None),
            'prefill': 'BEST CHOICE: ',
            'parent_ids': [vote['id']] + [p['id'] for p in proposal_nodes]}
            for _ in range(voters)
        ],
        stateless=False,
        compact=False,
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
        force_tokens=force_tokens.pop(0),
        time_buffer=step_time,
        ttft_buffer=ttft_time,
    ))
    total_ttft += ttft_time[-1]

    [final_tokens] = get('tokens')(workflow.step([
            {'header': ('assistant', None),
            'prefill': 'ANSWER: ',
            'parent_ids': [finish['id']] + [proposal_nodes[0]['id']]}
        ],
        stateless=False,
        max_gen_len=256,
        temperature=temperature,
        top_p=top_p,
        time_buffer=step_time,
        ttft_buffer=ttft_time,
        force_tokens=force_tokens.pop(0)
    ))
    total_ttft += ttft_time[-1]

    return {
        'wall_time': time.time() - s,
        'cuda_time': sum(insert_time) + sum(step_time),
        'ttft': total_ttft,
    }

def benchmark(*args, **kwargs):
    baseline_res = baseline(*args, **kwargs)
    print(baseline_res)
    cached_res = cached(*args, **kwargs, force_tokens=baseline_res['force_tokens'])
    return baseline_res, cached_res

problems = load_math_problems('/home/tbai4/llama3/data/MATH', split='val')[:500]

In [10]:
baseline_res, cached_res = benchmark(workflow, problems[0]['problem'])

{'time': 16.93652828979492, 'ttft': 0.38639542388916015, 'tokens': 1495, 'force_tokens': [217, 50, 137]}


In [11]:
print(baseline_res)
print(cached_res)

{'time': 16.93652828979492, 'ttft': 0.38639542388916015, 'tokens': 1495, 'force_tokens': []}
{'time': 16.640416851043703, 'ttft': 0.18356931304931642}


In [7]:
import time

s = time.time()
[final_tokens] = get('tokens')(workflow.step([
        {'header': ('assistant', None),
        'prefill': '',
        'parent_ids': [finish['id']]}
    ],
    max_gen_len=1024,
    temperature=temperature,
    top_p=top_p,
    time_buffer=step_time,
    ttft_buffer=ttft_time,
    force_tokens=512,
))
print(time.time() - s)
print(len(final_tokens))

s = time.time()
[final_tokens] = get('tokens')(workflow.step([
        {'header': ('assistant', None),
        'prefill': '',
        'parent_ids': [finish['id']]}
    ],
    max_gen_len=1024,
    temperature=temperature,
    top_p=top_p,
    time_buffer=step_time,
    ttft_buffer=ttft_time,
))
print(time.time() - s)
print(len(final_tokens))

19.990097045898438
120
3.6810128688812256
90


In [6]:
workflow.tokenizer.decode(final_tokens)

'ANS'

In [18]:
print('total time', sum(insert_time) + sum(step_time))
print('total ttft', total_ttft)
print('tokens generate', total_tokens)

total time 14.470890384674071
total ttft 0.37766575622558596
tokens generate 1445


In [13]:
import time
import torch
import statistics
from collections import Counter, defaultdict
from operator import itemgetter as get

from llama import Workflow
from llama.workflows.benchmark import measure_step
from llama.workflows.tot import load_math_problems
from llama.workflows.tot import (
    cot_prompt,
    finish_prompt,
    format_problem,
    format_vote_system_prompt,
    parse_choice,
)

def benchmark(
    workflow: Workflow,
    problem: str,
    branching_factor: int = 8,
    voters: int = 4,
    num_runs: int = 3,
):
    metrics = {
        "baseline": {"tps": [], "latency": [], "cuda_latency": [], "ttft": [], "steps": []},
        "cached": {"tps": [], "latency": [], "cuda_latency": [], "ttft": [], "steps": []}
    }

    for run in range(num_runs):
        print(f"Run {run+1}/{num_runs}")
        torch.cuda.synchronize()

        # ====== BASELINE ======
        workflow.reset()
        baseline_start = time.time()
        cuda_start_event = torch.cuda.Event(enable_timing=True)
        cuda_end_event = torch.cuda.Event(enable_timing=True)
        cuda_start_event.record(stream=None)

        baseline_step_metrics = []
        total_tokens = 0

        # Start timing complete TTFT for proposal step (including insert)
        proposal_ttft_start = time.time()
        
        [cot] = workflow.insert([
            {'messages': [
                {'role': 'system', 'content': cot_prompt},
                {'role': 'user', 'content': format_problem(problem)}
            ], 'parent_ids': []},
        ])

        proposal_step_args = {
            'tasks': [
                {'header': ('assistant', None),
                'prefill': '',
                'parent_ids': [cot['id']]}
                for i in range(branching_factor)
            ],
            'compact': False,
            'max_gen_len': 512,
        }

        outputs, step1_metrics = measure_step(workflow, proposal_step_args)
        (proposal_tokens, proposal_nodes) = get('tokens', 'nodes')(outputs)
        
        # Complete TTFT for proposal step
        proposal_complete_ttft = time.time() - proposal_ttft_start
        step1_metrics["complete_ttft"] = proposal_complete_ttft
        
        baseline_step_metrics.append(step1_metrics)
        total_tokens += step1_metrics["token_count"]

        # Start timing complete TTFT for vote step (including insert)
        vote_ttft_start = time.time()
        
        vote_user_prompt = f'{format_problem(problem)}\n\nHere are the proposals:'
        for i, prop in enumerate(proposal_tokens):
            vote_user_prompt += f'\n\nSolution #{i+1}:\n{workflow.tokenizer.decode(prop)}'

        [vote] = workflow.insert([
            {'messages': [
                {'role': 'system', 'content': format_vote_system_prompt(branching_factor)},
                {'role': 'user', 'content': vote_user_prompt}
            ], 'parent_ids': []}
        ])

        vote_step_args = {
            'tasks': [
                {'header': ('assistant', None),
                'prefill': 'BEST CHOICE: ',
                'parent_ids': [vote['id']]}
                for _ in range(voters)
            ],
            'compact': False,
            'max_gen_len': 256,
        }

        outputs, step2_metrics = measure_step(workflow, vote_step_args)
        vote_tokens = get('tokens')(outputs)
        
        # Complete TTFT for vote step
        vote_complete_ttft = time.time() - vote_ttft_start
        step2_metrics["complete_ttft"] = vote_complete_ttft
        
        baseline_step_metrics.append(step2_metrics)
        total_tokens += step2_metrics["token_count"]

        votes = [choice for v in vote_tokens if (choice := parse_choice(workflow.tokenizer.decode(v))) is not None]
        final_tokens = None

        if votes:
            # Start timing complete TTFT for final step (including insert)
            final_ttft_start = time.time()
            
            best = Counter(votes).most_common(1)[0][0]
            best_proposal = workflow.tokenizer.decode(proposal_tokens[best - 1])
            [finish] = workflow.insert([
                {'messages': [
                    {"role": "system", "content": finish_prompt},
                    {"role": "user", "content": f"{format_problem(problem)}\n\nHere is the proposed approach: {best_proposal}"}],
                'parent_ids': []
            }])

            final_step_args = {
                'tasks': [
                    {'header': ('assistant', None),
                    'prefill': '',
                    'parent_ids': [finish['id']]}
                ],
                'max_gen_len': 256,
            }

            outputs, step3_metrics = measure_step(workflow, final_step_args)
            final_tokens = get('tokens')(outputs)[0]
            
            # Complete TTFT for final step
            final_complete_ttft = time.time() - final_ttft_start
            step3_metrics["complete_ttft"] = final_complete_ttft
            
            baseline_step_metrics.append(step3_metrics)
            total_tokens += step3_metrics["token_count"]

        cuda_end_event.record(stream=None)
        torch.cuda.synchronize()
        baseline_cuda_time = cuda_start_event.elapsed_time(cuda_end_event) / 1000

        baseline_end = time.time()
        baseline_latency = baseline_end - baseline_start
        baseline_tps = total_tokens / baseline_latency
        baseline_ttft_avg = statistics.mean([m["complete_ttft"] for m in baseline_step_metrics])

        # Prepare teacher forcing tensors
        proposal_force = torch.full((branching_factor, 512), workflow.tokenizer.eot_id, device=workflow.device)
        for i, tokens in enumerate(proposal_tokens):
            proposal_force[i, :len(tokens)] = torch.tensor(tokens, device=workflow.device)

        voter_force = torch.full((voters, 256), workflow.tokenizer.eot_id, device=workflow.device)
        for i, tokens in enumerate(vote_tokens):
            voter_force[i, :len(tokens)] = torch.tensor(tokens, device=workflow.device)

        final_force = None
        if final_tokens is not None:
            final_force = torch.full((1, 256), workflow.tokenizer.eot_id, device=workflow.device)
            final_force[0, :len(final_tokens)] = torch.tensor(final_tokens, device=workflow.device)

        # ====== CACHED ======
        workflow.reset()
        cached_start = time.time()
        cuda_start_event = torch.cuda.Event(enable_timing=True)
        cuda_end_event = torch.cuda.Event(enable_timing=True)
        cuda_start_event.record(stream=None)

        cached_step_metrics = []

        # For cached implementation, insert all contexts upfront
        insert_start = time.time()
        cot, vote, finish = workflow.insert([
            {'messages': [
                {'role': 'system', 'content': cot_prompt},
                {'role': 'user', 'content': format_problem(problem)}
            ], 'parent_ids': []},
            {'messages': [
                {'role': 'system', 'content': format_vote_system_prompt(branching_factor)},
                {'role': 'user', 'content': format_problem(problem)}
            ], 'parent_ids': []},
            {'messages': [
                {'role': 'system', 'content': finish_prompt},
                {'role': 'user', 'content': format_problem(problem)}
            ], 'parent_ids': []},
        ])
        insert_time = time.time() - insert_start

        # Start timing complete TTFT for proposal step
        proposal_ttft_start = time.time()
        
        cached_proposal_step_args = {
            'tasks': [
                {'header': ('assistant', None),
                'prefill': f'Solution #{i+1}:\n\n',
                'parent_ids': [cot['id']]}
                for i in range(branching_factor)
            ],
            'teacher_force': proposal_force,
            'compact': False,
            'max_gen_len': 512,
        }

        outputs, cached_step1_metrics = measure_step(workflow, cached_proposal_step_args)
        cached_proposal_tokens, cached_proposal_nodes = get('tokens', 'nodes')(outputs)
        
        # Complete TTFT for proposal step (only need to add initial insert time to first step)
        cached_step1_metrics["complete_ttft"] = (time.time() - proposal_ttft_start) + insert_time
        
        cached_step_metrics.append(cached_step1_metrics)

        # Start timing complete TTFT for vote step
        vote_ttft_start = time.time()
        
        cached_vote_step_args = {
            'tasks': [
                {'header': ('assistant', None),
                'prefill': 'BEST CHOICE: ',
                'parent_ids': [vote['id']] + [p['id'] for p in cached_proposal_nodes]}
                for _ in range(voters)
            ],
            'teacher_force': voter_force,
            'stateless': False,
            'compact': False,
            'max_gen_len': 256,
        }

        outputs, cached_step2_metrics = measure_step(workflow, cached_vote_step_args)
        cached_vote_tokens, cached_vote_nodes = get('tokens', 'nodes')(outputs)
        
        # Complete TTFT for vote step
        cached_step2_metrics["complete_ttft"] = time.time() - vote_ttft_start
        
        cached_step_metrics.append(cached_step2_metrics)

        if final_force is not None and votes:
            # Start timing complete TTFT for final step
            final_ttft_start = time.time()
            
            best = Counter(votes).most_common(1)[0][0]
            cached_final_step_args = {
                'tasks': [
                    {'header': ('assistant', None),
                    'prefill': 'ANSWER: ',
                    'parent_ids': [finish['id']] + [cached_proposal_nodes[best-1]['id']]}
                ],
                'teacher_force': final_force,
                'stateless': False,
                'compact': False,
                'max_gen_len': 256,
            }

            outputs, cached_step3_metrics = measure_step(workflow, cached_final_step_args)
            cached_final_tokens = get('tokens')(outputs)
            
            # Complete TTFT for final step
            cached_step3_metrics["complete_ttft"] = time.time() - final_ttft_start
            
            cached_step_metrics.append(cached_step3_metrics)

        cuda_end_event.record(stream=None)
        torch.cuda.synchronize()
        cached_cuda_time = cuda_start_event.elapsed_time(cuda_end_event) / 1000

        cached_end = time.time()
        cached_latency = cached_end - cached_start
        cached_tps = total_tokens / cached_latency
        cached_ttft_avg = statistics.mean([m["complete_ttft"] for m in cached_step_metrics])

        metrics["baseline"]["tps"].append(baseline_tps)
        metrics["baseline"]["latency"].append(baseline_latency)
        metrics["baseline"]["cuda_latency"].append(baseline_cuda_time)
        metrics["baseline"]["ttft"].append(baseline_ttft_avg)
        metrics["baseline"]["steps"].append(baseline_step_metrics)

        metrics["cached"]["tps"].append(cached_tps)
        metrics["cached"]["latency"].append(cached_latency)
        metrics["cached"]["cuda_latency"].append(cached_cuda_time)
        metrics["cached"]["ttft"].append(cached_ttft_avg)
        metrics["cached"]["steps"].append(cached_step_metrics)

        print(f"  Tokens generated: {total_tokens}")
        print(f"  Baseline: TPS={baseline_tps:.1f}, Wall latency={baseline_latency:.3f}s, CUDA latency={baseline_cuda_time:.3f}s, Avg TTFT={baseline_ttft_avg*1000:.1f}ms")
        print(f"  Cached: TPS={cached_tps:.1f}, Wall latency={cached_latency:.3f}s, CUDA latency={cached_cuda_time:.3f}s, Avg TTFT={cached_ttft_avg*1000:.1f}ms")
        print(f"  Speedup: TPS={cached_tps/baseline_tps:.2f}x, Wall={baseline_latency/cached_latency:.2f}x, CUDA={baseline_cuda_time/cached_cuda_time:.2f}x, TTFT={baseline_ttft_avg/cached_ttft_avg:.2f}x")

        print("  Per-step complete TTFT improvement:")
        for i, (b_step, c_step) in enumerate(zip(baseline_step_metrics, cached_step_metrics)):
            step_name = ["proposals", "voting", "final"][i]
            ttft_improvement = b_step["complete_ttft"] / c_step["complete_ttft"] if c_step["complete_ttft"] > 0 else float('inf')
            print(f"    {step_name}: {b_step['complete_ttft']*1000:.1f}ms → {c_step['complete_ttft']*1000:.1f}ms ({ttft_improvement:.2f}x)")

    summary = {}
    for impl in ["baseline", "cached"]:
        summary[impl] = {}
        for metric in ["tps", "latency", "cuda_latency", "ttft"]:
            values = metrics[impl][metric]
            summary[impl][metric] = {
                "mean": statistics.mean(values),
                "stdev": statistics.stdev(values) if len(values) > 1 else 0
            }

    improvement = {
        "tps": summary["cached"]["tps"]["mean"] / summary["baseline"]["tps"]["mean"],
        "latency": summary["baseline"]["latency"]["mean"] / summary["cached"]["latency"]["mean"],
        "cuda_latency": summary["baseline"]["cuda_latency"]["mean"] / summary["cached"]["cuda_latency"]["mean"],
        "ttft": summary["baseline"]["ttft"]["mean"] / summary["cached"]["ttft"]["mean"]
    }

    print("\nFinal Results:")
    for impl in ["baseline", "cached"]:
        print(f"{impl.capitalize()}:")
        print(f"  TPS: {summary[impl]['tps']['mean']:.2f} ± {summary[impl]['tps']['stdev']:.2f}")
        print(f"  Wall Latency: {summary[impl]['latency']['mean']:.3f}s ± {summary[impl]['latency']['stdev']:.3f}s")
        print(f"  CUDA Latency: {summary[impl]['cuda_latency']['mean']:.3f}s ± {summary[impl]['cuda_latency']['stdev']:.3f}s")
        print(f"  Avg TTFT: {summary[impl]['ttft']['mean']*1000:.1f}ms ± {summary[impl]['ttft']['stdev']*1000:.1f}ms")

    print("\nImprovement Ratios:")
    print(f"  TPS: {improvement['tps']:.2f}x")
    print(f"  Wall Latency: {improvement['latency']:.2f}x")
    print(f"  CUDA Latency: {improvement['cuda_latency']:.2f}x")
    print(f"  Avg TTFT: {improvement['ttft']:.2f}x")

    step_improvements = defaultdict(list)
    for run in range(num_runs):
        for i, (step_name, b_step, c_step) in enumerate([
            (name, b, c)
            for name, b, c in zip(
                ["proposals", "voting", "final"],
                metrics["baseline"]["steps"][run],
                metrics["cached"]["steps"][run]
            )
            if len(metrics["baseline"]["steps"][run]) > i and len(metrics["cached"]["steps"][run]) > i
        ]):
            step_improvements[f"{step_name}_ttft"].append(b_step["complete_ttft"] / c_step["complete_ttft"])

    print("\nPer-step TTFT Improvements:")
    for step_name, values in step_improvements.items():
        mean = statistics.mean(values)
        stdev = statistics.stdev(values) if len(values) > 1 else 0
        print(f"  {step_name}: {mean:.2f}x ± {stdev:.2f}x")

    return {
        "summary": summary,
        "improvement": improvement,
        "raw": metrics,
        "step_improvements": {k: {
            "mean": statistics.mean(v),
            "stdev": statistics.stdev(v) if len(v) > 1 else 0
        } for k, v in step_improvements.items()}
    }

In [14]:
from llama.workflows.tot import load_math_problems
problems = load_math_problems('/home/tbai4/llama3/data/MATH', split='val')

benchmark(
    workflow,
    problems[0]['problem'],
)

Run 1/3
  Tokens generated: 1281
  Baseline: TPS=79.9, Wall latency=16.037s, CUDA latency=16.037s, Avg TTFT=5345.6ms
  Cached: TPS=80.4, Wall latency=15.931s, CUDA latency=15.930s, Avg TTFT=5310.1ms
  Speedup: TPS=1.01x, Wall=1.01x, CUDA=1.01x, TTFT=1.01x
  Per-step complete TTFT improvement:
    proposals: 6489.3ms → 6358.9ms (1.02x)
    voting: 1931.4ms → 2008.7ms (0.96x)
    final: 7616.1ms → 7562.8ms (1.01x)
Run 2/3
  Tokens generated: 1281
  Baseline: TPS=79.8, Wall latency=16.050s, CUDA latency=16.050s, Avg TTFT=5349.8ms
  Cached: TPS=80.4, Wall latency=15.935s, CUDA latency=15.935s, Avg TTFT=5311.8ms
  Speedup: TPS=1.01x, Wall=1.01x, CUDA=1.01x, TTFT=1.01x
  Per-step complete TTFT improvement:
    proposals: 6489.7ms → 6354.3ms (1.02x)
    voting: 1936.5ms → 2010.3ms (0.96x)
    final: 7623.3ms → 7570.7ms (1.01x)
Run 3/3
  Tokens generated: 1281
  Baseline: TPS=79.9, Wall latency=16.026s, CUDA latency=16.025s, Avg TTFT=5341.8ms
  Cached: TPS=80.3, Wall latency=15.943s, CUDA late

{'summary': {'baseline': {'tps': {'mean': 79.87575043165356,
    'stdev': 0.06013996787366561},
   'latency': {'mean': 16.037414073944092, 'stdev': 0.012075318023978323},
   'cuda_latency': {'mean': 16.037323567708334, 'stdev': 0.012077594121968668},
   'ttft': {'mean': 5.345736026763916, 'stdev': 0.004023550558760566}},
  'cached': {'tps': {'mean': 80.38205126479731, 'stdev': 0.03251895356200607},
   'latency': {'mean': 15.936395247777304, 'stdev': 0.006447758699455069},
   'cuda_latency': {'mean': 15.936302408854166, 'stdev': 0.006440445112084791},
   'ttft': {'mean': 5.312100887298584, 'stdev': 0.0021524800112217167}}},
 'improvement': {'tps': 1.006338605026027,
  'latency': 1.0063388755484637,
  'cuda_latency': 1.0063390588520735,
  'ttft': 1.0063317960593998},
 'raw': {'baseline': {'tps': [79.87769860795333,
    79.81466004627343,
    79.9348926407339],
   'latency': [16.03701686859131, 16.049683094024658, 16.02554225921631],
   'cuda_latency': [16.03694921875, 16.049583984375, 16

In [None]:
from collections import Counter

def create_tot_transcript(result, tokenizer, problem: str) -> str:
    transcript = []
    
    # Header
    transcript.append(f"# Tree of Thoughts Process for Problem:\n{problem}\n")
    
    # Stage 1: Strategies
    transcript.append("## Stage 1: Strategy Generation")
    for i, tokens in enumerate(result['strategy_tokens']):
        transcript.append(f"\n### Strategy #{i+1}:\n{tokenizer.decode(tokens)}")
    
    # Strategy Votes
    transcript.append("\n## Strategy Voting")
    for i, tokens in enumerate(result['strategy_vote_tokens']):
        transcript.append(f"\nVoter #{i+1}: {tokenizer.decode(tokens)}")
    
    # Best Strategy
    if result['strategy_votes']:
        vote_counts = Counter(result['strategy_votes'])
        best_strategy_idx = vote_counts.most_common(1)[0][0] - 1
        best_strategy = tokenizer.decode(result['strategy_tokens'][best_strategy_idx])
        
        transcript.append(f"\n## Best Strategy (#{best_strategy_idx+1}):\n{best_strategy}")
    else:
        transcript.append("\n## No valid strategy votes")
        return "\n".join(transcript)
    
    # Stage 2: Solutions
    transcript.append("\n## Stage 2: Solution Generation")
    for i, tokens in enumerate(result['solution_tokens']):
        transcript.append(f"\n### Solution #{i+1}:\n{tokenizer.decode(tokens)}")
    
    # Solution Votes
    transcript.append("\n## Solution Voting")
    for i, tokens in enumerate(result['solution_vote_tokens']):
        transcript.append(f"\nVoter #{i+1}: {tokenizer.decode(tokens)}")
    
    # Final Solution
    if result['solution_votes']:
        vote_counts = Counter(result['solution_votes'])
        best_solution_idx = vote_counts.most_common(1)[0][0] - 1
        transcript.append(f"\n## Final Selected Solution (#{best_solution_idx+1}):")
        if result['final_tokens'] is not None:
            transcript.append(f"\n{tokenizer.decode(result['final_tokens'])}")
    else:
        transcript.append("\n## No valid solution votes")
    
    return "\n".join(transcript)

print(create_tot_transcript(outputs, workflow.tokenizer, problems[0]))

In [31]:
import json
from llama.workflows.tot import load_math_problems, eval_solutions

problems = load_math_problems('/home/tbai4/llama3/data/MATH', split='val')

with open('/home/tbai4/llama3/dumps/mad/math_cached_e2e.json') as f:
    data = json.load(f)

llama.model.reshape_cache(4)
results = eval_solutions(
    llama=llama,
    solutions=data,
    problems=problems
)

print(sum(results))

100%|██████████| 280/280 [01:01<00:00,  4.53it/s]


In [None]:
from llama.workflows.tot import tot_baseline, tot_baseline_shuffled, load_math_problems

problems = load_math_problems('/home/tbai4/llama3/data/MATH', split='val')

workflow.reset()
outputs = tot_baseline_shuffled(
    workflow,
    problems[0]['problem'],
    8, 4,
    debug=True,
)

## generate fine-tuning examples

In [None]:
from llama.workflows.tot import collect_samples

samples = collect_samples(
    llama=llama,
    save_dir='/home/tbai4/llama3/dumps',
    n_problems=1000,
    branching_factor=8,
    voters=4,
    temperature=1.0,
    top_p=1.0,
    seed=42,
    math_path='/home/tbai4/llama3/data/MATH',
    split='train',
)