In [None]:
%%capture
!pip install 'transformer_engine[pytorch]' datasets einops accelerate transformers ipywidgets

In [4]:
from datasets import load_dataset
import transformer_engine.pytorch as te
from transformer_engine.common import recipe
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [45]:
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
math_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-Math-1.5B-Instruct")
model.to("cuda")

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 1536)
    (layers): ModuleList(
      (0-27): 28 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear(in_features=1536, out_features=1536, bias=True)
          (k_proj): Linear(in_features=1536, out_features=256, bias=True)
          (v_proj): Linear(in_features=1536, out_features=256, bias=True)
          (o_proj): Linear(in_features=1536, out_features=1536, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (up_proj): Linear(in_features=1536, out_features=8960, bias=False)
          (down_proj): Linear(in_features=8960, out_features=1536, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((1536,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((1536,), eps=1e-06)
    (rotary_emb): Qw

In [15]:
dataset = load_dataset("nvidia/OpenMathInstruct-2", split="train")

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/32 [00:00<?, ?it/s]

In [7]:
fp8_recipe = recipe.Float8BlockScaling(fp8_format=recipe.Format.HYBRID)

In [8]:
dataset

Dataset({
    features: ['problem', 'generated_solution', 'expected_answer', 'problem_source'],
    num_rows: 13972791
})

In [11]:
!nproc

32


In [None]:
def apply_chat_template(sample_batch):
    return { "text": [
        math_tokenizer.apply_chat_template(
            [
                {"role": "user", "content": s[0]},
                {"role": "assistant", "content": s[1]}
            ],
            add_generation_prompt=False,
            tokenize=False
        )
         for s in zip(sample_batch["problem"], sample_batch["generated_solution"])]
    }
text_dataset = dataset.map(apply_chat_template, batched=True, remove_columns=dataset.column_names, num_proc=32)

Map (num_proc=32):   0%|          | 0/13972791 [00:00<?, ? examples/s]

In [None]:
text_dataset[0]

{'text': '<|im_start|>system\nPlease reason step by step, and put your final answer within \\boxed{}.<|im_end|>\n<|im_start|>user\nAva is planning a camping trip with her friends. She wants to make sure they have enough granola bars for snacks. There will be five people total: Ava, her two friends, and her parents. They will spend 3 days and 2 nights at the campsite, and they plan to have 2 granola bars per person for breakfast and 1 granola bar per person for an afternoon snack each day. How many granola bars will Ava need to pack in total for the entire trip?<|im_end|>\n<|im_start|>assistant\nThere will be a total of 5 people.\nEach person needs 2 granola bars for breakfast and 1 granola bar for snack. This amounts to a total of 3 granola bars per person per day.\nSince the trip is 3 days long, each person will need 3 granola bars/day * 3 days = 9 granola bars.\nSo for 5 people, Ava will need 5 * 9 = 45 granola bars.\nThus, Ava will need to pack \\boxed{45} granola bars in total for 

In [16]:
samples = dataset.shuffle(seed=42).select(range(12))


In [23]:
samples['problem']

Column(['Find the volume of a sphere with a diameter of 18 meters. Express your answer in terms of $\\pi$.', 'The following line is parameterized, so that its direction vector is of the form $\\begin{pmatrix} a \\\\ -1 \\end{pmatrix}.$  Find $a.$\n\n[asy]\nunitsize(0.4 cm);\n\npair A, B, L, R;\nint i, n;\n\nfor (i = -8; i <= 8; ++i) {\n  draw((i,-8)--(i,8),gray(0.7));\n  draw((-8,i)--(8,i),gray(0.7));\n}\n\ndraw((-8,0)--(8,0),Arrows(6));\ndraw((0,-8)--(0,8),Arrows(6));\n\nA = (-2,5);\nB = (1,0);\nL = extension(A, B, (0,8), (1,8));\nR = extension(A, B, (0,-8), (1,-8));\n\ndraw(L--R, red);\n\nlabel("$x$", (8,0), E);\nlabel("$y$", (0,8), N);\n[/asy]', 'At a school party, there are 15 students who like math, 20 students who like history, and 7 students who like both subjects. If there are 42 students in total, how many students like neither math nor history?', 'A water tank can be filled by two pipes, A and B. Pipe A fills the tank at a rate of 2 cubic meters per hour, while pipe B fills i

In [47]:
import time
inputs = math_tokenizer.batch_encode_plus(list(samples['problem']), return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs.to("cuda")
t0 = time.time()
outputs = model.generate(
    **inputs,
    max_new_tokens=256,
    # do_sample=False,
    # use_cache=True,
    # return_dict_in_generate=True
)
t1 = time.time()
print(f"Time taken: {t1 - t0:.2f} seconds")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Time taken: 3.14 seconds


In [50]:
import time
inputs = math_tokenizer.batch_encode_plus(list(samples['problem']), return_tensors='pt', padding=True, truncation=True, max_length=512)
inputs.to("cuda")
t0 = time.time()
with te.fp8_autocast(fp8_recipe):
    outputs = model.generate(
        **inputs,
        max_new_tokens=256,
        # do_sample=False,
        # use_cache=True,
        # return_dict_in_generate=True
    )
t1 = time.time()
print(f"Time taken: {t1 - t0:.2f} seconds")

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Time taken: 2.79 seconds


In [48]:
decoded_outputs = math_tokenizer.batch_decode(outputs, skip_special_tokens=True)

In [49]:
decoded_outputs

['Find the volume of a sphere with a diameter of 18 meters. Express your answer in terms of $\\pi$. of a sphere with a diameter of 18 meters.\nTo find the volume of a sphere, we use the formula for the volume \\( V \\) of a sphere:\n\n\\[\nV = \\frac{4}{3} \\pi r^3\n\\]\n\nwhere \\( r \\) is the radius of the sphere.\n\nFirst, we need to determine the radius from the given diameter. The diameter \\( d \\) is twice the radius \\( r \\):\n\n\\[\nd = 2r\n\\]\n\nGiven that the diameter \\( d \\) is 18 meters, we can solve for the radius \\( r \\):\n\n\\[\nr = \\frac{d}{2} = \\frac{18}{2} = 9 \\text{ meters}\n\\]\n\nNow, substitute the radius \\( r = 9 \\) meters into the volume formula:\n\n\\[\nV = \\frac{4}{3} \\pi (9)^3\n\\]\n\nNext, calculate \\( 9^3 \\):\n\n\\[\n9^3 = 9 \\times 9 \\times 9 = 729\n\\]\n\nSo the volume becomes:\n\n\\[\nV = \\frac{4}{3} \\pi \\times 729\n\\]\n\n',
 'The following line is parameterized, so that its direction vector is of the form $\\begin{pmatrix} a \\\\ -