# Prediction generation on M0 .. M5 models using nnsight
- Developed on Google Colab using an A100 with 40GB GPU and 80GB system RAM.
- Runs with GPT2/TinyStories/Qwen/Llama/Granite/SmolLM with base/CS1/CS2/CS3.
- Requires a GITHUB_TOKEN secret to access the Martian quanta_text_to_sql code repository.
- Requires a HF_TOKEN secret to access the Martian HuggingFace repository.


# Import libraries
Imports standard libraries. This section can be skipped when reading.

In [None]:
# https://nnsight.net/
!pip install -U nnsight

In [None]:
from IPython.display import clear_output
import einops
import torch
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "colab"

import nnsight
from nnsight import LanguageModel, util
from nnsight.tracing.Proxy import Proxy

In [None]:
from getpass import getpass
from google.colab import userdata
import gc
import weakref

In [None]:
# Fetch the GitHub access token stored under the Colab secret name "GITHUB_TOKEN".
github_token = userdata.get("GITHUB_TOKEN")

# Install the private repository using the token
# ({github_token} is substituted into the shell command from the Python variable above).
!pip install --upgrade git+https://{github_token}@github.com/withmartian/quanta_text_to_sql.git

# Must come after the install above; `qts` is the alias used by the later cells.
import QuantaTextToSql as qts

# Select model, command set and generation length


In [None]:
# Which model to run: 0=GPT2, 1=TinyStories, 2=Qwen, 3=Llama, 4=Granite, 5=SmolLM
model_num = 3

# Which command set to use: 0=BaseModel, 1=CS1, 2=CS2 or 3=CS3
cs_num = 1

# Max number of tokens to generate
max_new_tokens = 100

# Run m0 : nnsight tutorial using GPT2
Based on https://nnsight.net/notebooks/tutorials/activation_patching/



In [None]:
# Only the GPT2 option (model_num == 0) uses this cell; it wraps the
# public GPT2 checkpoint in an nnsight LanguageModel named `model0`.
if model_num == 0:
    model0 = LanguageModel(
        "openai-community/gpt2",
        device_map="auto",
    )
    # Wipe whatever the load step wrote to the cell output, then show the module tree.
    clear_output()
    print(model0)

In [None]:
if model_num == 0: # GPT2
    # Single demo prompt: generate a short continuation with GPT2 and print it.
    the_prompt = "After John and Mary went to the store, Mary gave a bottle of milk to"
    print("Model input: ", the_prompt)

    gpt2_tokenizer = model0.tokenizer
    inputs = gpt2_tokenizer(the_prompt, return_tensors="pt", padding=True)

    # The EOS token id is supplied as the pad token id for generation.
    generation_args = dict(
        max_new_tokens=25,
        pad_token_id=gpt2_tokenizer.eos_token_id,
    )
    with model0.generate(inputs['input_ids'], **generation_args) as tracer:
        # Save the generator output so it is still available after the context exits.
        final_output = model0.generator.output.save()

    # Move the generated token ids to the CPU as a numpy array before decoding.
    final_output = final_output.detach().cpu().numpy()

    # Row 0 is the only sequence in the batch.
    decoded_output = gpt2_tokenizer.decode(final_output[0], skip_special_tokens=True)
    print("Model output:", decoded_output)

# Run m1 .. m5 models

In [None]:
# Load the selected non-GPT2 model and wrap it in an nnsight LanguageModel named `model`.
if model_num > 0:

    if model_num == 1:
        # Model 1 is loaded in-process by the project helper (authenticated with the
        # HF_TOKEN Colab secret), which returns the tokenizer and the model object.
        the_tokenizer, the_model = qts.load_sql_interp_model(
            model_num,
            cs_num,
            auth_token=userdata.get("HF_TOKEN"),
            use_flash_attention=False,
        )
        # NOTE(review): for a pre-loaded model, nnsight expects the tokenizer via the
        # `tokenizer=` keyword; a positional second argument is not treated as the
        # tokenizer. Confirm against the installed nnsight version.
        model = LanguageModel(the_model, tokenizer=the_tokenizer)
        # Kept so the attribute is set no matter how the wrapper handled the argument.
        model.tokenizer = the_tokenizer
    else:
        # All other models are loaded by nnsight from the location the project helper returns.
        model = LanguageModel(qts.sql_interp_model_location(model_num, cs_num), device_map="auto")

    # Wipe whatever the load step wrote to the cell output, then show the module tree.
    clear_output()
    print(model)

In [None]:
if model_num > 0:
    # Generate a batch of prompts
    batch_size = 50

    # Base model (0) and CS1 (1) share the CS1 generator.
    example_generators = {
        0: qts.generate_cs1,
        1: qts.generate_cs1,
        2: qts.generate_cs2,
        3: qts.generate_cs3,
    }
    if cs_num not in example_generators:
        # Fail here with a clear message instead of leaving `examples` undefined
        # and hitting a NameError in a later cell.
        raise ValueError(f"Unsupported cs_num {cs_num!r}; expected 0, 1, 2 or 3")

    examples = example_generators[cs_num](batch_size)

In [None]:
# Run every generated example through the model, score the completion with the
# evaluator matching the command set, and report the average score.
if model_num > 0:

    # The evaluator depends only on cs_num, so pick it once outside the loop.
    # Base model (0) and CS1 (1) share the CS1 evaluator.
    prediction_evaluators = {
        0: qts.evaluate_cs1_prediction,
        1: qts.evaluate_cs1_prediction,
        2: qts.evaluate_cs2_prediction,
        3: qts.evaluate_cs3_prediction,
    }
    if cs_num not in prediction_evaluators:
        # Previously an unknown command set silently scored every example as 0.
        raise ValueError(f"Unsupported cs_num {cs_num!r}; expected 0, 1, 2 or 3")
    evaluate_prediction = prediction_evaluators[cs_num]

    score_sum = 0
    # Iterate the examples themselves so a stale batch_size cannot index out of range.
    for idx, example in enumerate(examples):
        the_prompt = example.get_alpaca_prompt()
        #print("Run:", idx, "Model input:", the_prompt)

        inputs = model.tokenizer(the_prompt, return_tensors="pt", padding=True)
        with model.generate(inputs['input_ids'], max_new_tokens=max_new_tokens) as tracer:
            # Save the generator output so it is still available after the context exits.
            final_output = model.generator.output.save()

        final_output = final_output.detach().cpu().numpy()
        decoded_output = model.tokenizer.decode(final_output[0], skip_special_tokens=True)

        if decoded_output.startswith(the_prompt):
            # Usual case: the decoded text echoes the prompt, so strip it as a string prefix.
            model_added = decoded_output[len(the_prompt):]
        else:
            # The tokenize/decode round trip did not reproduce the prompt exactly.
            # Rather than aborting the whole batch, decode only the token ids that
            # follow the prompt tokens. Assumes the generated sequence starts with
            # the prompt ids, which the prefix check above also relies on.
            prompt_token_count = inputs['input_ids'].shape[1]
            model_added = model.tokenizer.decode(
                final_output[0][prompt_token_count:], skip_special_tokens=True
            )
            print("Run:", idx, "Warning: decoded output does not start with the prompt; using generated tokens only")
        #print("Run:", idx, "Model added:", model_added)

        score = evaluate_prediction(example, model_added)
        if score < 1:
            # Only imperfect runs are reported individually.
            print("Run:", idx, "Score:", score)
        score_sum += score

    print()
    num_scored = len(examples)
    if num_scored:
        print("Average score:", score_sum/num_scored)
    else:
        # Avoid a ZeroDivisionError when there is nothing to score.
        print("Average score: n/a (no examples)")

In [None]:
# `model` is only created when model_num > 0; the GPT2 path (model_num == 0)
# keeps its wrapper in `model0`, so pick whichever one exists to avoid a NameError.
active_model = model0 if model_num == 0 else model

# Show the tokenizer's padding and end-of-sequence token ids for inspection.
print("Pad token ID:", active_model.tokenizer.pad_token_id)
print("EOS token ID:", active_model.tokenizer.eos_token_id)
# Left disabled: uncomment to reuse the EOS token id as the pad token id.
#model.tokenizer.pad_token_id = model.tokenizer.eos_token_id