# Model Comparison

In [None]:
# Make results folder

! mkdir model_results/

In [55]:
# Imports

import torch
import random
from transformers import AutoTokenizer, AutoModelForCausalLM
from openai import OpenAI
import os
import csv
from icalfa import submit_command
from datasets import load_dataset
from tqdm import tqdm
import re
from tabulate import tabulate

In [56]:
# For reproducibility
# The same fixed seed is applied to PyTorch, Python's `random` module, and all CUDA devices.

seed = 123
torch.manual_seed(seed)
random.seed(seed)
torch.cuda.manual_seed_all(seed)

In [57]:
# Model names
# benchmark() sends names starting with "gpt" to the OpenAI API; every other name is
# loaded locally via AutoTokenizer / AutoModelForCausalLM.from_pretrained.

models = [
    "meta-llama/Llama-3.2-1B-Instruct",
    "meta-llama/Llama-3.2-3B-Instruct",
    "meta-llama/Llama-3.1-8B-Instruct",
    "Qwen/Qwen2.5-Coder-0.5B-Instruct",
    "Qwen/Qwen2.5-Coder-1.5B-Instruct",
    "Qwen/Qwen2.5-Coder-3B-Instruct",
    "Qwen/Qwen2.5-Coder-7B-Instruct",
    "gpt-3.5-turbo-0125",
    "gpt-4o-mini-2024-07-18",
    "gpt-4o-2024-08-06",
    "gpt-4-0613",
]

In [58]:
# OpenAI function

# Read the API key from the OPENAI_API_KEY environment variable instead of
# hard-coding it in the notebook. The "..." placeholder is kept as a fallback so
# this cell still runs when the variable is unset (e.g. local-model-only runs).
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", "..."))

def openai_completion(prompt, system_prompt, model, tokenizer):
    """Request a chat completion from the OpenAI API with temperature 0 and a fixed seed.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.
        model: OpenAI model name passed to the API.
        tokenizer: Not used here; accepted so the signature matches
            transformers_completion.

    Returns:
        The message content of the first choice in the API response.
    """
    chat_messages = [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': prompt},
    ]
    response = client.chat.completions.create(
        model=model,
        messages=chat_messages,
        temperature=0,
        seed=123,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

In [59]:
# Transformers function

def transformers_completion(prompt, system_prompt, model, tokenizer):
    """Generate a reply from a local chat model without sampling.

    Args:
        prompt: Text placed in the user message (formatted into a string).
        system_prompt: Text placed in the system message.
        model: Object exposing `device` and `generate(...)`.
        tokenizer: Object exposing `apply_chat_template`, `convert_tokens_to_ids`,
            `decode` and the EOS token attributes. Its `pad_token` is overwritten
            with its `eos_token` as a side effect.

    Returns:
        The decoded text of the newly generated tokens only (the prompt tokens are
        sliced off), with special tokens skipped. At most 100 new tokens are generated.
    """
    tokenizer.pad_token = tokenizer.eos_token

    chat = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"{prompt}"},
    ]
    input_ids = tokenizer.apply_chat_template(
        chat,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
    )
    input_ids = input_ids.to(model.device)

    # Settings shared by every model: attend to all prompt tokens, no sampling.
    generate_kwargs = {
        "attention_mask": torch.ones_like(input_ids),
        "max_new_tokens": 100,
        "do_sample": False,
        "temperature": None,
        "top_p": None,
        "top_k": None,
    }

    # Looked up for every model, but only handed to generate() for non-Qwen models.
    stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]

    is_qwen = model.__class__.__name__.startswith("Qwen")
    if not is_qwen:
        generate_kwargs["eos_token_id"] = stop_ids
        generate_kwargs["pad_token_id"] = tokenizer.eos_token_id

    generated = model.generate(input_ids, **generate_kwargs)

    # Keep only what was generated after the prompt.
    prompt_length = input_ids.shape[-1]
    return tokenizer.decode(generated[0][prompt_length:], skip_special_tokens=True)

In [60]:
# Strip markdown formatting

def parse_bash(text):
    """Extract a shell command from a model response by stripping markdown code formatting.

    Patterns are tried in order and the first one that matches anywhere in the
    text wins:
      1. a fenced block tagged ``bash``
      2. a fenced block tagged ``sh``, ``shell`` or ``zsh`` (tag alone on the fence line)
      3. any other fenced block
      4. an inline single-backtick span

    Args:
        text: Raw model response. May be empty or None.

    Returns:
        The captured command with surrounding whitespace removed. If no pattern
        matches, the whole response with surrounding whitespace removed. An empty
        string for empty or None input.

    Note:
        Because of pattern 4, an unfenced response that itself contains backticks
        (e.g. backtick command substitution) is reduced to the first backticked part.
    """
    if not text:
        return ""

    patterns = [
        r"```bash\s*(.*?)\s*```",
        # Other common shell tags; the tag must sit alone on the fence line so that a
        # one-line fence such as ```sh -c 'ls'``` is still treated as a full command.
        r"```(?:sh|shell|zsh)[ \t]*\r?\n(.*?)```",
        r"```(.*?)```",
        r"`(.*?)`",
    ]

    for pattern in patterns:
        match = re.search(pattern, text, re.DOTALL)
        if match:
            return match.group(1).strip()

    # No markdown found: strip here too, consistent with the matched branches.
    return text.strip()

In [61]:
# Benchmark function

def benchmark(model_name):
    """Run one model over the benchmark dataset and save per-example results to CSV.

    Results go to ``model_results/<last path component of model_name>.csv`` with the
    columns ``prompt, ground_truth_command, model_command, correct``. If that file
    already exists the model is skipped.

    Rows are first written to ``<results_file>.tmp`` and the file is renamed to its
    final name only after every example has been processed. An interrupted run
    therefore never leaves a partial CSV that a later run would mistake for a
    finished one; the stale ``.tmp`` file is simply overwritten on the next attempt.

    Args:
        model_name: Either an OpenAI model name (anything starting with "gpt") or a
            model identifier loadable with ``from_pretrained``.

    Returns:
        None.
    """
    filename = model_name.split("/")[-1]
    results_file = f"model_results/{filename}.csv"
    if os.path.exists(results_file):
        print(f"{results_file} already exists, skipping")
        return

    system_prompt = "Your task is to translate a natural language instruction to a Bash command. You will receive an instruction in English and output a Bash command that can be run in a Linux terminal."

    if model_name.startswith("gpt"):
        # OpenAI models are addressed by name; no local weights or tokenizer needed.
        get_completion = openai_completion
        model = model_name
        tokenizer = None
    else:
        get_completion = transformers_completion
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16).to("cuda")

    dataset = load_dataset("westenfelder/NL2SH-ALFA", "test", split="train")

    # Make sure the output folder exists even if the mkdir cell was not run.
    os.makedirs(os.path.dirname(results_file), exist_ok=True)

    partial_file = f"{results_file}.tmp"
    with open(partial_file, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['prompt', 'ground_truth_command', 'model_command', 'correct'])

        for index, row in tqdm(enumerate(dataset), total=len(dataset)):
            prompt = row['nl']
            ground_truth_command = row['bash']
            response = get_completion(prompt=prompt, system_prompt=system_prompt, model=model, tokenizer=tokenizer)
            model_command = parse_bash(response)
            # `correct` is whatever icalfa's submit_command returns for this example.
            correct = submit_command(index=index, command=model_command, eval_mode="embed", eval_param=0.75)
            writer.writerow([prompt, ground_truth_command, model_command, correct])

    # Publish the results only once the run has completed.
    os.replace(partial_file, results_file)

In [None]:
# Benchmark models
# benchmark() skips any model whose results CSV already exists, so this cell can be re-run.

for model_name in models:
    benchmark(model_name)
    # Release cached GPU memory before the next model is loaded.
    torch.cuda.empty_cache()

In [None]:
# Print results
# Accuracy per model = rows whose `correct` column is '1' / number of result rows in
# that model's CSV. Models without a results file are left out of the table.

results = [["Model", "Accuracy"]]
for model_name in models:
    filename = model_name.split("/")[-1]
    results_file = f"model_results/{filename}.csv"
    if os.path.exists(results_file):
        # newline='' lets the csv module handle line endings itself, which is needed
        # for quoted fields that contain newlines (e.g. multi-line commands).
        with open(results_file, mode='r', newline='') as file:
            reader = csv.reader(file)
            next(reader, None)  # skip the header row
            correct = 0
            total = 0
            for row in reader:
                if not row:
                    continue  # ignore blank lines
                total += 1
                if len(row) > 3 and row[3] == '1':
                    correct += 1
        # Divide by the rows actually present rather than a hard-coded dataset size;
        # a file with no result rows is reported as 0.00.
        accuracy = correct / total if total else 0.0
        results.append([model_name, f"{accuracy:0.2f}"])

latex_table = tabulate(results, headers="firstrow", tablefmt="latex")
print(latex_table)