# Small models from July 2024

## 1. Install dependencies

In [None]:
pip install vllm

In [2]:
from importlib.metadata import version

In [4]:
# Installed version of the `torch` distribution.
version("torch")

'2.3.1'

In [5]:
# Installed version of the `transformers` distribution.
version("transformers")

'4.43.3'

In [6]:
# Installed version of the `vllm` distribution.
version("vllm")

'0.5.3.post1'

In [8]:
# Installed version of the `vllm-flash-attn` distribution.
version("vllm-flash-attn")

'2.5.9.post1'

## Mistral Nemo 12b

https://mistral.ai/fr/news/mistral-nemo/

https://huggingface.co/mistralai/Mistral-Nemo-Instruct-2407

https://huggingface.co/neuralmagic/Mistral-Nemo-Instruct-2407-FP8

**LICENSE :** Apache 2.0

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/Mistral-Nemo-Instruct-2407-FP8"

# Conversation to send: a system persona followed by a single user turn.
messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {
        "role": "user",
        "content": "Who are you?",
    },
]

# Render the conversation with the model's own chat template; tokenize=False
# keeps the result as text so it can be handed to vLLM as a prompt.
# NOTE(review): unlike the Gemma / Phi-3 / DeepSeek cells, add_generation_prompt
# is not passed here -- confirm this model's template does not need it.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompts = tokenizer.apply_chat_template(messages, tokenize=False)

# Sampling settings: temperature 0.3, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(
    temperature=0.3,
    top_p=0.9,
    max_tokens=256,
)

# Load the model into vLLM with the context length limited to 4096 tokens.
llm = LLM(
    model=model_id,
    max_model_len=4096,
)
outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
first_completion = outputs[0].outputs[0]
generated_text = first_completion.text
print(generated_text)

## Llama 3.1 8b

https://llama.meta.com/

https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct

https://huggingface.co/neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic

**LICENSE :** https://llama.meta.com/llama3_1/license/

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic"

# Sampling settings: temperature 0.6, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(temperature=0.6, top_p=0.9, max_tokens=256)

tokenizer = AutoTokenizer.from_pretrained(model_id)

# Conversation to send: a system persona followed by a single user turn.
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]

# Render the conversation to a text prompt with the model's chat template.
# add_generation_prompt=True appends the tokens that open the assistant turn,
# so the model answers the user instead of continuing from the end of the
# user message. The Gemma, Phi-3 and DeepSeek cells pass it as well.
prompts = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Load the model into vLLM and generate a completion for the prompt.
llm = LLM(model=model_id)

outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
generated_text = outputs[0].outputs[0].text
print(generated_text)

## Gemma 2 9b

https://ai.google.dev/gemma/docs/model_card_2

https://huggingface.co/google/gemma-2-9b-it

https://huggingface.co/neuralmagic/gemma-2-9b-it-FP8

**LICENSE :** https://ai.google.dev/gemma/terms

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/gemma-2-9b-it-FP8"

# Single user turn; the pirate instruction is part of the user message itself.
messages = [
    {
        "role": "user",
        "content": "Who are you? Please respond in pirate speak!",
    },
]

# Render the conversation as text (tokenize=False) and request the generation
# prompt so the rendered string is ready for the model's reply.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompts = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling settings: temperature 0.6, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.9,
    max_tokens=256,
)

# Load the model into vLLM and generate a completion for the prompt.
llm = LLM(model=model_id)
outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
first_completion = outputs[0].outputs[0]
generated_text = first_completion.text
print(generated_text)

## Phi-3 mini 128k 3.8b

https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/

https://huggingface.co/microsoft/Phi-3-mini-128k-instruct

https://huggingface.co/neuralmagic/Phi-3-mini-128k-instruct-FP8

**LICENSE :** MIT

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/Phi-3-mini-128k-instruct-FP8"

# Conversation to send: a system persona plus a user turn that repeats the
# pirate-speak instruction.
messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {
        "role": "user",
        "content": "Who are you? Remember to respond in pirate speak!",
    },
]

# Render the conversation as text (tokenize=False) and request the generation
# prompt so the rendered string is ready for the model's reply.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompts = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling settings: temperature 0.6, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.9,
    max_tokens=256,
)

# Load the model into vLLM and generate a completion for the prompt.
llm = LLM(model=model_id)
outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
first_completion = outputs[0].outputs[0]
generated_text = first_completion.text
print(generated_text)

## Phi-3 medium 128k 14b

https://azure.microsoft.com/en-us/blog/new-models-added-to-the-phi-3-family-available-on-microsoft-azure/

https://huggingface.co/microsoft/Phi-3-medium-128k-instruct

https://huggingface.co/neuralmagic/Phi-3-medium-128k-instruct-FP8

**LICENSE :** MIT

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/Phi-3-medium-128k-instruct-FP8"

# Conversation to send: a system persona plus a user turn that repeats the
# pirate-speak instruction.
messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {
        "role": "user",
        "content": "Who are you? Remember to respond in pirate speak!",
    },
]

# Render the conversation as text (tokenize=False) and request the generation
# prompt so the rendered string is ready for the model's reply.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompts = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling settings: temperature 0.6, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.9,
    max_tokens=256,
)

# Load the model into vLLM and generate a completion for the prompt.
llm = LLM(model=model_id)
outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
first_completion = outputs[0].outputs[0]
generated_text = first_completion.text
print(generated_text)

## DeepSeek Coder V2 Lite 16b

https://arxiv.org/abs/2406.11931

https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct

https://huggingface.co/neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8

**LICENSE :** https://github.com/deepseek-ai/DeepSeek-Coder-V2/blob/main/LICENSE-MODEL

In [None]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# Hugging Face repository id of the checkpoint used in this section.
model_id = "neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8"

# Conversation to send: a system persona followed by a single user turn.
messages = [
    {
        "role": "system",
        "content": "You are a pirate chatbot who always responds in pirate speak!",
    },
    {
        "role": "user",
        "content": "Who are you?",
    },
]

# Render the conversation as text (tokenize=False) and request the generation
# prompt so the rendered string is ready for the model's reply.
tokenizer = AutoTokenizer.from_pretrained(model_id)
prompts = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)

# Sampling settings: temperature 0.6, top-p 0.9, at most 256 new tokens.
sampling_params = SamplingParams(
    temperature=0.6,
    top_p=0.9,
    max_tokens=256,
)

# Load the model into vLLM with the context length limited to 4096 tokens.
# NOTE(review): trust_remote_code=True permits running code shipped with the
# model repository -- keep it only for repositories you trust.
llm = LLM(
    model=model_id,
    trust_remote_code=True,
    max_model_len=4096,
)
outputs = llm.generate(prompts, sampling_params)

# Show the text of the first completion of the first result.
first_completion = outputs[0].outputs[0]
generated_text = first_completion.text
print(generated_text)