IMPORTANT: This notebook is meant to be run on Google Colab. It will not work locally unless you have a powerful NVIDIA GPU.

https://colab.research.google.com/drive/185RaaIFspi9x2EYM42CUbrWjAYha1ynZ?usp=sharing

In [None]:
!pip install -U transformers pandas

In [None]:
# Authenticate with the Hugging Face Hub so that later cells can download the model and
# call the inference API.
from huggingface_hub import login
# new_session=False: per the huggingface_hub docs, do not prompt for a token again when one
# is already saved on this machine.
login(new_session=False)

In [None]:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build a text-generation pipeline around the Mistral 7B Instruct checkpoint.
pipe = pipeline(
    task="text-generation",
    model="mistralai/Mistral-7B-Instruct-v0.3",
)

# Single-turn conversation in chat format: a list of {"role", "content"} messages.
messages = [dict(role="user", content="Who are you?")]

# Last expression of the cell, so the generation result is shown as the cell output.
pipe(messages)

In [None]:
# Load model directly
from transformers import AutoModelForCausalLM, AutoTokenizer

# Both objects are loaded from the same Hub checkpoint.
# NOTE(review): the pipeline cell above loads this same checkpoint; keeping both alive
# presumably holds two copies of the weights in memory — confirm this is intended.
checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

In [None]:
import os
from huggingface_hub import InferenceClient

# Client for hosted inference; provider="auto" leaves the choice of provider to the library.
# HF_TOKEN is read with .get() instead of [...] so that a missing environment variable does
# not raise KeyError: when api_key is None the client falls back to the locally saved token
# (for example the one stored by `login()` earlier in this notebook).
client = InferenceClient(
    provider="auto",
    api_key=os.environ.get("HF_TOKEN"),
)

# Send a single-turn chat request to the hosted model.
completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    messages=[
        {"role": "user", "content": "What is the capital of France?"},
    ],
)

# Print the assistant message object of the first (only) choice.
print(completion.choices[0].message)

In [None]:
# prompt: Can you do another chat completions call but this time — show log probabilities of top 10 tokens for each token inferred. And then the final message

import math

import pandas as pd

# Number of alternative tokens requested and displayed per generated token.
# The prompt above asks for 10, but the Hugging Face chat-completion API documents
# `top_logprobs` as an integer between 0 and 5, so 5 is used. This single constant drives
# the request, the slicing, the padding and the column headers.
TOP_K = 5


def _iter_token_logprobs(logprobs):
    """Yield ``(token, logprob, alternatives)`` for every generated token.

    ``alternatives`` is a list of ``(token, logprob)`` pairs. Two payload shapes are
    accepted, because the shape depends on which provider served the request:

    * ``logprobs.content``: a list of entries exposing ``token``, ``logprob`` and
      ``top_logprobs`` (a list of objects with ``token`` / ``logprob``);
    * parallel lists ``logprobs.tokens``, ``logprobs.token_logprobs`` and
      ``logprobs.top_logprobs`` (one ``{token: logprob}`` mapping per position).
    """
    content = getattr(logprobs, "content", None)
    if content is not None:
        for entry in content:
            alternatives = [(alt.token, alt.logprob) for alt in (entry.top_logprobs or [])]
            yield entry.token, entry.logprob, alternatives
        return

    for tok, lp, top in zip(logprobs.tokens,
                            logprobs.token_logprobs,
                            logprobs.top_logprobs):
        # `top` may be missing for a position; treat that as "no alternatives".
        yield tok, lp, list((top or {}).items())


completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    logprobs=True,
    top_logprobs=TOP_K,
)

logprobs = completion.choices[0].logprobs     # shortcut
if logprobs is None:
    # Fail with a clear message instead of an AttributeError on None further down.
    raise RuntimeError("The response contains no logprobs; the selected provider may not support them.")

records = []
for idx, (tok, lp, alternatives) in enumerate(_iter_token_logprobs(logprobs)):
    # math.exp turns each log-probability into a plain probability for display.
    chosen = f"{tok} ({math.exp(lp):.3f})"
    # Most likely alternatives first, capped at TOP_K.
    alts = sorted(alternatives, key=lambda kv: kv[1], reverse=True)[:TOP_K]
    alts_fmt = [f"{t} ({math.exp(lp_):.3f})" for t, lp_ in alts]
    alts_fmt += [''] * (TOP_K - len(alts_fmt))      # pad so every row has TOP_K cells

    records.append([idx, chosen, *alts_fmt])

df = pd.DataFrame(records,
                  columns=["Idx", "Chosen (p)",
                           *[f"Alt-{rank}" for rank in range(1, TOP_K + 1)]])

print(df.to_string(index=False))

# "And then the final message": show the generated text after the table.
print(completion.choices[0].message.content)



In [None]:
# prompt: Do same question "What is capital of france" with chat completion — allow controlling other factors like output tokens, temperature

# Generation controls, kept together so they are easy to tweak.
MAX_TOKENS = 10    # maximum number of output tokens
# Sampling temperature: lower values give less random completions. The Hugging Face
# chat-completion API documents the range as [0, 2] (not 0.0 to 1.0), so 2 is the top of
# the range and the output is expected to be highly random.
TEMPERATURE = 2

completion = client.chat.completions.create(
    model="mistralai/Mistral-7B-Instruct-v0.3",
    messages=[
        {
            "role": "user",
            "content": "What is the capital of France?"
        }
    ],
    max_tokens=MAX_TOKENS,
    temperature=TEMPERATURE,
)

# Last expression of the cell: displays the generated text.
completion.choices[0].message.content