In [1]:
!pip install -qU \
  datasets==2.14.6 \
  transformers==4.35.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m78.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m33.2 MB/s[0m eta [36m0:00:00[0m
[?25h

## Dataset Download

We're going to test with a more real world use-case, with messy, imperfect data. We will use the [`jamescalam/ai-arxiv-chunked`](https://huggingface.co/datasets/jamescalam/ai-arxiv-chunked) dataset.

In [2]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/153M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

First we define our embedding function.

In [3]:
import torch
from torch.nn.functional import normalize
from transformers import AutoModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device}")

model_id = "infgrad/stella-base-en-v2"

# initialize tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModel.from_pretrained(model_id).to(device)
model.eval()

def embed(docs: list[str]) -> list[list[float]]:
    # tokenize
    tokens = tokenizer(
        docs, padding=True, max_length=512, truncation=True, return_tensors="pt"
    ).to(device)
    with torch.no_grad():
        # process with model for token-level embeddings
        out = model(**tokens)
        # mask padding tokens
        last_hidden = out.last_hidden_state.masked_fill(
            ~tokens["attention_mask"][..., None].bool(), 0.0
        )
        # create mean pooled embeddings
        doc_embeds = last_hidden.sum(dim=1) / \
            tokens["attention_mask"].sum(dim=1)[..., None]
    return doc_embeds.cpu().numpy()

Using cuda


Downloading (…)okenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/696 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/219M [00:00<?, ?B/s]

Use this to build a Numpy array of cohere embedding vectors.

In [4]:
from tqdm.auto import tqdm
import numpy as np

chunks = data["chunk"]
batch_size = 128

for i in tqdm(range(0, len(chunks), batch_size)):
    i_end = min(len(chunks), i+batch_size)
    chunk_batch = chunks[i:i_end]
    # embed current batch
    embed_batch = embed(chunk_batch)
    # add to existing np array if exists (otherwise create)
    if i == 0:
        arr = embed_batch.copy()
    else:
        arr = np.concatenate([arr, embed_batch.copy()])

  0%|          | 0/325 [00:00<?, ?it/s]

Now we need to create the query mechanism, this is simply a cosine similarity calculation between a query vector and our `arr` vectors.

In [5]:
from numpy.linalg import norm

# convert chunks list to array for easy indexing
chunk_arr = np.array(chunks)

def query(text: str, top_k: int=3) -> list[str]:
    # create query embedding
    xq = embed([text])[0]
    # calculate cosine similarities
    sim = np.dot(arr, xq.T) / (norm(arr, axis=1)*norm(xq.T))
    # get indices of top_k records
    idx = np.argpartition(sim, -top_k)[-top_k:]
    docs = chunk_arr[idx]
    for d in docs.tolist():
        print(d)
        print("----------")

In [6]:
query("why should I use llama 2?")

chat with itself. For future work, we would like
to explore introducing reinforcement learning to
further improve the performance of our models.
Limitations
Foundation Model Similar to other language
models, Baize may suffer from hallucination, toxicity and stereotypes. Particularly, Baize inherits the
out-of-date knowledge from LLaMA. Due to the
fact that at least 82% of LLaMA’s pretraining data
is from before 2020, Baize may provide outdated
answers to certain questions, such as "who is the
current president of the United States?" Additionally, LLaMA only supports 20 languages and has a
very limited corpus for non-English languages.
Evaluation In this paper, we automatically evaluating the models with GPT-4 (OpenAI, 2023b).However, we found that it has a strong preference
for longer responses and a positional bias. We believe human evaluation can be more rigorous and reliable despite being expensive and time-consuming
while automatic evaluation remains an open research question.
Lice

In [7]:
query("can you tell me about red teaming for llama 2?")

for red teaming (§3). Throughout the design of our experiments, we arrived at many junctures in which
we were unsure about how to proceed, even after a literature review on red teaming AI systems (§2). As
such, we conducted informational interviews with experts in the ﬁeld of Trust & Safety and incorporated
their suggested best practices (§A.2) into the design of our experiments in order to ensure the well-being of
the red team. In general, we found that red team members enjoyed participating in our experiments and felt
motivated by a mission to make AI systems less harmful (§A.2). Nevertheless, our work suffers from some
limitations, which we discuss in §5.1. Based on our experiences, we propose some policy interventions for
how we can work together as a community to develop shared norms, practices, and technical standards for
how to red team language models (§5.2).
2 Related Work
We use the same models that we developed in our previous work where we train a general language assistant

In [8]:
query("what is the best llm?")

as well as the amount of data required to outperform LLMs. We surpass the performance of 540B
parameter LLMs using a 770M T5 model; this
smaller model only uses 80% of a labeled dataset
that would otherwise be required if using an exist-ing ﬁnetuning method. When only unlabeled data
is present, our small models still perform on par or
better than LLMs. We outperform 540B PaLM’s
performance with only a 11B T5 model. We further
show that when a smaller model performs worse
than an LLM, Distilling step-by-step can more efﬁciently leverage additional unlabeled data to match
the LLM performance compared to the standard
distillation approach.
2 Related work
Our work distills task-speciﬁc knowledge of LLMs
into smaller specialist models by leveraging the
emergent reasoning capabilities of today’s LLMs.
We draw on knowledge distillation research and
methods that learn from both human-generated rationales and LLM-generated rationales.
Knowledge distillation from large models.
Knowledge distilla

In [9]:
query("what is the difference between gpt-4 and llama 2?")

to GPT-3 corresponds to the Stanford Alpaca model. From Figure 3(a), we observe that ( i) For the
“Helpfulness” criterion, GPT-4 is the clear winner with 54.12% of the votes. GPT-3 only wins 19.74%
of the time. ( ii) For the “Honesty” and “Harmlessness” criteria, the largest portion of votes goes
to the tie category, which is substantially higher than the winning categories but GPT-3 (Alpaca) is
slightly superior.
Second, we compare GPT-4-instruction-tuned LLaMA models against the teacher model GPT-4 in
Figure 3(b). The observations are quite consistent over the three criteria: GPT-4-instruction-tuned
LLaMA performs similarly to the original GPT-4. We conclude that learning from GPT-4 generated
5
60% 70% 80% 90% 100%12345BRanking Group 94% 624 : 66792% 614 : 67091% 623 : 68289% 597 : 66989% 605 : 67891% 609 : 666
----------
tasks.
This represents work in progress, and several directions can be explored: (i)Data and model scale .
The GPT-4 data size is 52K and the base LLaMA model size 

---