In [8]:
!pip install -qU datasets==2.14.5 openai==1.14.3 pinecone-client==3.2.2 cohere==5.2.2

In [43]:
# Load variables from a local .env file into the process environment; later cells
# read PINECONE_API_KEY and COHERE_API_KEY through os.environ.
from dotenv import load_dotenv
load_dotenv()

True

In [4]:
from datasets import load_dataset

# Fetch only the "train" split of the chunked arXiv dataset, then echo the
# Dataset summary (feature names and row count) as the cell output.
data = load_dataset(
    path="jamescalam/ai-arxiv-chunked",
    split="train",
)
data

  from .autonotebook import tqdm as notebook_tqdm
Downloading data: 100%|██████████| 153M/153M [00:02<00:00, 66.2MB/s]
Downloading data files: 100%|██████████| 1/1 [00:02<00:00,  2.60s/it]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 1134.21it/s]
Generating train split: 41584 examples [00:00, 121264.58 examples/s]


Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [5]:
# Peek at the first record to see the raw fields (doi, chunk-id, chunk text, ...).
data[0]

{'doi': '1910.01108',
 'chunk-id': '0',
 'chunk': 'DistilBERT, a distilled version of BERT: smaller,\nfaster, cheaper and lighter\nVictor SANH, Lysandre DEBUT, Julien CHAUMOND, Thomas WOLF\nHugging Face\n{victor,lysandre,julien,thomas}@huggingface.co\nAbstract\nAs Transfer Learning from large-scale pre-trained models becomes more prevalent\nin Natural Language Processing (NLP), operating these large models in on-theedge and/or under constrained computational training or inference budgets remains\nchallenging. In this work, we propose a method to pre-train a smaller generalpurpose language representation model, called DistilBERT, which can then be ﬁnetuned with good performances on a wide range of tasks like its larger counterparts.\nWhile most prior work investigated the use of distillation for building task-speciﬁc\nmodels, we leverage knowledge distillation during the pre-training phase and show\nthat it is possible to reduce the size of a BERT model by 40%, while retaining 97%\nof i

In [6]:
# Second record: same doi as the first with the next chunk-id, i.e. one paper spans several rows.
data[1]

{'doi': '1910.01108',
 'chunk-id': '1',
 'chunk': 'loss combining language modeling, distillation and cosine-distance losses. Our\nsmaller, faster and lighter model is cheaper to pre-train and we demonstrate its\ncapabilities for on-device computations in a proof-of-concept experiment and a\ncomparative on-device study.\n1 Introduction\nFigure 1: Parameter counts of several recently released\npretrained language models.The last two years have seen the rise\nof Transfer Learning approaches in\nNatural Language Processing (NLP)\nwith large-scale pre-trained language\nmodels becoming a basic tool in\nmany NLP tasks [Devlin et al., 2018,\nRadford et al., 2019, Liu et al., 2019].\nWhile these models lead to signiﬁcant improvement, they often have\nseveral hundred million parameters\nand current research1on pre-trained\nmodels indicates that training even\nlarger models still leads to better performances on downstream tasks.\nThe trend toward bigger models\nraises several concerns. First is 

In [7]:
def _to_record(row):
    """Reshape one raw dataset row into the id / text / metadata layout used for upserting."""
    chunk_text = row["chunk"]
    return {
        # paper id and chunk number combined into a per-chunk id
        "id": f'{row["id"]}-{row["chunk-id"]}',
        "text": chunk_text,
        "metadata": {
            "title": row["title"],
            "url": row["source"],
            "primary_category": row["primary_category"],
            "published": row["published"],
            "updated": row["updated"],
            # the chunk text is duplicated here so it is returned with query matches
            "text": chunk_text,
        },
    }


data = data.map(_to_record)

# drop unneeded columns: everything except the id / text / metadata fields built above
unneeded_columns = [
    "title", "summary", "source",
    "authors", "categories", "comment",
    "journal_ref", "primary_category",
    "published", "updated", "references",
    "doi", "chunk-id",
    "chunk",
]
data = data.remove_columns(unneeded_columns)
data

Map: 100%|██████████| 41584/41584 [00:07<00:00, 5689.43 examples/s]


Dataset({
    features: ['id', 'text', 'metadata'],
    num_rows: 41584
})

In [24]:
from openai import OpenAI
# Client used by the embedding calls later in the notebook.
client = OpenAI()

# Embedding model used when indexing the dataset. The Pinecone index is created with
# dimension=1536 — NOTE(review): confirm that matches this model's output size.
embed_model = "text-embedding-ada-002"

In [17]:
import os
from pinecone import Pinecone, PodSpec

# Build the Pinecone client from the PINECONE_API_KEY environment variable
# (os.environ.get yields None when the variable is not set).
pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [21]:
import time

index_name = "starter-index"

# list_indexes() returns a collection of index descriptions rather than bare names,
# so `index_name in pc.list_indexes()` never matches; compare against .names() instead.
existing_index_names = pc.list_indexes().names()

# create the index only if it does not exist yet (i.e. on the first run)
if index_name not in existing_index_names:
    pc.create_index(
        name=index_name,
        dimension=1536,  # must equal the vector size of the embedding model used for indexing
        metric="cosine",
        spec=PodSpec(
            environment="gcp-starter"
        )
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# connect to index
index = pc.Index(index_name)

In [22]:
# Echo the index handle to confirm the connection object exists.
index

<pinecone.data.index.Index at 0x7fdbd2a226e0>

In [None]:
from openai import OpenAI
client = OpenAI()

def get_embedding(text, model="text-embedding-3-large"):
    """Return the embedding vector for a single piece of text.

    Note: the default model differs from `embed_model`, which is used to embed the
    dataset for the index; pass `model=` explicitly when querying that index.

    Args:
        text: The string to embed; newlines are replaced with spaces first.
        model: Name of the OpenAI embedding model to call.

    Returns:
        The embedding of `text` (the first and only item in the response).
    """
    text = text.replace("\n", " ")
    response = client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding

# Smoke test: show only the vector length and its first few values instead of
# dumping the whole embedding into the notebook output.
sample_embedding = get_embedding("Hello, world!")
len(sample_embedding), sample_embedding[:5]

In [None]:
from tqdm.auto import tqdm

batch_size = 100  # how many embeddings we create and insert at once
max_retries = 5   # embedding attempts per batch before giving up

# Embed the dataset batch by batch and upsert each batch into the Pinecone index.
# Progress is reported by the tqdm bar, so no per-batch print is needed.
for i in tqdm(range(0, len(data), batch_size)):
    # find end of batch
    i_end = min(len(data), i + batch_size)
    # slicing the dataset gives a dict of columns, indexed below by "id", "text", "metadata"
    batch = data[i:i_end]
    # create embeddings (exponential backoff to avoid RateLimitError)
    res = None
    last_error = None
    for attempt in range(max_retries):
        try:
            res = client.embeddings.create(input=batch["text"], model=embed_model)
            break  # success: stop retrying, otherwise the batch is embedded again each pass
        except Exception as e:
            last_error = e
            # back off only when another attempt will actually follow
            if attempt < max_retries - 1:
                print("Retrying...")
                time.sleep(2 ** attempt)  # wait 2^attempt seconds before retrying
    if res is None:
        # chain the last error so the real cause is visible in the traceback
        raise RuntimeError("Failed to create embeddings.") from last_error
    # the openai v1 response is an object, not a dict: use attribute access
    embeds = [record.embedding for record in res.data]
    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [None]:
def get_docs(query: str, top_k: int):
    """Retrieve the `top_k` chunks most similar to `query` from the Pinecone index.

    Args:
        query: Natural-language search query.
        top_k: Number of matches to request from the index.

    Returns:
        A dict mapping each matched chunk's text to its position in the result
        list (0 = first match). Identical texts collapse into one key.
    """
    # encode the query with the same model that was used to embed the indexed chunks,
    # otherwise the query vector does not match the index's vectors
    xq = get_embedding(query, model=embed_model)
    # search pinecone index (the query vector must be passed by keyword)
    res = index.query(vector=xq, top_k=top_k, include_metadata=True)
    # get doc text
    docs = {x["metadata"]['text']: i for i, x in enumerate(res["matches"])}
    return docs

In [None]:
query = "can you explain why we would want to do rlhf?"
docs = get_docs(query, top_k=25)
# dict views cannot be sliced, so materialise the keys (kept in match order) before slicing
print("\n---\n".join(list(docs)[:3]))  # print the first 3 docs

## Rerank with Cohere

In [45]:
import cohere

# Create the Cohere client, taking the key from the COHERE_API_KEY environment variable.
cohere_api_key = os.environ.get("COHERE_API_KEY")
co = cohere.Client(api_key=cohere_api_key)