In [1]:
!pip install -qU datasets==2.14.5 openai==1.14.3 pinecone-client==3.2.2 cohere==5.2.2 boto3==1.34.72

In [3]:
from dotenv import load_dotenv
load_dotenv()

True

In [25]:
from datasets import load_dataset

data = load_dataset("jamescalam/ai-arxiv-chunked", split="train")
data

Dataset({
    features: ['doi', 'chunk-id', 'chunk', 'id', 'title', 'summary', 'source', 'authors', 'categories', 'comment', 'journal_ref', 'primary_category', 'published', 'updated', 'references'],
    num_rows: 41584
})

In [26]:
data[0]

{'doi': '1910.01108',
 'chunk-id': '0',
 'chunk': 'DistilBERT, a distilled version of BERT: smaller,\nfaster, cheaper and lighter\nVictor SANH, Lysandre DEBUT, Julien CHAUMOND, Thomas WOLF\nHugging Face\n{victor,lysandre,julien,thomas}@huggingface.co\nAbstract\nAs Transfer Learning from large-scale pre-trained models becomes more prevalent\nin Natural Language Processing (NLP), operating these large models in on-theedge and/or under constrained computational training or inference budgets remains\nchallenging. In this work, we propose a method to pre-train a smaller generalpurpose language representation model, called DistilBERT, which can then be ﬁnetuned with good performances on a wide range of tasks like its larger counterparts.\nWhile most prior work investigated the use of distillation for building task-speciﬁc\nmodels, we leverage knowledge distillation during the pre-training phase and show\nthat it is possible to reduce the size of a BERT model by 40%, while retaining 97%\nof i

In [27]:
data = data.map(lambda x: {
    "id": f'{x["id"]}-{x["chunk-id"]}',
    "text": x["chunk"],
    "metadata": {
        "title": x["title"],
        "url": x["source"],
        "primary_category": x["primary_category"],
        "published": x["published"],
        "updated": x["updated"],
        "text": x["chunk"],
    }
})
# drop uneeded columns
data = data.remove_columns([
    "title", "summary", "source",
    "authors", "categories", "comment",
    "journal_ref", "primary_category",
    "published", "updated", "references",
    "doi", "chunk-id",
    "chunk"
])
data

Dataset({
    features: ['id', 'text', 'metadata'],
    num_rows: 41584
})

In [4]:
import os
# from pinecone import Pinecone, ServerlessSpec
from pinecone import Pinecone, PodSpec
pc = Pinecone(
    api_key=os.environ.get("PINECONE_API_KEY")
)

  from tqdm.autonotebook import tqdm


In [None]:
import time

index_name = "rerank"

# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes():
    # if does not exist, create index
    pc.create_index(
        name=index_name,
        # in alignment with the number of dimensions in cohere 1024
        dimension=1024,
        metric="cosine",
        spec=PodSpec(
            environment="gcp-starter"
        )
    )
    # wait for index to be initialized
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

In [12]:
# connect to index
index_name = "rerank"
index = pc.Index(index_name)

In [5]:
# from openai import OpenAI
# client = OpenAI()

# # use OpenAI ada to get embeddings
# def generate_text_embeddings(text, model = "text-embedding-ada-002"):
#    text = text.replace("\n", " ")
#    return client.embeddings.create(input = [text], model=model).data[0].embedding

# generate_text_embeddings("Hello, world!")

import json
import logging
import boto3

from botocore.exceptions import ClientError

logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

def generate_text_embeddings(model_id, body):
    """
    Generate text embedding by using the Cohere Embed model.
    Args:
        model_id (str): The model ID to use.
        body (str) : The reqest body to use.
    Returns:
        dict: The response from the model.
    """

    logger.info(
        "Generating text emdeddings with the Cohere Embed model %s", model_id)

    accept = '*/*'
    content_type = 'application/json'

    bedrock = boto3.client(service_name='bedrock-runtime', region_name="us-east-1")

    response = bedrock.invoke_model(
        body=body,
        modelId=model_id,
        accept=accept,
        contentType=content_type
    )

    logger.info("Successfully generated text with Cohere model %s", model_id)

    return response

model_id = 'cohere.embed-multilingual-v3'
text1 = "hello world"
text2 = "this is a test"
input_type = "search_document"

def preflight():
    try:
        body = json.dumps({
            "texts": [text1, text2],
            "input_type": input_type}
        )
        response = generate_text_embeddings(model_id=model_id, body=body)
        response_body = json.loads(response.get('body').read())

        print(f"ID: {response_body.get('id')}")
        print(f"Response type: {response_body.get('response_type')}")

        print("Embeddings")
        for i, embedding in enumerate(response_body.get('embeddings')):
            print(f"\tEmbedding {i}")
            print(*embedding)

        print("Texts")
        for i, text in enumerate(response_body.get('texts')):
            print(f"\tText {i}: {text}")

    except ClientError as err:
        message = err.response["Error"]["Message"]
        logger.error("A client error occurred: %s", message)
        print("A client error occured: " +
                format(message))
    else:
        print(
            f"Finished generating text embeddings with Cohere model {model_id}.")

# preflight()

In [45]:
data[0:4]["text"]

['DistilBERT, a distilled version of BERT: smaller,\nfaster, cheaper and lighter\nVictor SANH, Lysandre DEBUT, Julien CHAUMOND, Thomas WOLF\nHugging Face\n{victor,lysandre,julien,thomas}@huggingface.co\nAbstract\nAs Transfer Learning from large-scale pre-trained models becomes more prevalent\nin Natural Language Processing (NLP), operating these large models in on-theedge and/or under constrained computational training or inference budgets remains\nchallenging. In this work, we propose a method to pre-train a smaller generalpurpose language representation model, called DistilBERT, which can then be ﬁnetuned with good performances on a wide range of tasks like its larger counterparts.\nWhile most prior work investigated the use of distillation for building task-speciﬁc\nmodels, we leverage knowledge distillation during the pre-training phase and show\nthat it is possible to reduce the size of a BERT model by 40%, while retaining 97%\nof its language understanding capabilities and being 

In [43]:
len(data)

41584

In [None]:
from tqdm.auto import tqdm

batch_size = 5  # how many embeddings we create and insert at once

for i in tqdm(range(0, len(data), batch_size)):
    passed = False
    # find end of batch
    i_end = min(len(data), i+batch_size)
    # create batch
    batch = data[i:i_end]
    # create embeddings (exponential backoff to avoid RateLimitError)
    for j in range(5):  # max 5 retries
        try:
            # res = client.embeddings.create(input=batch["text"], model=embed_model)
            body = json.dumps({
                "texts": batch["text"],
                "input_type": input_type}
            )
            res = generate_text_embeddings(model_id=model_id, body=body)
            print(f"Created embeddings for batch {i}-{i_end}.")
            passed = True
        except Exception as e:
            time.sleep(2**j)  # wait 2^j seconds before retrying
            print("Retrying...")
    if not passed:
        raise RuntimeError("Failed to create embeddings.")
    # get embeddings
    response_body = json.loads(res.get('body').read())
    # for i, embedding in enumerate(response_body.get('embeddings')):
    #     print(f"\tEmbedding {i}")
    #     print(*embedding)
    # embeds = [embedding for i, embedding in enumerate(response_body.get('embeddings'))]
    embeds = [float(embedding) for embedding in response_body.get('embeddings')]
    to_upsert = list(zip(batch["id"], embeds, batch["metadata"]))
    print(f"Upserting batch {i}-{i_end} to Pinecone with embeds {embeds}")
    # upsert to Pinecone
    index.upsert(vectors=to_upsert)

In [26]:
def get_docs(query: str, top_k: int):
    # assemble the body
    body = json.dumps({
        "texts": [query],
        "input_type": input_type}
    )
    res = generate_text_embeddings(model_id=model_id, body=body)
    response_body = json.loads(res.get('body').read())
    # xq = [float(embedding) for embedding in response_body.get('embeddings')]
    xq = [[float(value) for value in embedding] for embedding in response_body.get('embeddings')]
    print(f"Created embeddings for query {query} with model {model_id} and the embeddings are {xq}")
    # search pinecone index
    res = index.query(vector=xq, top_k=top_k, include_metadata=True)
    """

    {'matches': 
        [
                {'id': '1703.04933-25',
                        'metadata': {'primary_category': 'cs.LG',
                                'published': '20170315',
                                'text': '...',
                                'title': 'Sharp Minima Can Generalize For Deep Nets',
                                'updated': '20170515',
                                'url': 'http://arxiv.org/pdf/1703.04933'},
                        'score': 0.588596284,
                        'values': []
              },
              ...
        ]
    }
    """
    # get doc text
    docs = {x["metadata"]['text']: i for i, x in enumerate(res["matches"])}
    return docs

In [33]:
query = "can you explain why we would want to do rlhf?"
docs = get_docs(query, top_k=25)

"""
docs format:
{'docs content 0': 0, 'docs content 1': 1, 'docs content 2': 2}
"""
# Limit the number of documents to show to 3
for doc, i in list(docs.items())[:3]:
    print(f"Document {i}: {doc}\n")


INFO:__main__:Generating text emdeddings with the Cohere Embed model cohere.embed-multilingual-v3
INFO:__main__:Successfully generated text with Cohere model cohere.embed-multilingual-v3


Created embeddings for query can you explain why we would want to do rlhf? with model cohere.embed-multilingual-v3 and the embeddings are [[0.011528015, 0.038635254, 0.0345459, 0.027618408, -0.017654419, 0.012954712, -0.0032405853, -0.03363037, 0.0023975372, -0.04006958, 0.010719299, 0.014305115, -0.008773804, 0.021835327, -0.04385376, 0.017303467, 0.040130615, -0.0043792725, 0.04888916, -0.004753113, -0.027053833, 0.013305664, 0.06011963, -0.05419922, -0.051879883, 0.04498291, -0.0044784546, -0.014053345, 0.022079468, -0.022994995, 0.007904053, -0.0103302, 0.06237793, 0.012413025, 0.027755737, 0.031951904, -0.010765076, -0.028030396, -0.004337311, 0.05456543, 0.00026392937, 0.0031089783, -0.00025773048, 0.020004272, -0.00039863586, 0.004638672, 0.014450073, -0.004722595, -0.060424805, 0.05758667, -0.03942871, 0.015403748, 0.033813477, -0.004016876, 0.015342712, 0.01878357, 0.021118164, -0.0056991577, 0.01612854, -0.0017633438, 0.014228821, -0.0070648193, -0.005718231, 0.01651001, 0.05

In [None]:
i2doc = {docs[doc]: doc for doc in docs.keys()}
i2doc

## Rerank with Cohere

In [34]:
import cohere
# instantiate the Cohere client
co = cohere.Client(api_key=os.environ.get("COHERE_API_KEY"))

In [44]:
rerank_docs = co.rerank(
    query=query, documents=list(docs.keys()), top_n=3
)

"""
rerank_docs format:
RerankResponse(id='8476ace7-df8b-4075-b1c7-50aa8ba4f7b4', results=[RerankResponseResultsItem(document=None, index=10, relevance_score=0.9916195), RerankResponseResultsItem(document=None, index=20, relevance_score=0.9553191), RerankResponseResultsItem(document=None, index=16, relevance_score=0.9465967)], meta=ApiMeta(api_version=ApiMetaApiVersion(version='1', is_deprecated=None, is_experimental=None), billed_units=ApiMetaBilledUnits(input_tokens=None, output_tokens=None, search_units=1.0, classifications=None), warnings=None))
"""
# List the top 3 documents with only its index and relevance score
for i, doc in enumerate(rerank_docs.results):
    print(f"Document {i}: Index {doc.index}, Relevance Score {doc.relevance_score}")

INFO:httpx:HTTP Request: POST https://api.cohere.ai/v1/rerank "HTTP/1.1 200 OK"


Document 0: Index 10, Relevance Score 0.9916195
Document 1: Index 20, Relevance Score 0.9553191
Document 2: Index 16, Relevance Score 0.9465967
