In [None]:
import os
from dotenv import load_dotenv, find_dotenv
# Load the .env file located by find_dotenv(); the return value is discarded into `_`.
_ = load_dotenv(find_dotenv()) # read local .env file
# os.getenv returns None when GOOGLE_AI_STUDIO_KEY is not set.
# NOTE(review): nothing here checks for a missing key, so it would only surface
# when the key is used later — confirm the .env file defines it.
AI_STUDIO_KEY = os.getenv("GOOGLE_AI_STUDIO_KEY")

## Connect

In [None]:
import weaviate

# Connect to the local instance deployed with Docker Compose.
# The Google AI Studio key is sent along as the "X-Google-Api-Key" header.
# NOTE(review): presumably the server-side Google integration reads the key from
# this header — confirm against the Weaviate version in the Compose file.
client = weaviate.connect_to_local(
    headers={
       "X-Google-Api-Key": AI_STUDIO_KEY,
    }
)

# Last expression of the cell, so its result is shown as the cell output.
client.is_ready()

In [None]:
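# Alternative: connect to a hosted Weaviate Cloud (WCS) instance instead of the local one.
# Uncomment this cell and skip the local connection cell above to use it.
# import weaviate, os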

# client = weaviate.connect_to_wcs(
#     cluster_url=os.getenv("WORKSHOP_DEMO_URL"),
#     auth_credentials=weaviate.auth.AuthApiKey(os.getenv("WORKSHOP_DEMO_KEY_ADMIN")),
#     headers={
#        "X-Palm-Api-Key": AI_STUDIO_KEY,
#     }
# )

# client.is_ready()

## Load Data from arxiv

1. Get chunks from paper - `get_chunks_from_paper`
2. Batch import chunks - `batch_import_chunks`

### 1. Get chunks from paper - `get_chunks_from_paper`

In [None]:
from distyll.text import from_arxiv_paper
from distyll.utils import chunk_text

def _arxiv_id_from_url(url):
    """Derive a dot-free arXiv identifier from a paper URL.

    Only the path component of the URL is used, so the scheme, host and any
    query string or fragment never end up in the identifier.

    Args:
        url: An arXiv link such as ``https://arxiv.org/pdf/2212.10496.pdf``.

    Returns:
        The identifier with dots replaced by dashes, e.g. ``"2212-10496"``.
    """
    from urllib.parse import urlparse

    path = urlparse(url).path
    # Keep only what follows the "/pdf/" (or "/abs/") route segment.
    for marker in ("/pdf/", "/abs/"):
        _, found, tail = path.partition(marker)
        if found:
            path = tail
            break
    path = path.strip("/")
    # Drop the file extension only when it is an actual suffix.
    if path.endswith(".pdf"):
        path = path[: -len(".pdf")]
    return path.replace(".", "-")


def get_chunks_from_paper(url):
    """Load an arXiv paper and attach its identifier and text chunks.

    Args:
        url: Link to the paper, passed unchanged to ``from_arxiv_paper``.

    Returns:
        The object returned by ``from_arxiv_paper(url)`` with two keys set:
        ``"arxiv_id"`` (the identifier taken from ``url``, dots replaced by
        dashes) and ``"chunks"`` (the result of ``chunk_text`` applied to the
        paper's ``"text"`` entry).
    """
    paper = from_arxiv_paper(url)

    paper["arxiv_id"] = _arxiv_id_from_url(url)
    paper["chunks"] = chunk_text(source_text=paper["text"])
    return paper

In [None]:
# Chunk one paper and keep the result; it is passed to batch_import_chunks further down.
chunked_2212 = get_chunks_from_paper("https://arxiv.org/pdf/2212.10496.pdf")
# Bare expression on the last line: the notebook displays the whole returned object.
chunked_2212

### 2. Batch import chunks - `batch_import_chunks`

In [None]:
# Handle to the "Papers" collection; batch_import_chunks reads this module-level name.
# NOTE(review): presumably the collection already exists on the server — nothing
# visible in this notebook creates it; confirm before running.
papers = client.collections.get("Papers")

def batch_import_chunks(chunked_paper):
    """Add every chunk of a paper to the module-level ``papers`` collection.

    Each chunk becomes one object carrying the paper's ``arxiv_id``, ``title``
    and ``url`` plus the chunk text and its zero-based position. Objects are
    added through ``papers.batch.rate_limit(100)``; afterwards a one-line
    status is printed, followed by each failed object if there were any.

    Args:
        chunked_paper: Mapping with the keys ``"arxiv_id"``, ``"title"``,
            ``"url"`` and ``"chunks"`` (an iterable of chunk values).
    """
    with papers.batch.rate_limit(100) as batch:
        for position, chunk in enumerate(chunked_paper["chunks"]):
            properties = {
                "arxiv_id": chunked_paper["arxiv_id"],
                "title": chunked_paper["title"],
                "url": chunked_paper["url"],
                "chunk": chunk,
                "chunk_no": position,
            }
            batch.add_object(properties)

    # Failures are read from the collection's batch attribute after the context exits.
    failures = papers.batch.failed_objects
    if len(failures) == 0:
        print("Import complete with no errors")
        return

    print("Import complete with errors")
    for failure in failures:
        print(failure)

In [None]:
# Import the chunks of the paper loaded earlier into chunked_2212.
batch_import_chunks(chunked_2212)

## End-to-end paper load

In [None]:
def import_paper(url):
    """Chunk the paper at ``url`` and batch-import the chunks in one step.

    Args:
        url: Paper link, forwarded to ``get_chunks_from_paper``.
    """
    batch_import_chunks(get_chunks_from_paper(url))

In [None]:
# Run the chunk-and-import pipeline end to end for another paper.
import_paper("https://arxiv.org/pdf/2401.00107.pdf")

In [None]:
# Aggregate over the whole "Papers" collection; the result is shown as the cell output.
# NOTE(review): presumably this reports the total object count, as a check on the imports — confirm.
papers.aggregate.over_all()

## Close the client when done

In [None]:
# Close the client created in the connection cell near the top of the notebook.
client.close()