# Multi-tenant Chat with Papers - Load and chunk papers
## Get keys and urls

In [25]:
import os
from dotenv import load_dotenv

load_dotenv()

WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")

print(f"Weaviate Key:{WEAVIATE_KEY}")
print(f"OpenAI API Key: {OPENAI_API_KEY}")
print(f"OpenAI URL: {OPENAI_URL}")

Weaviate Key:root-user-key
OpenAI API Key: sk-dummy-key-for-local-testing
OpenAI URL: http://host.docker.internal:11434


## Connect to Weaviate

In [26]:
import weaviate
from weaviate.classes.init import Auth

# Connect to the local instance
client = weaviate.connect_to_local(
  host="127.0.0.1", # the address to the learner's instance
  port=8080,
  grpc_port=50051,
  auth_credentials=Auth.api_key(WEAVIATE_KEY),
  headers={
    "X-OpenAI-Api-Key": OPENAI_API_KEY
  }
)

print(client.is_ready())

[2025-09-19 00:22:51,755] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/meta "HTTP/1.1 200 OK"


[2025-09-19 00:22:51,840] INFO in _client: HTTP Request: GET https://pypi.org/pypi/weaviate-client/json "HTTP/1.1 200 OK"
[2025-09-19 00:22:51,863] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/.well-known/ready "HTTP/1.1 200 OK"


True


## Load Data from arxiv

1. Get chunks from paper - `get_chunks_from_paper`
2. Create a tenant for the paper - `create_tenant`
3. Batch import chunks - `batch_import_chunks`

### 1. Get chunks from paper - `get_chunks_from_paper`

In [27]:
from distyll.text import from_arxiv_paper
from distyll.utils import chunk_text

def get_chunks_from_paper(url):
    paper = from_arxiv_paper(url)
    chunks = chunk_text(source_text=paper["text"])

    paper["arxiv_id"] = url.replace("https://arxiv.org/pdf/", "").replace(".pdf", "").replace(".", "-")
    paper["chunks"] = chunks
    return paper

In [28]:
chunked_2212 = get_chunks_from_paper("https://arxiv.org/pdf/2212.10496.pdf")
chunked_2212

[2025-09-19 00:22:51,882] INFO in text: Getting arXiV paper from https://arxiv.org/pdf/2212.10496.pdf
[2025-09-19 00:22:51,887] INFO in utils: Getting arXiV title from https://arxiv.org/abs/2212.10496
[2025-09-19 00:22:51,975] INFO in utils: Chunking text of 41145 characters with words method.
[2025-09-19 00:22:51,979] INFO in utils: Chunking text of 41125 chars by number of words.


{'title': 'Precise Zero-Shot Dense Retrieval without Relevance Labels',
 'url': 'https://arxiv.org/pdf/2212.10496.pdf',
 'text': '\nPrecise Zero-Shot Dense Retrieval without Relevance Labels\nLuyu Gao∗†Xueguang Ma∗‡Jimmy Lin‡Jamie Callan†\n†Language Technologies Institute, Carnegie Mellon University\n‡David R. Cheriton School of Computer Science, University of Waterloo\n{luyug, callan}@cs.cmu.edu, {x93ma, jimmylin}@uwaterloo.ca\nAbstract\nWhile dense retrieval has been shown effec-\ntive and efﬁcient across tasks and languages,\nit remains difﬁcult to create effective fully\nzero-shot dense retrieval systems when no rel-\nevance label is available. In this paper, we\nrecognize the difﬁculty of zero-shot learning\nand encoding relevance. Instead, we pro-\npose to pivot through Hy pothetical D ocument\nEmbeddings ( HyDE ). Given a query, HyDE ﬁrst\nzero-shot instructs an instruction-following\nlanguage model (e.g. InstructGPT ) to gen-\nerate a hypothetical document. The docu-\nment capt

### 2. Create a tenant for the paper - `create_tenant`

In [29]:
from weaviate.classes.tenants import Tenant
papers = client.collections.get("Papers")

def create_tenant(chunked_paper):
    tenant_name = chunked_paper["arxiv_id"]

    papers.tenants.create([
        Tenant(name=tenant_name)
    ])

    return tenant_name

In [30]:
create_tenant(chunked_2212)

[2025-09-19 00:22:52,080] INFO in _client: HTTP Request: POST http://127.0.0.1:8080/v1/schema/Papers/tenants "HTTP/1.1 200 OK"


'2212-10496'

In [31]:
papers.tenants.get()

{'2212-10496': TenantOutput(name='2212-10496', activityStatusInternal=<TenantActivityStatus.ACTIVE: 'ACTIVE'>, activityStatus=<_TenantActivistatusServerValues.HOT: 'HOT'>)}

### 3. Batch import chunks - `batch_import_chunks`

In [32]:
def batch_import_chunks(chunked_paper):
    ten = papers.with_tenant(chunked_paper["arxiv_id"])

    i=0
    with ten.batch.dynamic() as batch:
        for chunk in chunked_paper["chunks"]:
            batch.add_object({
                "title": chunked_paper["title"],
                "url": chunked_paper["url"],
                "chunk": chunk,
                "chunk_no": i,
            })
            i+=1

    # if(len(papers.batch.failed_objects)>0):
    if(len(ten.batch.failed_objects)>0):
        print("Import complete with errors")
        for err in papers.batch.failed_objects:
            print(err)
    else:
        print("Import complete with no errors")

In [33]:
batch_import_chunks(chunked_2212)

[2025-09-19 00:22:52,141] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/schema/Papers "HTTP/1.1 200 OK"
[2025-09-19 00:22:52,147] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:53,159] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:54,186] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:55,191] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:56,197] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:57,199] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:58,205] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:22:59,210] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2

Import complete with no errors


## End-to-end paper load

In [34]:
def import_paper(url):
    cp = get_chunks_from_paper(url)
    tenant_name = create_tenant(cp)
    batch_import_chunks(cp)

In [35]:
import_paper("https://arxiv.org/pdf/2401.00107.pdf")

[2025-09-19 00:23:06,239] INFO in text: Getting arXiV paper from https://arxiv.org/pdf/2401.00107.pdf
[2025-09-19 00:23:06,240] INFO in utils: Getting arXiV title from https://arxiv.org/abs/2401.00107
[2025-09-19 00:23:06,315] INFO in utils: Chunking text of 47862 characters with words method.
[2025-09-19 00:23:06,317] INFO in utils: Chunking text of 47618 chars by number of words.
[2025-09-19 00:23:06,323] INFO in _client: HTTP Request: POST http://127.0.0.1:8080/v1/schema/Papers/tenants "HTTP/1.1 200 OK"
[2025-09-19 00:23:06,325] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/schema/Papers "HTTP/1.1 200 OK"
[2025-09-19 00:23:06,328] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:23:07,331] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:23:08,339] INFO in _client: HTTP Request: GET http://127.0.0.1:8080/v1/nodes "HTTP/1.1 200 OK"
[2025-09-19 00:23:09,344] INFO in _clien

Import complete with no errors


## Close the client

In [36]:
client.close()