# Multi-tenant Chat with Papers - Load and chunk papers
## Get keys and URLs

In [None]:
import os
from dotenv import load_dotenv

# Read variables from a .env file into the process environment so the
# os.getenv lookups below can find them.
load_dotenv()

def mask_secret(value, visible=4):
    """Return a printable stand-in for a secret that does not reveal it.

    Args:
        value: The secret string, or None/empty when it is not configured.
        visible: Number of leading characters to keep as a hint.

    Returns:
        "<not set>" when value is empty or None; otherwise at most
        ``visible`` leading characters followed by "***". Secrets that are
        not longer than ``visible`` are hidden entirely.
    """
    if not value:
        return "<not set>"
    # Never echo a short secret in full: only show a prefix when there is
    # more of the value left hidden behind it.
    shown = value[:visible] if len(value) > visible else ""
    return f"{shown}***"


WEAVIATE_URL = os.getenv("WEAVIATE_URL")
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

# Confirm the configuration was picked up. The URL is not a credential, but
# the keys are: print them masked so they do not end up in saved notebook
# output.
print(WEAVIATE_URL)
print(mask_secret(WEAVIATE_KEY))
print(mask_secret(OPENAI_API_KEY))

## Connect to Weaviate

In [None]:
import weaviate
from weaviate.classes.init import Auth

# The OpenAI key is forwarded to Weaviate as a request header.
openai_headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

# Open a connection to the Weaviate Cloud cluster, authenticating with the
# Weaviate API key.
client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WEAVIATE_URL,
    auth_credentials=Auth.api_key(WEAVIATE_KEY),
    headers=openai_headers,
)

# Last expression of the cell: shows the readiness check result.
client.is_ready()

## Load data from arXiv

1. Get chunks from paper - `get_chunks_from_paper`
2. Create a tenant for the paper - `create_tenant`
3. Batch import chunks - `batch_import_chunks`

### 1. Get chunks from paper - `get_chunks_from_paper`

In [None]:
from distyll.text import from_arxiv_paper
from distyll.utils import chunk_text

def get_chunks_from_paper(url):
    """Fetch a paper by URL and attach its chunks and an id to the result.

    Args:
        url: Paper URL; the id derivation below expects the
            "https://arxiv.org/pdf/<id>.pdf" form.

    Returns:
        The object returned by ``from_arxiv_paper`` with two extra keys:
        "arxiv_id" (the URL with the arXiv PDF prefix and ".pdf" removed and
        every "." replaced by "-") and "chunks" (the result of
        ``chunk_text`` on the paper's "text").
    """
    paper = from_arxiv_paper(url)
    text_chunks = chunk_text(source_text=paper["text"])

    # Reduce the URL to its bare id, then swap dots for dashes.
    without_prefix = url.replace("https://arxiv.org/pdf/", "")
    without_suffix = without_prefix.replace(".pdf", "")
    paper["arxiv_id"] = without_suffix.replace(".", "-")

    paper["chunks"] = text_chunks
    return paper

In [None]:
# Chunk one paper and show the resulting object as the cell output.
chunked_2212 = get_chunks_from_paper("https://arxiv.org/pdf/2212.10496.pdf")
chunked_2212

### 2. Create a tenant for the paper - `create_tenant`

In [None]:
from weaviate.classes.tenants import Tenant
# Handle to the "Papers" collection; the tenant and import helpers in this
# notebook operate on it.
papers = client.collections.get("Papers")

def create_tenant(chunked_paper):
    """Create a tenant on the ``papers`` collection named after the paper.

    Args:
        chunked_paper: Object providing an "arxiv_id" key, which is used
            as the tenant name.

    Returns:
        The tenant name that was passed to the create call.
    """
    tenant_name = chunked_paper["arxiv_id"]
    new_tenant = Tenant(name=tenant_name)
    papers.tenants.create([new_tenant])
    return tenant_name

In [None]:
# Create the tenant for the paper chunked earlier; the cell shows its name.
create_tenant(chunked_2212)

In [None]:
# Display the tenants of the "Papers" collection to check the result.
papers.tenants.get()

### 3. Batch import chunks - `batch_import_chunks`

In [None]:
def batch_import_chunks(chunked_paper):
    """Import every chunk of a paper into that paper's tenant.

    Each chunk is stored as one object carrying the paper's title and URL,
    the chunk itself, and its zero-based position in the paper.

    Args:
        chunked_paper: Object providing the keys "arxiv_id" (used as the
            tenant name), "title", "url" and "chunks" (an iterable of
            chunks).

    Prints a one-line summary, followed by each failed object when the
    batch reported failures.
    """
    # Scope the collection to this paper's tenant; all writes go through it.
    ten = papers.with_tenant(chunked_paper["arxiv_id"])

    with ten.batch.dynamic() as batch:
        for chunk_no, chunk in enumerate(chunked_paper["chunks"]):
            batch.add_object({
                "title": chunked_paper["title"],
                "url": chunked_paper["url"],
                "chunk": chunk,
                "chunk_no": chunk_no,
            })

    # Read failures from the same tenant-scoped handle that ran the batch.
    # The unscoped ``papers`` handle is a different object and would not
    # report this batch's errors.
    failed = ten.batch.failed_objects
    if failed:
        print("Import complete with errors")
        for err in failed:
            print(err)
    else:
        print("Import complete with no errors")

In [None]:
# Import the chunks of the paper chunked earlier into its tenant.
batch_import_chunks(chunked_2212)

## End-to-end paper load

In [None]:
def import_paper(url):
    """Run the whole load for one paper: chunk it, create its tenant, import.

    Args:
        url: Paper URL, passed straight to ``get_chunks_from_paper``.
    """
    chunked_paper = get_chunks_from_paper(url)
    # The tenant must exist before chunks can be imported into it.
    create_tenant(chunked_paper)
    batch_import_chunks(chunked_paper)

In [None]:
# Run the full pipeline (chunk, create tenant, import) for another paper.
import_paper("https://arxiv.org/pdf/2401.00107.pdf")

## Close the client

In [None]:
# Release the connection opened by connect_to_weaviate_cloud.
client.close()