[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/integrations/data-platforms/box/weaviate_box.ipynb)

## Python Jupyter Notebook Recipe: Weaviate + Box Integration RAG Chat Demo

Author: Alexander Novotny from Box

This notebook demonstrates how to:
1. Authenticate with Box using a developer token via the Box Python-gen SDK.
2. Create a Box folder and upload demo files.
3. Retrieve the file content from the Box folder, using Box's text representations.
4. Generate embeddings for the file content using Weaviate Embeddings.
5. Store the embeddings and metadata in Weaviate.
6. Implement a Q&A service to query the content using Weaviate’s new Query Agent service.

### Prerequisites
- A Box account with a custom application and developer token (you can generate one in the Box Developer Console).
- A Weaviate cloud instance + cluster.

### Step 1: Install Dependencies
First, install the required Python packages in your Jupyter environment.

In [None]:
!python3 -m venv venv
!source venv/bin/activate
!pip3 install "weaviate-client[agents]==4.11.1" box-sdk-gen requests

### Step 2: Import Libraries
Import the necessary libraries for Box and Weaviate.

In [3]:
import weaviate
from weaviate.auth import AuthApiKey
from weaviate.agents.query import QueryAgent
from box_sdk_gen import BoxClient, BoxDeveloperTokenAuth, CreateFolderParent
import re
import requests
from pathlib import Path

### Step 3: Authentication
Set up authentication for Box and Weaviate

In [None]:
# Box Developer Token (replace with your own).
# Generate one in the Box Developer Console; treat it as a secret and do not
# commit the real value.
BOX_DEVELOPER_TOKEN = 'TOKEN'

# Weaviate Instance URL and API Key (replace with your own).
# Both are passed to main() to open the Weaviate connection.
WEAVIATE_URL = 'URL'
WEAVIATE_API_KEY = 'API KEY'


def main(box_token: str, weaviate_url: str, weaviate_api_key: str):
    """Create the Box and Weaviate clients used by the rest of the notebook.

    Args:
        box_token: Box developer token used to authenticate the Box client.
        weaviate_url: URL of the Weaviate Cloud cluster to connect to.
        weaviate_api_key: API key for the cluster. When empty, the connection
            is attempted without credentials.

    Returns:
        A ``(box_client, weaviate_client)`` tuple.
    """
    # Initialize Box Client
    auth = BoxDeveloperTokenAuth(token=box_token)
    box_client = BoxClient(auth=auth)

    # Initialize Weaviate Client. `connect_to_weaviate_cloud` is the
    # replacement for the deprecated `connect_to_wcs` helper and takes the
    # same cluster URL / credentials arguments.
    weaviate_client = weaviate.connect_to_weaviate_cloud(
        cluster_url=weaviate_url,
        auth_credentials=AuthApiKey(weaviate_api_key) if weaviate_api_key else None,
    )

    return box_client, weaviate_client

# Build both clients once. The functions defined in later cells read
# `box_client` and `weaviate_client` as notebook-level globals.
box_client, weaviate_client = main(
    BOX_DEVELOPER_TOKEN, WEAVIATE_URL, WEAVIATE_API_KEY
)
print("Clients initialized successfully.")

### Step 4: Define Weaviate Schema
Create a schema in Weaviate to store document embeddings and metadata. We’ll use Weaviate's built-in `text2vec_weaviate` vectorizer.

In [None]:
from weaviate.classes.config import Property, DataType, Configure

# Create the "Documents" collection on the first run; later runs reuse it as-is.
if weaviate_client.collections.exists("Documents"):
    print("Schema 'Documents' already exists.")
else:
    # Only `content` is vectorized. Every other property is stored as plain
    # metadata (skip_vectorization=True).
    weaviate_client.collections.create(
        name="Documents",
        generative_config=Configure.Generative.cohere(),
        properties=[
            Property(name="file_id", data_type=DataType.TEXT, skip_vectorization=True),
            Property(name="file_name", data_type=DataType.TEXT, skip_vectorization=True),
            Property(name="chunk_index", data_type=DataType.INT, skip_vectorization=True),
            Property(name="content", data_type=DataType.TEXT),
            Property(name="created_date", data_type=DataType.TEXT, skip_vectorization=True),
        ],
        vectorizer_config=Configure.Vectorizer.text2vec_weaviate(),
    )
    print("Schema 'Documents' created successfully.")

### Step 5: Upload files to Box
Creates a demo folder in Box and uploads the supported files from a local `demo_files` directory.

In [None]:
# Lowercase file extensions (with leading dot) that the upload step accepts.
SUPPORTED_TEXT_FILE_TYPES = {
    ".doc", ".docx", ".pdf", ".txt", ".html", ".md", ".json", ".xml",
    ".ppt", ".pptx", ".key", ".xls", ".xlsx", ".csv"
}


def is_supported_file_type(file_name):
    """Return True when *file_name* ends with a supported extension.

    The comparison is case-insensitive, so ``REPORT.PDF`` is accepted just
    like ``report.pdf``.
    """
    # str.endswith accepts a tuple, so one call covers every extension.
    return file_name.lower().endswith(tuple(SUPPORTED_TEXT_FILE_TYPES))

def create_and_populate_folder(client: BoxClient):
    """Create the demo folder in Box and upload the local demo files into it.

    The local ``demo_files`` directory is validated *before* anything is
    created remotely, so a missing directory no longer leaves an empty folder
    behind in Box.

    Args:
        client: Authenticated Box client used for folder creation and uploads.

    Returns:
        A list with one uploaded-file object per supported file in
        ``demo_files`` (empty when nothing was uploaded).

    Raises:
        FileNotFoundError: If ``demo_files`` does not exist or is not a
            directory.
    """
    folder_name = "Box_Weaviate_Demo_Folder"

    # Fail fast on the local precondition before touching the Box account.
    demo_dir = Path("demo_files")
    if not demo_dir.is_dir():
        raise FileNotFoundError("Please create a 'demo_files' directory with example files.")

    # NOTE(review): parent id "0" is presumably the account's root folder; confirm.
    parent = CreateFolderParent(id="0")
    folder = client.folders.create_folder(name=folder_name, parent=parent)
    print(f"Created folder '{folder_name}' with ID: {folder.id}")

    file_objects = []
    for file_path in demo_dir.iterdir():
        if not file_path.is_file():
            continue  # sub-directories and other non-files are ignored silently
        if not is_supported_file_type(file_path.name):
            print(f"Skipped {file_path.name} - unsupported file type.")
            continue

        with open(file_path, "rb") as file_content:
            uploaded_files = client.uploads.upload_file(
                attributes={"name": file_path.name, "parent": {"id": folder.id}},
                file=file_content
            )
        # The upload response wraps the new file in an `entries` list.
        uploaded_file = uploaded_files.entries[0]
        file_objects.append(uploaded_file)
        print(f"Uploaded {file_path.name} with ID: {uploaded_file.id}")

    if not file_objects:
        print("Warning: No supported files were uploaded from demo_files directory.")

    return file_objects

# Upload the demo files; the returned file objects are the input to the
# text-extraction/import step.
uploaded_files = create_and_populate_folder(box_client)

### Step 6: Extract Text and Import into Weaviate
Batch import the data into Weaviate, where the `text2vec_weaviate` vectorizer will automatically generate embeddings.

In [None]:
def clean_up_text(content: str) -> str:
    """Normalize extracted document text before it is chunked.

    The passes run in this order:
      1. Rejoin words that were hyphenated across a line break.
      2. Delete a fixed list of unwanted patterns (literal backslash-n
         sequences, em-dash runs, escaped unicode code points).
      3. Remove whitespace around a hyphen that sits between word characters.
      4. Collapse every run of whitespace into a single space.
    """
    # Applied one after another (not as a single alternation) so that each
    # removal sees the result of the previous one.
    removal_patterns = (
        "\\\\n",
        "  —",
        "——————————",
        "—————————",
        "—————",
        r'\\\\u[\dA-Fa-f]{4}',
        r'\\uf075',
        r'\\uf0b7',
    )

    cleaned = re.sub(r'(\w+)-\n(\w+)', r'\1\2', content)
    for regex in removal_patterns:
        cleaned = re.sub(regex, "", cleaned)
    cleaned = re.sub(r'(\w)\s*-\s*(\w)', r'\1-\2', cleaned)
    return re.sub(r'\s+', ' ', cleaned)

def get_file_text_content(file, max_retries=5, delay=5):
    """Download and clean the extracted-text representation of a Box file.

    The file's representations are polled up to ``max_retries`` times, with
    ``delay`` seconds between attempts, until an ``extracted_text`` entry is
    listed. Its content is then downloaded and passed through
    ``clean_up_text``.

    Args:
        file: Box file object; ``file.id`` is used for lookups and
            ``file.name`` for the error message.
        max_retries: Maximum number of polling attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The cleaned text content of the file.

    Raises:
        ValueError: If no ``extracted_text`` representation shows up within
            ``max_retries`` attempts.
        requests.HTTPError: If the content download returns an error status.
    """
    import time  # local import: the notebook's import cell does not load `time`

    # Built once, outside the retry loop.
    # NOTE(review): the x-rep-hints header presumably asks Box to include the
    # extracted-text representation in the response; confirm against Box docs.
    special_client = box_client.with_extra_headers(
        extra_headers={"x-rep-hints": "[extracted_text]", "x-box-ai-library": "weaviate"}
    )

    for attempt in range(max_retries):
        # Keep the caller's `file` untouched: this lookup only requests the
        # `representations` field, so other attributes such as the name are
        # not reliable on it.
        file_info = special_client.files.get_file_by_id(file.id, fields=["representations"])
        representations = file_info.representations
        entries = representations.entries if representations and representations.entries else []

        for rep in entries:
            if rep.representation != "extracted_text":
                continue  # other representation kinds are not an error
            download_url = rep.content.url_template.replace("{+asset_path}", "")
            # Send the token in a header rather than the query string so it
            # does not end up in URLs that get logged or cached.
            response = requests.get(
                download_url,
                headers={"Authorization": f"Bearer {box_client.auth.token}"},
                timeout=60,
            )
            response.raise_for_status()
            return clean_up_text(response.text)

        if attempt < max_retries - 1:
            print(f"Text representation not ready for file {file.id}; retrying in {delay}s")
            time.sleep(delay)

    raise ValueError(f"Text representation not ready for {file.name} after {max_retries} attempts.")

def chunk_text(text, chunk_size=4000, overlap=200):
    """Split *text* into consecutive chunks that overlap by *overlap* characters.

    Each chunk starts ``chunk_size - overlap`` characters after the previous
    one. Splitting stops as soon as a chunk reaches the end of the text, so no
    trailing chunk consisting only of already-covered overlap is produced.

    Args:
        text: The string to split.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Number of characters shared by neighbouring chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        The list of chunks in order (empty for empty input).

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. An
            overlap >= chunk_size would otherwise never advance.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    step = chunk_size - overlap
    text_length = len(text)
    chunks = []
    for start in range(0, text_length, step):
        end = start + chunk_size
        chunks.append(text[start:end])
        if end >= text_length:
            break  # this chunk already covers the tail of the text
    return chunks

def extract_and_import(files):
    """Extract text from each Box file, chunk it and batch-import it into Weaviate.

    Chunks whose ``(file_id, chunk_index)`` pair is already stored in the
    "Documents" collection are skipped, so re-running the cell does not create
    duplicates. A failure on one file is reported and does not stop the
    remaining files.

    Args:
        files: Iterable of Box file objects exposing ``id``, ``name`` and
            ``created_at``.
    """
    collection = weaviate_client.collections.get("Documents")

    # Collect the keys of every stored chunk. The collection iterator walks
    # the whole collection, so deduplication is not capped by a fixed limit.
    existing_chunks = set()
    try:
        for obj in collection.iterator(return_properties=["file_id", "chunk_index"]):
            existing_chunks.add((obj.properties["file_id"], obj.properties["chunk_index"]))
    except Exception as e:
        # Best effort: without the existing keys we still import, at the risk
        # of duplicates.
        print(f"Warning: Could not fetch existing objects for deduplication: {e}")

    with collection.batch.dynamic() as batch:
        for file in files:
            try:
                text = get_file_text_content(file)
                for i, chunk in enumerate(chunk_text(text)):
                    if (file.id, i) in existing_chunks:
                        continue  # Skip if chunk already exists
                    batch.add_object(properties={
                        "file_id": file.id,
                        "file_name": file.name,
                        "chunk_index": i,
                        "content": chunk,
                        "created_date": file.created_at
                    })
            except Exception as e:
                print(f"Error processing {file.name}: {e}")

    # Batch errors are collected by the client instead of being raised, so
    # surface them here rather than dropping them silently.
    failed_objects = collection.batch.failed_objects
    if failed_objects:
        print(f"Warning: {len(failed_objects)} chunk(s) failed to import into Weaviate.")

# Extract text from every uploaded Box file and load the chunks into Weaviate.
extract_and_import(uploaded_files)

### Step 7: Search with Weaviate Query Agent
Ask a question and get a response based on the imported content

In [None]:
# Initialize Query Agent
query_agent = QueryAgent(
    client=weaviate_client,
    collections=["Documents"],
    system_prompt="Provide concise answers based on the data in the Documents collection."
)

# Define your query
query = "Based on Google and Apple's reports, which company has the most revenue?"

# Use Query Agent to get an answer
response = query_agent.run(query)

# Print the response
response.display()