# PrivateQ

## Set-up

Import the required libraries and load configuration from the environment.

In [1]:
# Set up: imports plus the import-time side effects the rest of the notebook relies on.
import os
# Put the parent directory on the module search path (presumably so the local
# `privateq` package imports without being installed -- confirm repo layout).
import sys; sys.path.append("..")
# Silence every warning for the whole kernel session; remove while debugging.
import warnings; warnings.filterwarnings("ignore")

# Load variables from a .env file into the process environment; later cells
# read OPENAI_API_BASE, OPENAI_API_KEY and DOCS_DIR from os.environ.
from dotenv import load_dotenv; load_dotenv()

import ray

In [2]:
from privateq.config import ROOT_DIR

# Start Ray, forwarding the OpenAI credentials to the workers through the
# runtime environment and using ROOT_DIR as their working directory.
# A missing credential fails right here with a KeyError naming the variable.
# ignore_reinit_error=True keeps this cell re-runnable: without it, running the
# cell a second time in the same kernel raises because Ray is already
# initialised. On a re-run the already-running instance is kept as is.
ray.init(
    runtime_env={
        "env_vars": {
            "OPENAI_API_BASE": os.environ["OPENAI_API_BASE"],
            "OPENAI_API_KEY": os.environ["OPENAI_API_KEY"],
        },
        "working_dir": str(ROOT_DIR),
    },
    ignore_reinit_error=True,
)

# Show resources
ray.cluster_resources()

2023-12-31 19:59:19,670	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265
2023-12-31 19:59:19,687	INFO packaging.py:530 -- Creating a file package for local directory '/Users/tiluan/git/privateq'.
2023-12-31 19:59:19,699	INFO packaging.py:358 -- Pushing file package 'gcs://_ray_pkg_ce909f245981bd35.zip' (0.04MiB) to Ray cluster...
2023-12-31 19:59:19,701	INFO packaging.py:371 -- Successfully pushed file package 'gcs://_ray_pkg_ce909f245981bd35.zip'.


{'object_store_memory': 1580767641.0,
 'node:127.0.0.1': 1.0,
 'CPU': 8.0,
 'node:__internal_head__': 1.0,
 'memory': 3161535284.0}

## Data Preparation

Load, chunk, embed and index data

In [3]:
# Model parameters come from the project config; echo them for reference
from privateq.config import (
    EMBEDDING_DIMENSIONS,
    MAX_CONTENT_LENGTH,
)

print(f'EMBEDDING_DIMENSIONS: {EMBEDDING_DIMENSIONS}\nMAX_CONTENT_LENGTH: {MAX_CONTENT_LENGTH}')

EMBEDDING_DIMENSIONS: 768
MAX_CONTENT_LENGTH: 8192


In [4]:
# Load Data using Ray

from pathlib import Path
from privateq.config import FILE_DIR

# Find documents. DOCS_DIR must be provided through the environment; checking
# it explicitly avoids passing None to Path(), which would fail with an
# unrelated TypeError instead of pointing at the missing variable.
docs_dir_setting = os.environ.get("DOCS_DIR")
if docs_dir_setting is None:
    raise KeyError("DOCS_DIR is not set; define it in the environment or the .env file")

# The configured value is joined onto FILE_DIR from the project config.
DOCS_DIR = Path(FILE_DIR, docs_dir_setting)
print(f'DOCS_DIR: {DOCS_DIR}')
assert DOCS_DIR.exists(), f'{DOCS_DIR} does not exist.'

# Collect every *.html path under DOCS_DIR (recursively), skipping directories,
# and load the paths as a Ray dataset with one {"path": ...} row per document.
ds = ray.data.from_items([{"path": path} for path in DOCS_DIR.rglob("*.html") if not path.is_dir()])
print(f"{ds.count()} documents")

DOCS_DIR: /Users/tiluan/git/.tmp/docs.ray.io/en/master
1967 documents


In [9]:
# Shut down the Ray instance started by ray.init() in the set-up section so its
# resources are released once the notebook is finished.
ray.shutdown()