# PrivateQ

## Set-up

Import necessary libraries and configurations

In [None]:
# Set up
import os
# Add the parent directory to the module search path
# (presumably so the local `privateq` package can be imported -- confirm against the repo layout)
import sys; sys.path.append("..")
# Suppress all Python warnings for the rest of the session
import warnings; warnings.filterwarnings("ignore")

# Load variables from a .env file into the process environment;
# later cells read OPENAI_API_BASE, OPENAI_API_KEY and DOCS_DIR from os.environ
from dotenv import load_dotenv; load_dotenv()

import ray

In [None]:
from privateq.config import ROOT_DIR

# Set up ray with credentials and start ray
# The OpenAI settings are copied from the local environment into the Ray
# runtime environment; indexing os.environ raises KeyError if one is missing.
openai_env_vars = {
    var_name: os.environ[var_name]
    for var_name in ("OPENAI_API_BASE", "OPENAI_API_KEY")
}
ray_runtime_env = {
    "env_vars": openai_env_vars,
    "working_dir": str(ROOT_DIR),
}
ray.init(runtime_env=ray_runtime_env)

# Show resources
ray.cluster_resources()

## Data Processing

Load the documents, split them into sections, chunk the sections, and embed the chunks.

In [None]:
# Import model parameters
from privateq.config import EMBEDDING_DIMENSIONS, MAX_CONTENT_LENGTH

# Report both configured values, one per line
model_params_report = f'EMBEDDING_DIMENSIONS: {EMBEDDING_DIMENSIONS}\nMAX_CONTENT_LENGTH: {MAX_CONTENT_LENGTH}'
print(model_params_report)

### Load Data

In [None]:
# Load Data using Ray

from pathlib import Path
from privateq.config import FILE_DIR

# Find documents
# DOCS_DIR comes from the environment and is joined onto FILE_DIR. Fail with a
# clear message when it is unset, instead of the TypeError that Path() raises
# when handed None.
docs_dir_setting = os.environ.get("DOCS_DIR")
if docs_dir_setting is None:
    raise RuntimeError("DOCS_DIR environment variable is not set.")
DOCS_DIR = Path(FILE_DIR, docs_dir_setting)
print(f'DOCS_DIR: {DOCS_DIR}')
# Raise explicitly rather than assert, so the check is not stripped under `python -O`.
if not DOCS_DIR.exists():
    raise FileNotFoundError(f'{DOCS_DIR} does not exist.')

# Collect every *.html path under DOCS_DIR (skipping directories) and load them as a ray dataset
ds = ray.data.from_items([{"path": path} for path in DOCS_DIR.rglob("*.html") if not path.is_dir()])
print(f"{ds.count()} documents")

### Extract Data
Extract the content of each document in the dataset: identify the sections in each HTML page and extract the text between them. The results are saved as a list of dictionaries, each mapping the text of a section to its URL, including the section's anchor ID.

In [None]:
from privateq.data_process import extract_sections
import matplotlib.pyplot as plt

# Sample extraction process
# sample_html_fp = Path(FILE_DIR, "docs.ray.io/en/master/rllib/rllib-env.html")
# extract_sections({"path": sample_html_fp})[0]

# Extract sections from dataset
# flat_map is used because extract_sections is applied per document and its
# results are flattened into one dataset of sections.
sections_ds = ds.flat_map(extract_sections)
section_count = sections_ds.count()
print(f'Sections Count: {section_count}')

In [None]:
# Plot section lengths to observe the dataset
# take_all() pulls every section into local memory; lengths are measured in characters.
section_lengths = [len(record["text"]) for record in sections_ds.take_all()]

plt.figure(figsize=(12, 3))
plt.title("Section Lengths")
plt.ylabel("# of characters")
plt.plot(section_lengths, marker='x', color='g')
plt.show()

### Chunk Data

In [None]:
# Chunk sectionalized data

from functools import partial
from privateq.data_process import chunk_section

# Chunking parameters passed to chunk_section
chunk_size = 300
chunk_overlap = 50

# Chunk a sample section
# (RecursiveCharacterTextSplitter is not imported in the cells shown here;
#  import it before uncommenting this sample)
# text_splitter = RecursiveCharacterTextSplitter(
#     separators=["\n\n", "\n", " ", ""],
#     chunk_size=chunk_size,
#     chunk_overlap=chunk_overlap,
#     length_function=len)
# sample_section = sections_ds.take(1)[0]
# chunks = text_splitter.create_documents(
#     texts=[sample_section["text"]], 
#     metadatas=[{"source": sample_section["source"]}])
# print (chunks[0])

# Chunk all data using ray for scalability
# Bind the chunking parameters once so flat_map receives a one-argument callable.
section_chunker = partial(chunk_section, chunk_size=chunk_size, chunk_overlap=chunk_overlap)
chunks_ds = sections_ds.flat_map(section_chunker)
chunk_count = chunks_ds.count()
print(f"{chunk_count} chunks")
chunks_ds.show(1)

### Embed Data

In [None]:
import torch

# Report the installed torch version
torch_version = torch.__version__
print("Torch version:", torch_version)

# Report whether torch can see a CUDA device
cuda_ready = torch.cuda.is_available()
print("Is CUDA enabled?", cuda_ready)

In [None]:
# Import embedding libraries
from privateq.embed_chunks import EmbedChunks

# Embed chunks
embedding_model_name = "thenlper/gte-base"
# Keyword arguments handed to the EmbedChunks constructor
embedder_kwargs = {"model_name": embedding_model_name}
# Tune gpu number to suit environment
embedded_chunks = chunks_ds.map_batches(
    EmbedChunks,
    fn_constructor_kwargs=embedder_kwargs,
    batch_size=100,
    num_gpus=1,
    concurrency=1,
)

In [None]:
# Sample one to check
sample = embedded_chunks.take(1)
first_row = sample[0]
# Print the length of the embedding vector, then the chunk text it was computed from
print ("embedding size:", len(first_row["embeddings"]))
print (first_row["text"])

In [None]:
# Shutdown Ray to clean resources
# (shuts down the Ray runtime started by ray.init() earlier in this notebook)
ray.shutdown()