In [None]:
!pip install openai

In [None]:
!pip install chromadb

In [None]:
import pandas as pd
import numpy as np
import pandas as pd
import chromadb
from chromadb.config import Settings
import openai

In [None]:
# Initialize ChromaDB client with persistence
# NOTE(review): PersistentClient's `path` is presumably a local filesystem
# directory; confirm that this "gs://" URI really resolves to the bucket
# rather than creating a local directory with that literal name.
client = chromadb.PersistentClient(path="gs://project-yelp/reviews_rag_chroma_reviews_db")
# Create or load a ChromaDB collection
# store_embeddings_in_chromadb (defined later) writes to this module-level
# `collection`, so this cell has to run before that function is called.
collection = client.get_or_create_collection(name="reviews_embeddings_collection")

In [None]:
# Load the review data; the relative path is resolved against the current
# working directory.
df = pd.read_csv('concatenated_reviews.csv')
# Preview the first rows as the cell output.
df.head()

In [None]:
# Show the (rows, columns) dimensions of the loaded frame.
df.shape

In [None]:
# Treat every column except the last one as metadata.
# NOTE(review): this assumes the text column ("concatenated_reviews", passed
# to store_embeddings_in_chromadb below) is the last column — confirm.
# The list must contain "business_id": store_embeddings_in_chromadb pops that
# key from each row's metadata to use as the record id.
metadata_col = list(df)[:-1]
print(metadata_col)

In [None]:
# NOTE(review): the API key is hard-coded here (this value looks like a
# placeholder); prefer loading the real key from the environment or a secret
# store rather than committing it to the notebook.
openai.api_key = "lorem_ipsum"
# NOTE(review): if the line below is ever re-enabled, it would rebind `client`
# and shadow the ChromaDB client created earlier in this notebook.
# client = OpenAI()
# Function to chunk text
def chunk_text_with_overlap(text, token_limit=1000, overlap=50):
    """Split *text* into overlapping chunks of whitespace-delimited words.

    "Tokens" here are the words produced by ``str.split()``, not model
    tokens.

    Args:
        text: String to split.
        token_limit: Maximum number of words per chunk; must be positive.
        overlap: Number of trailing words of one chunk that are repeated at
            the start of the next; must satisfy ``0 <= overlap < token_limit``.

    Returns:
        The chunk strings in document order. Empty when *text* contains no
        words.

    Raises:
        ValueError: If ``token_limit`` is not positive or ``overlap`` is
            outside ``[0, token_limit)``; such values would keep the window
            from advancing (an endless loop) or silently drop words.
    """
    if token_limit <= 0:
        raise ValueError("token_limit must be a positive integer")
    if not 0 <= overlap < token_limit:
        raise ValueError("overlap must satisfy 0 <= overlap < token_limit")

    words = text.split()
    step = token_limit - overlap
    chunks = []

    for start in range(0, len(words), step):
        end = start + token_limit
        chunks.append(" ".join(words[start:end]))

        # Once a window reaches the end of the text, any further window would
        # contain only words already covered by this chunk, so stop here
        # instead of emitting a redundant tail chunk.
        if end >= len(words):
            break

    return chunks

# Function to generate embeddings for a chunk
def get_embeddings(text):
    """Return the embedding of *text* as a float64 NumPy array.

    Sends *text* as the ``input`` of one ``text-embedding-ada-002`` request
    through ``openai.embeddings.create`` and converts the first entry of the
    response's ``data`` list into an array.
    """
    request = {"model": "text-embedding-ada-002", "input": text}
    result = openai.embeddings.create(**request)
    first_item = result.data[0]
    return np.array(first_item.embedding, dtype=np.float64)

# Function to aggregate embeddings
def aggregate_embeddings(embeddings, method="mean"):
    """Collapse a sequence of embedding vectors into a single array.

    ``"mean"`` and ``"max"`` reduce element-wise across the vectors (axis 0),
    while ``"concat"`` joins the vectors end to end along axis 0.

    Raises:
        ValueError: If *method* is not one of the three supported names.
    """
    # Checked in the same order as the supported names are listed above.
    reducers = (
        ("mean", np.mean),
        ("max", np.max),
        ("concat", np.concatenate),
    )
    for name, reduce_fn in reducers:
        if method == name:
            return reduce_fn(embeddings, axis=0)
    raise ValueError("Unsupported aggregation method!")

# Function to process text: Chunk -> Embed -> Aggregate
def process_text_for_embeddings(text, token_limit=1024, aggregation_method="mean"):
    """Embed *text* as one vector: chunk it, embed each chunk, aggregate.

    Args:
        text: String to embed.
        token_limit: Per-chunk word budget, forwarded to
            ``chunk_text_with_overlap``.
        aggregation_method: Aggregation name, forwarded to
            ``aggregate_embeddings``.

    Returns:
        The aggregated embedding produced by ``aggregate_embeddings``.

    Raises:
        ValueError: If *text* yields no chunks (empty or whitespace-only
            input), or if ``aggregate_embeddings`` rejects the method.
    """
    # Step 1: Chunk the text
    chunks = chunk_text_with_overlap(text, token_limit=token_limit)

    if not chunks:
        # With nothing to embed, np.mean over an empty list would return NaN
        # (a scalar, not a vector) while np.max / np.concatenate would raise.
        # Fail uniformly with the exception type the aggregation step already
        # uses so callers handle every method the same way.
        raise ValueError("Cannot embed empty text: no chunks were produced.")

    # Step 2: Generate embeddings for each chunk
    chunk_embeddings = [get_embeddings(chunk) for chunk in chunks]

    # Step 3: Aggregate embeddings
    return aggregate_embeddings(chunk_embeddings, method=aggregation_method)

def store_embeddings_in_chromadb(df, text_col, metadata_col):
    """Embed each row's text and add it to the module-level ``collection``.

    For every row of *df*, the value in *text_col* is normalised to a string
    (missing values become ``""``), embedded with the ``"max"`` aggregation,
    and added to ``collection`` together with the row's *metadata_col*
    values. The ``business_id`` metadata entry is removed from the metadata
    and used, as a string, for the record id.

    A ``ValueError`` raised while processing a row is printed and the loop
    moves on to the next row; other exceptions propagate.
    """
    for _, record in df.iterrows():
        raw_text = record[text_col]
        meta_series = record[metadata_col]

        # Normalise the document text: keep strings, blank out missing
        # values, stringify everything else.
        if isinstance(raw_text, str):
            document = raw_text
        elif pd.isna(raw_text):
            document = ""
        else:
            document = str(raw_text)

        meta = meta_series.to_dict()
        doc_id = str(meta.pop("business_id"))

        try:
            vector = process_text_for_embeddings(
                document,
                token_limit=1024,
                aggregation_method="max",
            )
            collection.add(
                documents=[document],
                embeddings=[vector],
                metadatas=[meta],
                ids=[doc_id],
            )
            print(f"Successfully stored embedding for {doc_id}.")
        except ValueError as err:
            print(f"Error processing {doc_id}: {err}")


In [None]:
# Embed the "concatenated_reviews" column of every row and store it in the
# collection, attaching the columns listed in metadata_col as metadata.
store_embeddings_in_chromadb(df, "concatenated_reviews", metadata_col)