In [1]:
%load_ext dotenv
%dotenv ../../05_src/.secrets

In [2]:
from langchain_community.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter

from openai import OpenAI

import chromadb

import json
import os


In [3]:
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY not found in environment variables")
else:
    client = OpenAI(api_key = OPENAI_API_KEY)

def get_embedding(text, model="text-embedding-ada-002"):
    text = text.replace("\n", " ")
    if model == "sentence_transformers":
        from sentence_transformers import SentenceTransformer
        model = SentenceTransformer('all-MiniLM-L6-v2')
        return model.encode(text).tolist()
    
    return client.embeddings.create(input=[text], model=model).data[0].embedding


In [4]:
chroma_client = chromadb.Client() 

collection_name = "reviews"

try:
    # Attempt to get the collection
    collection = chroma_client.get_collection(name=collection_name)
    # If successful, the collection exists, so delete it
    chroma_client.delete_collection(name=collection_name)
    print(f"Collection '{collection_name}' deleted successfully.")
except Exception as e:
    # If get_collection raises a ValueError, the collection does not exist
    print(f"Collection '{collection_name}' does not exist. No action taken.")

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="reviews")

Collection 'reviews' does not exist. No action taken.


## Data Loader

+ Use JSONLoader.

In [5]:
def get_metadata(record:dict, metadata: dict) -> dict:
    metadata['reviewid'] = record.get('reviewid')
    return metadata

loader = JSONLoader("../../05_src/documents/pitchfork_content.jsonl", 
                    jq_schema=".",
                    content_key="content",
                    json_lines=True,
                    text_content=True,
                    metadata_func=get_metadata)

In [6]:
data = loader.load()

In [7]:
data[0].to_json()

{'lc': 1,
 'type': 'constructor',
 'id': ['langchain', 'schema', 'document', 'Document'],
 'kwargs': {'metadata': {'source': 'C:\\Users\\JesusCalderon\\work\\dsi_deploying_ai\\05_src\\documents\\pitchfork_content.jsonl',
   'seq_num': 1,
   'reviewid': 22703},
  'page_content': "Trip-hop eventually became a 90s punchline, a music-press shorthand for overhyped hotel lounge music. But today, the much-maligned subgenre almost feels like a secret precedent. Listen to any of the canonical Bristol-scene albums of the mid-late 90s, when the genre was starting to chafe against its boundaries, and youd think the claustrophobic, anxious 21st century started a few years ahead of schedule. Looked at from the right angle, trip-hopis part of an unbroken chain that runs from the abrasion of 80s post-punk to the ruminative pop-R&B-dance fusion of the moment.The best of it has aged far more gracefully (and forcefully) than anything recorded in the waning days of the record industrys pre-filesharing mon

In [None]:
data

## Create Chunks

In [None]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = 1000, 
    chunk_overlap=200, 
    length_function = len, 
    add_start_index = True
)

In [None]:
chunks = text_splitter.split_documents(data)

In [None]:
print(f'Split {len(data)} reviews (documents) into {len(chunks)} chunks.' )

In [None]:
chunks[0].to_json()

In [None]:
chunks

## Batch Embedding and Insertion

[Documentation](https://platform.openai.com/docs/guides/batch)
[API Reference](https://platform.openai.com/docs/api-reference/batch)

In [None]:
chunks[0].page_content

In [None]:
chunks[0].metadata

In [None]:
def prep_batch_file_for_embedding(input:list, output_path:str, max_lines_per_file:int=10000):
    total_lines = len(input)
    num_files = (total_lines // max_lines_per_file) + 1
    print(f'Total lines: {total_lines}, Number of files to create: {num_files}')

    for num_file in range(num_files):
        start_index = num_file * max_lines_per_file
        end_index = min(start_index + max_lines_per_file, total_lines)
        output_file = os.path.join(output_path, f"pitchfork_reviews_batch_{num_file+1}.jsonl")
        print(f'Creating file: {output_file} with lines from {start_index} to {end_index-1}')
        create_single_batch_file(input, start_index, end_index, output_file)

def create_single_batch_file(input, start_index, end_index, output_file):
    with open(output_file, 'w') as outfile:
        for line in input[start_index:end_index]:
            custom_id = (
                    str(line.metadata['reviewid']) + "_" + 
                    str(line.metadata['seq_num']) + "_" + 
                    str(line.metadata['start_index'])
                )
            content = line.page_content
            out_dict = {
                    "custom_id": custom_id, 
                    "method": "POST", 
                    "url": "/v1/embeddings", 
                    "body": {
                        "model": "text-embedding-3-small", 
                        "input": content
                    }
                }
            outfile.write(json.dumps(out_dict) + '\n')
        
            

In [None]:
prep_batch_file_for_embedding(
    input=chunks, 
    output_path='../../05_src/documents/'
)

In [9]:
from glob import glob

batch_files = glob('../../05_src/documents/pitchfork_reviews_batch_*.jsonl')
batch_files

['../../05_src/documents\\pitchfork_reviews_batch_1.jsonl',
 '../../05_src/documents\\pitchfork_reviews_batch_2.jsonl',
 '../../05_src/documents\\pitchfork_reviews_batch_3.jsonl']

In [10]:
from openai import OpenAI
from tqdm import tqdm
client = OpenAI()


for b_file in tqdm(batch_files):
    batch_input_file = client.files.create(
        file=open(b_file, "rb"), 
        purpose='batch'
    )
    print(batch_input_file)

 33%|███▎      | 1/3 [00:05<00:11,  5.78s/it]

FileObject(id='file-WhSUccy96odJzMfoRuM2i7', bytes=47726616, created_at=1760290155, filename='pitchfork_reviews_batch_1.jsonl', object='file', purpose='batch', status='processed', expires_at=1762882155, status_details=None)


 33%|███▎      | 1/3 [00:06<00:12,  6.28s/it]


KeyboardInterrupt: 

In [11]:
from openai import OpenAI
client = OpenAI()

my_files = client.files.list().to_dict()
file_ids = [file['id'] for file in my_files['data']]

In [12]:
file_ids

['file-WhSUccy96odJzMfoRuM2i7',
 'file-FTq9RsT65xWZyrVQExttJ2',
 'file-NFcTFR5oAt2GCU7mr1vgZr',
 'file-2xdCe69XzkfQG4kcpbAusw',
 'file-6iM8nbyPQFybjTfNoqXVj3']

In [13]:
client = OpenAI()

for file_id in tqdm(file_ids):
    client.batches.create(
            input_file_id = file_id,
            endpoint="/v1/embeddings",
            completion_window="24h",
            metadata={
                "description": "Pitchfork reviews content embeddings"
            }
        )

100%|██████████| 5/5 [00:02<00:00,  2.39it/s]


In [None]:
my_batches = client.batches.list().to_dict()
batches_ids = [batch['id'] for batch in my_batches['data']]

In [None]:
batches_ids

In [None]:
for batch_id in tqdm(batches_ids):
    batch = client.batches.retrieve(batch_id)
    print(f'batch id: {batch.to_dict()['id']}')
    print(f'batch status: {batch.to_dict()['status']}')    
    print(f'batch request counts: {batch.to_dict()['request_counts']}')
    print(f'output file id: {batch.to_dict()['output_file_id']}')
    print("\n\n")

In [None]:
file_response = client.files.content(batch.to_dict()['output_file_id'])


In [None]:
text_response = file_respose.text

In [None]:
for lines in file_response.text.split("\n"):
    if lines.strip():
        record = json.loads(lines)
        embedding = record['embedding']
        metadata = {
            "custom_id": record['custom_id']
        }
        collection.add(
            documents=[record['input']],
            embeddings=[embedding],
            metadatas=[metadata],
            ids=[record['custom_id']]
        )

In [None]:
print(batch.to_dict()['output_file_id'])

In [None]:
def query_chromadb(query, top_n=2):
    query_embedding = get_embedding(query)
    results = collection.query(
        query_embeddings = [query_embedding],
        n_results = top_n
    )
    return [(id, score, text) for id, score, text in zip(results['ids'][0], results['distances'][0], results['documents'][0])]

In [None]:
collection.add(
    embeddings = embeddings,
    documents = documents, 
    ids = ids
)

In [None]:
query_chromadb("Are bats blind?", top_n=3)

# Tokenization

In [None]:
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size = 100, chunk_overlap=10
)

In [None]:
documents = text_splitter.split_documents(data)

In [None]:
documents = [doc for doc in documents]

In [None]:
from langchain.vectorstores import Chroma

db = Chroma.from_documents(documents, OpenAIEmbeddings())

In [None]:
def query_documents(query, top_n=2):
    docs = db.similarity_search(query, top_n)
    return docs

In [None]:
query_documents("Are bats blind?")