In [2]:
import pandas as pd 
import chromadb
import os

# Directory of the persistent ChromaDB store (relative to this notebook).
chroma_path = "../chroma_db"

# Load the train/test frames from pickle files.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
df_train = pd.read_pickle("../data/train.pkl")
df_test = pd.read_pickle("../data/test.pkl")



In [3]:
# Preview the first rows of the training frame to confirm the columns used later (id, label, text_normalized, embedding) are present.
df_train.head()

Unnamed: 0,text,label,label_text,text_normalized,id,embedding
0,Wall St. Bears Claw Back Into the Black (Reute...,2,Business,wall st bears claw back black reuters reuters ...,train_0,"[0.07367804646492004, -0.02631131373345852, 0...."
1,Carlyle Looks Toward Commercial Aerospace (Reu...,2,Business,carlyle looks toward commercial aerospace reut...,train_1,"[-0.0068568033166229725, -0.06906827539205551,..."
2,Oil and Economy Cloud Stocks' Outlook (Reuters...,2,Business,oil economy cloud stocks outlook reuters reute...,train_2,"[-0.01750737801194191, -0.0378166139125824, 0...."
3,Iraq Halts Oil Exports from Main Southern Pipe...,2,Business,iraq halts oil exports main southern pipeline ...,train_3,"[-0.023455876857042313, -0.012443509884178638,..."
4,"Oil prices soar to all-time record, posing new...",2,Business,oil prices soar alltime record posing new mena...,train_4,"[-0.07173541933298111, -0.022663453593850136, ..."


In [4]:
# Open the on-disk ChromaDB store located at `chroma_path`.
client = chromadb.PersistentClient(path=chroma_path)

# Reset both collections so this cell can be re-run from a clean state.
# Each collection is dropped in its own try block: if one of them does not
# exist, the other one must still be removed, otherwise the matching
# create_collection() call below can fail on a stale collection.
for stale_name in ("news_train", "news_test"):
    try:
        client.delete_collection(name=stale_name)
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that Ctrl-C
        # still interrupts the cell. The exact error type raised for a
        # missing collection is not pinned here; the reason is printed.
        print(f"collection '{stale_name}' was not deleted: {exc}")
    else:
        print(f"collection '{stale_name}' was deleted")

# metadata={"hnsw:space": "cosine"} tells Chroma to use the cosine metric for search
collection_train = client.create_collection(name="news_train", metadata={"hnsw:space": "cosine"})
collection_test = client.create_collection(name="news_test", metadata={"hnsw:space": "cosine"})


In [5]:
# We will use a loop (batching) to avoid crashing the computer. We map the DataFrame columns to the 3 ChromaDB parts (ids, embeddings, metadatas)

from tqdm import tqdm

def add_to_collection(collection, df, batch_size=2000):
    """Insert the rows of ``df`` into ``collection`` in fixed-size batches.

    Rows are sent in consecutive slices of at most ``batch_size`` rows so the
    whole frame is never handed to the collection in a single call.

    Args:
        collection: Object exposing ``name`` and
            ``add(ids=..., embeddings=..., metadatas=...)``.
        df: DataFrame with the columns ``id``, ``embedding``, ``label`` and
            ``text_normalized``.
        batch_size: Maximum number of rows per ``collection.add`` call.
            Must be a positive integer.

    Returns:
        None. An empty ``df`` results in no ``collection.add`` calls.

    Raises:
        ValueError: If ``batch_size`` is zero or negative.
    """
    # A negative step would make the range empty and silently insert nothing;
    # a zero step fails inside range() with an unrelated-looking message.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    total_rows = len(df)
    batch_starts = range(0, total_rows, batch_size)

    for start in tqdm(batch_starts, desc=f'add to {collection.name}'):
        # iloc slicing clamps at the end of the frame, so the last batch may be shorter.
        batch = df.iloc[start:start + batch_size]

        collection.add(
            ids=batch["id"].tolist(),
            embeddings=batch["embedding"].tolist(),
            # One {"label": ..., "text_normalized": ...} dict per row.
            metadatas=batch[["label", "text_normalized"]].to_dict("records"),
        )
    


In [6]:
# Ingest the training split first, then the test split, announcing each step.
ingestion_plan = [
    ("Starting ingestion for Training Data...", collection_train, df_train),
    ("Starting ingestion for Testing Data...", collection_test, df_test),
]

for banner, target_collection, source_frame in ingestion_plan:
    print(banner)
    add_to_collection(target_collection, source_frame)

Starting ingestion for Training Data...


add to news_train: 100%|██████████| 60/60 [34:45<00:00, 34.76s/it]


Starting ingestion for Testing Data...


add to news_test: 100%|██████████| 4/4 [01:08<00:00, 17.01s/it]


In [7]:
# Report how many items each collection holds after ingestion.
count_train = collection_train.count()
count_test = collection_test.count()
print(
    f"Total items in Training Collection: {count_train}",
    f"Total items in Test Collection:     {count_test}",
    sep="\n",
)

# Fetch one stored record and show its id together with the metadata saved alongside it.
first_item = collection_train.peek(limit=1)

print("\n--- Sample Item ---")
sample_id = first_item["ids"][0]
print(f"ID: {sample_id}")
sample_metadata = first_item["metadatas"][0]
print(f"Label: {sample_metadata['label']}")
print(f"Text Snippet: {sample_metadata['text_normalized'][:100]}...")


Total items in Training Collection: 120000
Total items in Test Collection:     7600

--- Sample Item ---
ID: train_0
Label: 2
Text Snippet: wall st bears claw back black reuters reuters shortsellers wall streets dwindlingband ultracynics se...
