In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer
import numpy as np
import time
import os

In [4]:
# Input CSV of articles and the on-disk location used for the embedding cache.
DATA_PATH = "D:/tldr-bot/data/CNN_Articels_clean.csv"
EMBEDDING_PATH = "saved_index/article_embeddings.npy"

# Step 1: read the CSV and collect the article bodies, timing the whole step.
print("Loading and preparing data...")
start_time = time.time()

try:
    df = pd.read_csv(DATA_PATH)
except FileNotFoundError:
    # Fall back to an empty list so the rest of the script can test
    # `article_texts` for truthiness instead of crashing on a missing file.
    print(f"   > Error: File not found at {DATA_PATH}")
    article_texts = []
else:
    # Rows with no article body are discarded before extracting the texts.
    df = df.dropna(subset=["Article text"])
    article_texts = df["Article text"].tolist()
    print(f"   > Loaded {len(article_texts)} articles.")

# The elapsed time is reported on both the success and the failure path.
end_time = time.time()
print(f"   > Done in {end_time - start_time:.2f}s.")


# Step 2: obtain one embedding vector per article, reusing the on-disk cache
# when it is still valid. Skipped entirely when no articles were loaded.
if article_texts:
    print("\nGenerating or Loading Embeddings...")
    step2_start_time = time.time()

    # `None` means "no usable cache yet"; it is replaced either by the cached
    # array or by freshly computed embeddings below.
    embeddings = None

    if os.path.exists(EMBEDDING_PATH):
        print("   > Found existing embeddings file, loading...")
        embeddings = np.load(EMBEDDING_PATH)
        # A cache produced from a different version of the CSV would silently
        # pair articles with the wrong vectors, so it is only accepted when it
        # is a 2-D matrix with exactly one row per loaded article.
        if embeddings.ndim != 2 or embeddings.shape[0] != len(article_texts):
            print(
                f"   > Cached embeddings shape {embeddings.shape} does not match "
                f"{len(article_texts)} articles. Regenerating..."
            )
            embeddings = None
    else:
        print("   > No embeddings file found. Creating new embeddings...")

    if embeddings is None:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        embeddings = model.encode(article_texts, show_progress_bar=True)
        # os.makedirs("") raises, so only create the parent directory when
        # EMBEDDING_PATH actually has a directory component.
        embedding_dir = os.path.dirname(EMBEDDING_PATH)
        if embedding_dir:
            os.makedirs(embedding_dir, exist_ok=True)
        np.save(EMBEDDING_PATH, embeddings)
        print(f"   > Embeddings saved to {EMBEDDING_PATH}")

    print(f"   > Embeddings ready with shape: {embeddings.shape}")
    step2_end_time = time.time()
    print(f"   > Done in {step2_end_time - step2_start_time:.2f}s.")

Loading and preparing data...
   > Loaded 4076 articles.
   > Done in 0.34s.

Generating or Loading Embeddings...
   > Found existing embeddings file, loading...
   > Embeddings ready with shape: (4076, 384)
   > Done in 0.02s.
