In [25]:
import os
import json
import time
import hashlib
from langchain.text_splitters import RecursiveCharacterTextSplitter
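# NOTE(review): a try/except ImportError fallback used to wrap the import above
# and was commented out. If that import fails, the older module path below may
# be the right one -- verify against the installed langchain version:
#     from langchain.text_splitter import RecursiveCharacterTextSplitter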
from openai import OpenAI
from pinecone import Pinecone
from dotenv import load_dotenv

load_dotenv()

# Configuration
# Every setting is read from the environment; the second argument to
# os.getenv() is the fallback used when the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
pinecone_api_key = os.getenv("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY")
index_name = os.getenv("PINECONE_INDEX", "YOUR_INDEX_NAME")
data_folder = os.getenv("DATA_DIR", r"YOUR_LOCAL_DATASET_DIRECTORY_PATH")
embedding_model = os.getenv("OPENAI_EMBEDDING_MODEL", "text-embedding-3-small")  # 1536 dims

# Resume Support
# start_batch is the 0-based index of the first batch to upload; batch_size is
# the number of chunks embedded and upserted per request.
start_batch = int(os.getenv("START_BATCH", "0"))  # Can be overridden via env
batch_size = int(os.getenv("BATCH_SIZE", "100"))  # Batch embeddings to reduce API calls

# Fail fast on values the upload loop cannot use. batch_size is the step of
# the upload range(): zero raises only after all chunks were prepared, and a
# negative step silently uploads nothing. A negative start_batch produces a
# negative slice start and therefore an empty batch.
if start_batch < 0:
    raise ValueError(f"START_BATCH must be >= 0, got {start_batch}")
if batch_size <= 0:
    raise ValueError(f"BATCH_SIZE must be > 0, got {batch_size}")

# Initialize Clients
# OpenAI client; get_embeddings_batch() calls client.embeddings.create() on it.
client = OpenAI(api_key=openai_api_key)
# Pinecone client and a handle to the index named by index_name; the upload
# loop calls index.upsert() on this handle.
pc = Pinecone(api_key=pinecone_api_key)
index = pc.Index(index_name)

# -------- Helpers --------
def normalize_text(s: str) -> str:
    """Return *s* with unified newlines, trimmed lines and no blank lines.

    CRLF and lone CR are converted to LF, every line is stripped of
    surrounding whitespace, and lines that end up empty are dropped so the
    resulting text is compact.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    trimmed = (piece.strip() for piece in unified.split("\n"))
    # filter(None, ...) discards the empty strings left by blank lines.
    return "\n".join(filter(None, trimmed))

def read_txt(file_path: str) -> str | None:
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            text = f.read()
        text = normalize_text(text)
        return text if text else None
    except Exception as e:
        print(f"⚠️ Failed to read TXT {file_path}: {e}")
        return None

def extract_strings_from_json(obj) -> list[str]:
    """Collect the string values found anywhere inside a parsed JSON value.

    Strings stored under one of the preferred keys (directly, or as items of
    a list stored under such a key) are returned if any exist; otherwise all
    remaining strings are returned. Values are reported in depth-first
    document order. Non-string scalars are ignored.
    """
    wanted_keys = {"text", "content", "context", "body", "chunk", "page_content"}

    def leaves(node, key=None):
        # Yield (owning_key, string) pairs; list items inherit the key of the
        # list that contains them.
        if isinstance(node, str):
            yield key, node
        elif isinstance(node, dict):
            for child_key, child in node.items():
                yield from leaves(child, child_key)
        elif isinstance(node, list):
            for child in node:
                yield from leaves(child, key)

    hits, rest = [], []
    for key, value in leaves(obj):
        bucket = hits if key in wanted_keys else rest
        bucket.append(value)
    return hits or rest

def read_json(file_path: str) -> str | None:
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            data = json.load(f)
        strings = extract_strings_from_json(data)
        if not strings:
            return None
        text = normalize_text("\n".join(strings))
        return text if text else None
    except Exception as e:
        print(f"⚠️ Failed to read JSON {file_path}: {e}")
        return None

def iter_data_files(root_dir: str):
    """Yield the paths of all ``.txt`` and ``.json`` files under *root_dir*.

    The tree is walked top-down in sorted order (directories and file names),
    so repeated runs over the same data yield the same sequence. Resuming an
    upload by batch number relies on that stable order. The extension match
    is case-insensitive. A missing *root_dir* yields nothing.

    Args:
        root_dir: Directory to search recursively.

    Yields:
        Paths built by joining the walked directory with each matching name.
    """
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place makes os.walk descend into subdirectories in
        # sorted order as well.
        dirnames.sort()
        for name in sorted(filenames):
            if name.lower().endswith((".txt", ".json")):
                yield os.path.join(dirpath, name)

# Chunk Preparation
# Split every readable data file into overlapping chunks and keep one record
# per distinct chunk text (deduplicated by SHA-1 across all files).
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,      # slightly larger chunks for better throughput
    chunk_overlap=100,   # moderate overlap for context continuity
)

all_chunks = []
seen_hashes = set()

for file_path in iter_data_files(data_folder):
    try:
        # iter_data_files only yields .txt and .json, so non-.txt means JSON.
        load = read_txt if file_path.lower().endswith(".txt") else read_json
        text = load(file_path)
        if not text:
            continue

        for piece in splitter.split_text(text):
            chunk = piece.strip()
            if not chunk:
                continue

            h = hashlib.sha1(chunk.encode("utf-8")).hexdigest()
            if h in seen_hashes:
                # Same text already recorded (possibly from another file).
                continue
            seen_hashes.add(h)

            chunk_id = f"{os.path.basename(file_path)}-{h[:12]}"
            chunk_meta = {
                # Path relative to the data folder, with forward slashes.
                "source": os.path.relpath(file_path, data_folder).replace("\\", "/"),
                "hash": h
            }
            all_chunks.append({"id": chunk_id, "text": chunk, "metadata": chunk_meta})
    except Exception as e:
        print(f"⚠️ Skipping {file_path}: {e}")

if all_chunks:
    print(f"Prepared {len(all_chunks)} unique chunks from {data_folder}")
else:
    print("⚠️ No chunks prepared. Check your data directory and file formats.")

# -------- Embeddings (batched) --------
def get_embeddings_batch(texts: list[str], retries: int = 3, delay: int = 5) -> list | None:
    for attempt in range(retries):
        try:
            resp = client.embeddings.create(
                model=embedding_model,
                input=texts
            )
            # Align by index
            return [d.embedding for d in resp.data]
        except Exception as e:
            print(f"⚠️ Embedding batch failed (attempt {attempt + 1}): {e}")
            if attempt < retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
    print("❌ Giving up on this embedding batch.")
    return None

# -------- Upload to Pinecone in Batches with Resume Support --------
# Batches are numbered from 0 internally (START_BATCH uses that numbering) and
# from 1 in the progress messages. Every abort therefore prints the exact
# START_BATCH value to use, so the failed batch is not skipped on resume.
# NOTE(review): the uploaded metadata holds only "source" and "hash"; the
# chunk text itself is not sent to Pinecone -- confirm that is intended.
total_batches = (len(all_chunks) + batch_size - 1) // batch_size
for i in range(start_batch * batch_size, len(all_chunks), batch_size):
    batch_index = i // batch_size
    batch = all_chunks[i:i + batch_size]
    print(f"Uploading batch {batch_index + 1} / {total_batches} (size={len(batch)})")

    texts = [item["text"] for item in batch]
    embeddings = get_embeddings_batch(texts)

    if embeddings is None:
        print(f"❌ Skipping upload for batch {batch_index + 1} due to embedding failure.")
        print("❗ Stopping script to prevent data inconsistency. Update START_BATCH to resume.")
        print(f"   Resume with START_BATCH={batch_index}")
        break

    if len(embeddings) != len(batch):
        # zip() below would silently drop the unmatched chunks.
        print(f"❌ Got {len(embeddings)} embeddings for {len(batch)} chunks in batch {batch_index + 1}.")
        print("❗ Stopping script to prevent data inconsistency. Update START_BATCH to resume.")
        print(f"   Resume with START_BATCH={batch_index}")
        break

    vectors = [
        {
            "id": item["id"],
            "values": emb,
            "metadata": item["metadata"]
        }
        for item, emb in zip(batch, embeddings)
        if emb is not None
    ]

    if not vectors:
        print("⚠️ No vectors in this batch to upload.")
        continue

    try:
        index.upsert(vectors=vectors)
        print(f"✅ Uploaded {len(vectors)} vectors.")
    except Exception as e:
        print(f"❌ Failed to upload batch {batch_index + 1}: {e}")
        print("❗ Stopping script to prevent data loss. Please resume by updating START_BATCH.")
        print(f"   Resume with START_BATCH={batch_index}")
        break

    time.sleep(0.5)  # gentle pacing to reduce rate-limit risk
else:
    # Reached only when the loop was not aborted by one of the breaks above,
    # so the success message is no longer printed after a failure.
    print("✅ Script execution completed.")

ModuleNotFoundError: No module named 'langchain'