In [None]:
# Perplexity script with enhancements
import hashlib
import json
import os
import time

import numpy as np
import pandas as pd
import spacy
import yake
from dotenv import load_dotenv
from langchain.text_splitter import TokenTextSplitter
from openai import OpenAI
from pinecone import Pinecone, ServerlessSpec
from tqdm import tqdm

# =========================
# CONFIGURATION
# =========================
# Load variables from a .env file (if one is found) before reading settings.
load_dotenv()

# Credentials and locations; each falls back to a placeholder when the
# corresponding environment variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "YOUR_OPENAI_API_KEY")
pinecone_api_key = os.environ.get("PINECONE_API_KEY", "YOUR_PINECONE_API_KEY")
index_name = os.environ.get("PINECONE_INDEX", "YOUR_INDEX_NAME")
data_folder = os.environ.get("DATA_DIR", r"YOUR_LOCAL_DATA_DIRECTORY")
# Default model is text-embedding-3-large (3072 dims for accuracy).
embedding_model = os.environ.get("OPENAI_EMBEDDING_MODEL", "text-embedding-3-large")
pinecone_environment = os.environ.get("PINECONE_ENVIRONMENT", "us-east-1")

# Number of chunks per embedding/upsert batch, and the batch index to start
# from (used by the upload loop's range).
batch_size = int(os.environ.get("BATCH_SIZE", "100"))
start_batch = int(os.environ.get("START_BATCH", "0"))

# =========================
# INIT CLIENTS
# =========================
# OpenAI client used for embedding requests.
client = OpenAI(api_key=openai_api_key)
# Pinecone client used to list/create indexes and open the target index.
pc = Pinecone(api_key=pinecone_api_key)
nlp = spacy.load("en_core_web_sm")  # NER model
# Keyword extractor configured to return the top 10 candidates per text.
kw_extractor = yake.KeywordExtractor(top=10, stopwords=None)

# The index dimension must match the length of the vectors that are upserted,
# so derive it from the configured embedding model instead of hard-coding it.
# Unknown models fall back to the previous fixed value of 3072.
known_embedding_dimensions = {
    "text-embedding-3-large": 3072,
    "text-embedding-3-small": 1536,
    "text-embedding-ada-002": 1536,
}
index_dimension = known_embedding_dimensions.get(embedding_model, 3072)

# Create Pinecone index if not exists
index_list = [idx["name"] for idx in pc.list_indexes()]
if index_name not in index_list:
    print(f"Creating index '{index_name}'...")
    pc.create_index(
        name=index_name,
        dimension=index_dimension,
        metric="cosine",
        # The Pinecone-class SDK takes cloud/region placement through `spec`,
        # not as bare `cloud=` / `region=` keyword arguments.
        spec=ServerlessSpec(cloud="aws", region=pinecone_environment),
    )
    # Poll every 2 seconds until the new index reports that it is ready.
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(2)
else:
    print(f"Using existing index: {index_name}")

# Handle used by the upload loop for upserts.
index = pc.Index(index_name)

# =========================
# HELPERS
# =========================
def normalize_text(s: str) -> str:
    """Return *s* with unified newlines, trimmed lines and no blank lines.

    ``\\r\\n`` and lone ``\\r`` are converted to ``\\n``; every line is stripped
    of surrounding whitespace; lines that are empty after stripping are
    dropped. The remaining lines are joined with ``\\n``.
    """
    unified = s.replace("\r\n", "\n").replace("\r", "\n")
    kept_lines = []
    for raw_line in unified.split("\n"):
        trimmed = raw_line.strip()
        if trimmed:
            kept_lines.append(trimmed)
    return "\n".join(kept_lines)

def read_txt(file_path: str) -> str | None:
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            return normalize_text(f.read())
    except:
        return None

def read_json(file_path: str) -> str | None:
    try:
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            data = json.load(f)
        preferred_keys = {"text", "content", "context", "body", "chunk", "page_content", "data", "message"}
        strings = []

        def walk(o, parent_key=None):
            if isinstance(o, dict):
                for k, v in o.items():
                    walk(v, k)
            elif isinstance(o, list):
                for item in o:
                    walk(item, parent_key)
            elif isinstance(o, str) and o.strip():
                if parent_key and parent_key.lower() in preferred_keys:
                    strings.append(o)
                else:
                    strings.append(o)

        walk(data)
        return normalize_text("\n".join(strings)) if strings else None
    except:
        return None

def read_tsv(file_path: str) -> str | None:
    try:
        df = pd.read_csv(file_path, sep="\t", dtype=str, encoding="utf-8", error_bad_lines=False)
        text = "\n".join(df.fillna("").astype(str).agg(" ".join, axis=1))
        return normalize_text(text)
    except:
        return None

def iter_data_files(root_dir: str):
    """Yield the path of every .txt, .json or .tsv file under *root_dir*.

    The directory tree is walked with ``os.walk``; the extension check is
    case-insensitive. Each yielded path is the walked directory joined with
    the file name.
    """
    wanted_suffixes = (".txt", ".json", ".tsv")
    for current_dir, _subdirs, file_names in os.walk(root_dir):
        yield from (
            os.path.join(current_dir, file_name)
            for file_name in file_names
            if file_name.lower().endswith(wanted_suffixes)
        )

def extract_entities(text: str):
    """Return the distinct entity strings that the spaCy pipeline finds in *text*.

    Duplicates are removed while keeping first-occurrence order, so the
    resulting list is the same from run to run (a plain ``set`` would order
    strings by their per-process hash).
    """
    doc = nlp(text)
    return list(dict.fromkeys(ent.text for ent in doc.ents))

def extract_keywords(text: str):
    """Return the keyword strings YAKE extracts from *text*.

    The extractor yields 2-item pairs; only the first item of each pair is
    kept (the second is presumably the keyword's score and is discarded).
    """
    keywords = []
    for phrase, _ignored in kw_extractor.extract_keywords(text):
        keywords.append(phrase)
    return keywords

def normalize_vector(vec):
    """Scale *vec* to unit Euclidean length and return it as a list of floats.

    The input is converted to a float32 array first. A vector whose norm is
    zero cannot be scaled; it is returned unchanged instead of being divided
    by zero, which would produce NaN values.

    Args:
        vec: Sequence of numbers.

    Returns:
        A list of Python floats.
    """
    arr = np.asarray(vec, dtype=np.float32)
    norm = np.linalg.norm(arr)
    if norm == 0:
        return arr.tolist()
    return (arr / norm).tolist()

def get_embeddings_batch(texts, retries=3):
    """Embed *texts* with the configured OpenAI model, retrying on failure.

    Args:
        texts: Inputs to embed in a single request.
        retries: Maximum number of attempts.

    Returns:
        A list with one embedding per item of the response data, an empty
        list when *texts* is empty, or ``None`` when every attempt failed.
    """
    if not texts:
        # Nothing to embed; avoid spending the retries on a request with
        # no input.
        return []

    for attempt in range(retries):
        try:
            resp = client.embeddings.create(model=embedding_model, input=texts)
            return [d.embedding for d in resp.data]
        except Exception as e:
            # Retry boundary: any failure of the request is reported and
            # retried rather than aborting the whole upload.
            print(f"Embedding batch failed: {e}")
            # Exponential backoff (1s, 2s, 4s, ...), but only when another
            # attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(2 ** attempt)
    return None

# =========================
# CHUNKING
# =========================
# Token-based splitter: 800-token chunks with a 100-token overlap, counted
# with the cl100k_base encoding.
splitter = TokenTextSplitter(chunk_size=800, chunk_overlap=100, encoding_name="cl100k_base")

all_chunks = []      # records ready for embedding: {"id", "text", "metadata"}
seen_hashes = set()  # SHA-256 digests of chunks already kept, across all files

# Reader for each supported suffix, tried in this order.
readers_by_suffix = (
    (".txt", read_txt),
    (".json", read_json),
    (".tsv", read_tsv),
)

print(f"Scanning dataset in {data_folder}...")
# The file list is materialized before being handed to tqdm (presumably so
# the progress bar knows the total).
for file_path in tqdm(list(iter_data_files(data_folder))):
    lowered_path = file_path.lower()
    reader = next(
        (fn for suffix, fn in readers_by_suffix if lowered_path.endswith(suffix)),
        None,
    )
    if reader is None:
        continue

    text = reader(file_path)
    # Readers return None (or empty text) for files with nothing usable.
    if not text:
        continue

    for i, piece in enumerate(splitter.split_text(text)):
        chunk = piece.strip()
        if not chunk:
            continue

        # Identical chunk text is stored only once, whichever file it came from.
        h = hashlib.sha256(chunk.encode("utf-8")).hexdigest()
        if h in seen_hashes:
            continue
        seen_hashes.add(h)

        record = {
            # File name plus the first 12 hex digits of the content hash.
            "id": f"{os.path.basename(file_path)}-{h[:12]}",
            "text": chunk,
            "metadata": {
                # Path relative to the data folder, with forward slashes.
                "source": os.path.relpath(file_path, data_folder).replace("\\", "/"),
                "chunk_index": i,
                # Extension without the leading dot.
                "file_type": os.path.splitext(file_path)[1][1:],
                "entities": extract_entities(chunk),
                "keywords": extract_keywords(chunk),
                "chunk_length": len(chunk),
            },
        }
        all_chunks.append(record)

print(f"Prepared {len(all_chunks)} chunks.")

# =========================
# EMBEDDING + UPLOAD
# =========================
# Ceiling division: number of batches needed to cover every chunk.
total_batches = (len(all_chunks) + batch_size - 1) // batch_size
uploaded_vectors = 0

# Iteration starts at `start_batch`, so earlier batches are not processed.
for idx in range(start_batch, total_batches):
    first = idx * batch_size
    batch = all_chunks[first : first + batch_size]

    embeddings = get_embeddings_batch([entry["text"] for entry in batch])
    if not embeddings:
        # All embedding attempts failed; this batch is left out and the
        # loop moves on to the next one.
        print("Skipping batch due to embedding failure.")
        continue

    # One record per chunk; the chunk text is stored in the metadata
    # alongside the other fields.
    vectors = [
        {
            "id": entry["id"],
            "values": normalize_vector(embedding),
            "metadata": dict(entry["metadata"], text=entry["text"]),
        }
        for entry, embedding in zip(batch, embeddings)
    ]

    try:
        index.upsert(vectors=vectors)
        uploaded_vectors += len(vectors)
        print(f"Uploaded {len(vectors)} vectors (Total: {uploaded_vectors})")
    except Exception as e:
        # An upsert failure stops the whole run; the message reports the
        # 1-based batch number.
        print(f"Failed to upsert batch {idx+1}: {e}")
        break

print(f"Total vectors uploaded: {uploaded_vectors}")


ModuleNotFoundError: No module named 'numpy'