# 🧠 01_rag_index_reddit_posts_chunked.ipynb
Build a FAISS vector store from the latest cleaned Reddit posts using chunking.

In [None]:
%pip install faiss-cpu openai pandas python-dotenv tiktoken langchain

In [None]:
import os
import json
import openai
import pandas as pd
import numpy as np
import faiss
from dotenv import load_dotenv
from datetime import datetime
from glob import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [None]:
# ✅ Load environment variables
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")

# Fail fast: without this check a missing key is stored as None and the problem
# only shows up later, at the first embedding request.
if not openai.api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it or add it to a .env file before running this notebook."
    )

In [None]:
# ✅ Get latest cleaned file
cleaned_path = os.path.abspath("../data/cleaned_posts")

# Files are ordered by name, descending. This selects the newest export only if
# the filenames begin with a sortable timestamp — TODO confirm naming scheme.
cleaned_files = sorted(glob(os.path.join(cleaned_path, "*clean_all.json")), reverse=True)

# An empty match list would otherwise surface as a bare IndexError on [0].
if not cleaned_files:
    raise FileNotFoundError(
        f"No '*clean_all.json' files found in {cleaned_path}"
    )

latest_file = cleaned_files[0]
print(f"📁 Using file: {latest_file}")

with open(latest_file, encoding="utf-8") as f:
    all_posts = json.load(f)

print(f"✅ Loaded {len(all_posts)} posts from latest file.")

In [None]:
# ✅ Split into chunks using LangChain text splitter
splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
documents = []
metadatas = []

for post in tqdm(all_posts):
    # Build one text blob per post: title, body, then each comment body on its own line.
    header = f"{post['title']}\n{post['selftext']}\n\nComments:\n"
    comment_text = "\n".join(comment['body'] for comment in post.get('comments', []))
    pieces = splitter.split_text(header + comment_text)

    documents.extend(pieces)
    # One metadata dict per chunk, so documents[i] and metadatas[i] always
    # describe the same chunk.
    metadatas.extend(
        {
            "id": post['id'],
            "subreddit": post.get("subreddit", ""),
            "created": post.get("created_local", ""),
            "url": post["url"],
        }
        for _ in pieces
    )

In [None]:
# ✅ Embed using OpenAI Ada model
EMBEDDING_MODEL = "text-embedding-ada-002"
EMBEDDING_BATCH_SIZE = 100


def get_embedding(text):
    """Return the embedding vector for a single text.

    Newlines are replaced with spaces before the text is sent to the API.
    """
    return openai.Embedding.create(input=[text.replace("\n", " ")], model=EMBEDDING_MODEL)['data'][0]['embedding']


def get_embeddings(texts, batch_size=EMBEDDING_BATCH_SIZE):
    """Return one embedding per entry of *texts*, in the same order.

    Texts are sent in batches of *batch_size* so that the number of HTTP
    requests is len(texts) / batch_size instead of one request per text.
    """
    vectors = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = [text.replace("\n", " ") for text in texts[start:start + batch_size]]
        response = openai.Embedding.create(input=batch, model=EMBEDDING_MODEL)
        # Each returned item carries the position of its input; sort on it so the
        # vectors stay aligned with `texts` (and therefore with `metadatas`).
        ordered = sorted(response['data'], key=lambda item: item['index'])
        vectors.extend(item['embedding'] for item in ordered)
    return vectors


# An empty list would produce a 1-D array and an obscure shape error in normalize().
if not documents:
    raise ValueError("No document chunks to embed; check that the cleaned posts file is not empty.")

embeddings = get_embeddings(documents)
# Build the float32 matrix directly instead of creating a float64 array and casting it.
embeddings_np = normalize(np.asarray(embeddings, dtype="float32"))

In [None]:
# ✅ Save FAISS index
# The index dimensionality is the width of the embedding matrix.
embedding_dim = embeddings_np.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(embeddings_np)

# Resolve the output locations up front, creating the directory if needed.
output_dir = os.path.abspath("../data/vectorstore")
os.makedirs(output_dir, exist_ok=True)
index_path = os.path.join(output_dir, "reddit_index.faiss")
metadata_path = os.path.join(output_dir, "reddit_metadata.json")

# The index and its metadata are written side by side; row i of the index
# corresponds to entry i of the metadata list.
faiss.write_index(index, index_path)
with open(metadata_path, "w", encoding="utf-8") as f:
    f.write(json.dumps(metadatas, indent=2))

print("✅ Index and metadata saved.")