# 02. Index

- Index all the data downloaded into `_data/*`
- Persist the index somewhere. It is currently written to the filesystem under `_data/index_store`; storing it in MongoDB is still an open question.


In [None]:
%load_ext autoreload
%autoreload 2

from dotenv import load_dotenv
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
from llama_index.readers.file.base import DEFAULT_FILE_READER_CLS
from llama_index.llms import OpenAI
from llama_index.vector_stores.docarray import DocArrayHnswVectorStore
from llama_index.storage.index_store import MongoIndexStore
from llama_index.storage.storage_context import StorageContext
from llama_index.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser, SentenceWindowNodeParser
from llama_index.query_engine import CitationQueryEngine
from llama_index.embeddings import OpenAIEmbedding
from llama_index.vector_stores.types import ExactMatchFilter, MetadataFilters
from tai_index import pil_wmf_stub_loader
from tqdm import tqdm
import glob
import openai
import os
from IPython.display import Markdown, display
import pymongo
import logging
import json

# Load settings from a local .env file (python-dotenv), then hand the OpenAI
# key to the openai module. The key is None when the variable is not set.
load_dotenv()
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [None]:
# Root directory of the downloaded course data; each course is read from
# f"{DATA_DIR}/course-<course_id>/".
DATA_DIR = "./_data"

In [None]:
# Service context: the LLM, embedding model and node parser used for indexing.

# Nodes come from the sentence-window parser. The token-based alternative
# (a TokenTextSplitter passed to SimpleNodeParser.from_defaults) is not used.
node_parser = SentenceWindowNodeParser()
llm = OpenAI(model="gpt-4")
embed_model = OpenAIEmbedding(embed_batch_size=128)

service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
)

In [None]:
# Configure the llama-index StorageContext.
# No stores are passed in, so whatever StorageContext.from_defaults() picks by
# default is used (NOTE(review): presumably in-memory stores — confirm). This
# is the object handed to the index build and later persisted to disk.

storage_context = StorageContext.from_defaults()

In [None]:
# Configure the LlamaIndex readers

from llama_index.readers.base import BaseReader
from llama_index.readers.file.docs_reader import DocxReader, HWPReader, PDFReader
from llama_index.readers.file.epub_reader import EpubReader
from llama_index.readers.file.image_reader import ImageReader
from llama_index.readers.file.ipynb_reader import IPYNBReader
from llama_index.readers.file.markdown_reader import MarkdownReader
from llama_index.readers.file.mbox_reader import MboxReader
from llama_index.readers.file.slides_reader import PptxReader
from llama_index.readers.file.tabular_reader import PandasCSVReader
from llama_index.readers.file.video_audio_reader import VideoAudioReader
from llama_hub.file.unstructured.base import UnstructuredReader
from llama_index.schema import Document

from tai_index.pptx_reader import PptxReaderNoCaption

# Maps a file extension to the reader instance used for files of that type.
# The dict is passed to SimpleDirectoryReader as `file_extractor`, and its
# keys as `required_exts`.
# Several imported readers (HWPReader, ImageReader, PptxReader,
# VideoAudioReader) are intentionally or incidentally not registered here.
file_readers = {
    ".pdf": PDFReader(),
    ".docx": DocxReader(),
    # Project-local reader from tai_index, used instead of the stock PptxReader.
    ".pptx": PptxReaderNoCaption(),
    ".csv": PandasCSVReader(),
    ".epub": EpubReader(),
    ".md": MarkdownReader(),
    ".mbox": MboxReader(),
    ".ipynb": IPYNBReader(),
    # HTML goes through llama_hub's UnstructuredReader.
    ".html": UnstructuredReader(),
}

In [None]:
def file_info(filename, course_id, source_index):
    """Build the metadata dict attached to a document loaded from *filename*.

    Args:
        filename: Key used to look the file up in ``source_index``
            (presumably the path the directory reader passes to its
            ``file_metadata`` callback -- verify against the index format).
        course_id: Identifier of the course the file belongs to.
        source_index: Mapping of filename to a dict with ``"name"`` and
            ``"link"`` entries, or ``None`` when the course has no index.

    Returns:
        ``{"course_id": ...}``, extended with ``"name"`` and ``"link"`` when
        the file has an entry in ``source_index``.
    """
    info = {"course_id": course_id}
    if source_index is None:
        return info

    # A file that is missing from the index should not abort the whole load;
    # it simply gets course-level metadata only.
    entry = source_index.get(filename)
    if entry is None:
        return info

    info["name"] = entry["name"]
    info["link"] = entry["link"]
    return info

In [None]:
# Courses to index. Commented-out IDs are kept so they can be re-enabled by hand.
course_ids = [
    "352034",
    # '322048',
    # '272942',
    # "226700",
    # '208522',
    # '130432',
]
display("Course IDs:", course_ids)

# Load data from each course into a single document list.
documents = []
for course_id in (bar := tqdm(course_ids, desc="Load Course Data")):
    course_dir = f"{DATA_DIR}/course-{course_id}"

    # The source index is optional: without it, documents only carry course_id.
    source_index = None
    try:
        with open(f"{course_dir}/source_index.json", "r") as file:
            source_index = json.load(file)
    except FileNotFoundError:
        print("No source index found for course", course_id)
    except (OSError, ValueError) as err:
        # Unreadable or malformed index (json.JSONDecodeError is a ValueError):
        # carry on without it instead of aborting the load, but say why.
        print("Could not read source index for course", course_id, "-", err)

    bar.set_description(f"{course_id=}")

    bar.set_postfix_str("Reading")
    # The lambda reads course_id/source_index when the reader calls it; the
    # reader is created and consumed within this iteration, so it sees this
    # course's values.
    reader = SimpleDirectoryReader(
        f"{course_dir}/",
        recursive=True,
        file_extractor=file_readers,
        required_exts=list(file_readers),
        file_metadata=lambda filename: file_info(filename, course_id, source_index),
    )

    bar.set_postfix_str("Loading")
    documents.extend(reader.load_data())

In [None]:
# Build the vector index from the loaded documents.
# service_context supplies the LLM, embedding model and node parser configured
# earlier; storage_context is the object that gets persisted afterwards.
# NOTE(review): presumably this embeds every node through the OpenAI API, so
# it may be slow and incur cost for large courses — confirm before re-running.
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
    service_context=service_context,
    storage_context=storage_context,
)

In [None]:
# Write the storage context to disk next to the course data. The path is
# derived from DATA_DIR so it follows the data directory if that changes.
storage_context.persist(f"{DATA_DIR}/index_store")