### Installation

In [None]:
# Install the packages this notebook imports. Each %pip line runs separately
# and targets the environment of the running kernel.

# Core LlamaIndex framework (llama_index.core: readers, index, settings).
%pip install llama-index
# LlamaParse client; -U upgrades an already-installed copy.
%pip install -U llama-parse
# MongoDB Atlas vector store integration (MongoDBAtlasVectorSearch).
%pip install llama-index-vector-stores-mongodb
# OpenAI LLM integration (imported below).
%pip install llama-index-llms-openai
# Cohere LLM integration (used as Settings.llm below).
%pip install llama-index-llms-cohere
# Cohere embedding integration (used as Settings.embed_model below).
%pip install llama-index-embeddings-cohere
# Cohere rerank postprocessor.
# NOTE(review): not referenced by the visible cells - confirm it is still needed.
%pip install llama-index-postprocessor-cohere-rerank

### Set up API keys

In [None]:
import os
from getpass import getpass

# Credentials the rest of the notebook reads from the environment:
#   LLAMA_CLOUD_API_KEY - LlamaParse / LlamaCloud key
#   COHERE_API_KEY      - Cohere key (embeddings and LLM)
#   MONGO_URI           - MongoDB Atlas connection string
REQUIRED_SECRETS = ("LLAMA_CLOUD_API_KEY", "COHERE_API_KEY", "MONGO_URI")

for secret_name in REQUIRED_SECRETS:
    # Keep a value that is already exported; only prompt for missing or empty
    # ones. getpass hides the input, so secrets never end up in the notebook
    # source or its saved output.
    if not os.environ.get(secret_name):
        os.environ[secret_name] = getpass(f"Enter {secret_name}: ")


In [None]:
# llama-parse is async-first; running its synchronous API inside a notebook
# requires nest_asyncio.
import nest_asyncio

# Applied up front, before the parsing cell further down runs.
nest_asyncio.apply()

# MongoDB driver, used to open the Atlas connection and manage the search index.
import pymongo

# LlamaIndex pieces used by the cells below: Atlas vector store, LlamaParse
# parser, directory reader, index/storage context and global settings.
from llama_index.vector_stores.mongodb import MongoDBAtlasVectorSearch
from llama_parse import LlamaParse
from llama_index.core import SimpleDirectoryReader
from llama_index.core import VectorStoreIndex, StorageContext
from llama_index.core.settings import Settings
# NOTE(review): OpenAI is imported but the visible cells configure Cohere as
# the LLM - confirm this import is still needed.
from llama_index.llms.openai import OpenAI

### Parse the document using `LlamaParse`.

In [None]:
# As of January 2025 the Python SDK accepts only one language per parser;
# several languages can be selected through the web UI instead.
# Premium mode is enabled for this parser.
parser_options = {
    "result_type": "markdown",
    "premium_mode": True,
    "language": "ch_tra",
}
parser = LlamaParse(**parser_options)

# Route .docx files through LlamaParse when SimpleDirectoryReader loads them.
file_extractor = {".docx": parser}

reader = SimpleDirectoryReader(
    input_files=["path to your directory or file"],
    file_extractor=file_extractor,
)
documents = reader.load_data()
print(documents)

# Alternative: call the parser directly instead of going through the reader:
# documents = LlamaParse(result_type="text").load_data(file_path)

In [None]:
from llama_index.embeddings.cohere import CohereEmbedding
from llama_index.llms.cohere import Cohere

# Both Cohere clients share the same key.
cohere_api_key = os.environ["COHERE_API_KEY"]

# Embedding model configured for the document side ("search_document").
embed_model = CohereEmbedding(
    model_name="embed-multilingual-v3.0",
    input_type="search_document",
    api_key=cohere_api_key,
)

# Register the embedding model and the LLM as the global LlamaIndex defaults.
Settings.embed_model = embed_model
Settings.llm = Cohere(api_key=cohere_api_key, model="command-r-plus")

### Create `MongoDBAtlasVectorSearch`.

In [None]:
# Open a client against the cluster named by MONGO_URI.
mongo_uri = os.environ["MONGO_URI"]
mongodb_client = pymongo.MongoClient(mongo_uri)

# Database, collection and search-index names the vector store works with.
atlas_store_options = {
    "db_name": "llamaindex_db",
    "collection_name": "llamaparse",
    "vector_index_name": "vector_index",
}
atlas_vector_store = MongoDBAtlasVectorSearch(mongodb_client, **atlas_store_options)

# Storage context that points LlamaIndex at the Atlas vector store.
vector_store_context = StorageContext.from_defaults(vector_store=atlas_vector_store)

### Create Index and Query Engine.

In [None]:
# Build the index from the parsed documents, backed by the Atlas storage
# context, showing a progress bar while it runs.
index_options = {
    "storage_context": vector_store_context,
    "show_progress": True,
}
vector_store_index = VectorStoreIndex.from_documents(documents, **index_options)

In [None]:
import time

from pymongo.operations import SearchIndexModel

SEARCH_INDEX_NAME = "vector_index"
SEARCH_INDEX_TIMEOUT_S = 300  # give up waiting for the index after this long
SEARCH_INDEX_POLL_S = 5  # delay between status checks

# Specify the collection for which to create the index
collection = mongodb_client["llamaindex_db"]["llamaparse"]

# Create your index model: a cosine-similarity vector index over the
# "embedding" field.
# NOTE(review): numDimensions is expected to match the output size of the
# embedding model configured earlier - confirm if that model changes.
search_index_model = SearchIndexModel(
    definition={
        "fields": [
            {
                "type": "vector",
                "path": "embedding",
                "numDimensions": 1024,
                "similarity": "cosine",
            }
        ]
    },
    name=SEARCH_INDEX_NAME,
    type="vectorSearch",
)

# Only create the index when it does not exist yet, so the cell can be re-run
# without failing on a duplicate index name.
if not list(collection.list_search_indexes(SEARCH_INDEX_NAME)):
    collection.create_search_index(model=search_index_model)

# Atlas builds search indexes asynchronously. Block until the index reports
# itself as queryable, otherwise the retrieval cells can run against an index
# that is not ready yet.
deadline = time.monotonic() + SEARCH_INDEX_TIMEOUT_S
while True:
    index_status = list(collection.list_search_indexes(SEARCH_INDEX_NAME))
    if index_status and index_status[0].get("queryable") is True:
        break
    if time.monotonic() >= deadline:
        raise TimeoutError(
            f"Search index {SEARCH_INDEX_NAME!r} was not queryable after "
            f"{SEARCH_INDEX_TIMEOUT_S} seconds"
        )
    time.sleep(SEARCH_INDEX_POLL_S)

print(f"Search index {SEARCH_INDEX_NAME!r} is ready to query.")


### Test Query

In [None]:
# Smoke test: fetch the three closest nodes for a sample question and print
# each one.
query_text = "What is the document about"
retriever = vector_store_index.as_retriever(similarity_top_k=3)

nodes = retriever.retrieve(query_text)
for node in nodes:
    print(node)

In [None]:
# Instantiate Atlas Vector Search as a retriever
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
import pprint

# Query-side embedding model: same Cohere model as used for indexing, but with
# input_type="search_query" instead of "search_document".
embed_model = CohereEmbedding(
    api_key=os.environ["COHERE_API_KEY"],
    model_name="embed-multilingual-v3.0",
    input_type="search_query",
)
Settings.embed_model = embed_model

# Build the retriever only after the query-side model exists, and pass it
# explicitly. A retriever picks up its embedding model when it is constructed,
# so changing Settings afterwards would leave it on the document-side model.
vector_store_retriever = VectorIndexRetriever(
    index=vector_store_index,
    similarity_top_k=5,
    embed_model=embed_model,
)

# Pass the retriever into the query engine
query_engine = RetrieverQueryEngine(retriever=vector_store_retriever)

# Prompt the LLM
response = query_engine.query("What is the document about")

print(response)
print("\nSource documents: ")
pprint.pprint(response.source_nodes)