In [None]:
!pip install -U llama-index

In [None]:
!pip install -U llama-index-multi-modal-llms-ollama

In [None]:
!pip install llama-index-readers-file
!pip install unstructured
!pip install llama-index-embeddings-huggingface
!pip install llama-index-vector-stores-qdrant
!pip install llama-index-embeddings-clip

In [None]:
from llama_index.multi_modal_llms.ollama import OllamaMultiModal

In [None]:
mm_model = OllamaMultiModal(model='llava:13b')

In [None]:
from pathlib import Path
from llama_index.core import SimpleDirectoryReader
from PIL import Image
import matplotlib.pyplot as plt

In [None]:
# Load image data

input_image_path = Path("img/restaurant_images")
if not input_image_path.exists():
    Path.mkdir(input_image_path)
    
!wget "https://docs.google.com/uc?export=download&id=1GlqcNJhGGbwLKjJK1QJ_nyswCTQ2K2Fq" -O ./img/restaurant_images/fried_chicken.png

image_documents = SimpleDirectoryReader("img/restaurant_images").load_data()

In [None]:
# display image

imageUrl = "./img/restaurant_images/fried_chicken.png"
image = Image.open(imageUrl).convert("RGB")
plt.figure(figsize=(16, 5))
plt.imshow(image)

In [None]:
from pydantic import BaseModel

class Restaurant(BaseModel):
    """Data model for an restaurant."""
    
    restaurant: str
    food: str
    discount: str
    price: str
    rating: str
    review: str

In [None]:
from llama_index.core.program import MultiModalLLMCompletionProgram
from llama_index.core.output_parsers import PydanticOutputParser

prompt_template_str = """\
{query_str}

Return the answer as a Pydantic object. The Pydantic schema is given below:

"""

mm_program = MultiModalLLMCompletionProgram.from_defaults(
    output_parser = PydanticOutputParser(Restaurant),
    image_documents = image_documents,
    prompt_template_str = prompt_template_str,
    multi_modal_llm = mm_model,
    verbose = True,
)

In [None]:
response = mm_program(query_str="Can you summarize what is in the image?")
for res in response:
    print(res)

In [None]:
# Retrieval-Augmented Image Captioning

In [None]:
!wget "https://www.dropbox.com/scl/fi/mlaymdy1ni1ovyeykhhuk/tesla_2021_10k.htm?rlkey=qf9k4zn0ejrbm716j0gg7r802&dl=1" -O ./img/tesla_2021_10k.htm
!wget "https://docs.google.com/uc?export=download&id=1THe1qqM61lretr9N3BmINc_NWDvuthYf" -O ./img/shanghai.jpg

In [2]:
from pathlib import Path
from llama_index.readers.file import UnstructuredReader
from llama_index.core.schema import ImageDocument

In [None]:
# Fix for NLTK lookup error for the next code block

import nltk
nltk.download('averaged_perceptron_tagger')

#nltk.download('all')

In [3]:
loader = UnstructuredReader()
documents = loader.load_data(file=Path("./img/tesla_2021_10k.htm"))
imgage_doc = ImageDocument(image_path="./img/shanghai.jpg")

In [4]:
from llama_index.core import VectorStoreIndex
from llama_index.core.embeddings import resolve_embed_model

embed_model = resolve_embed_model('local:BAAI/bge-m3')
vector_index = VectorStoreIndex.from_documents(
    documents, embed_model=embed_model
)

query_engine = vector_index.as_query_engine()

OutOfMemoryError: CUDA out of memory. Tried to allocate 850.00 MiB. GPU 0 has a total capacity of 21.99 GiB of which 214.12 MiB is free. Process 4161 has 0 bytes memory in use. Including non-PyTorch memory, this process has 21.25 GiB memory in use. Of the allocated memory 20.32 GiB is allocated by PyTorch, and 652.38 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [1]:
import torch, gc
gc.collect()
torch.cuda.empty_cache()