# 🔍 Multimodal RAG System: Pipeline Demo (без UI)

Мета — побудувати мультимодальну RAG-систему на базі статей з The Batch, що включає і текст, і зображення.


In [2]:
## бібліотеки
import requests
from bs4 import BeautifulSoup
from langchain_community.document_loaders import WebBaseLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter # для поділу тексту на шматки
from PIL import Image
from chromadb.utils.embedding_functions import OpenCLIPEmbeddingFunction
import chromadb
import os
import numpy as np
from dotenv import load_dotenv
from google import genai
from google.genai import types

load_dotenv()

api_key = os.getenv("GOOGLE_API")

# from huggingface_hub import InferenceClient # для (типу) хорошої відповіді
# from transformers import BlipProcessor, BlipForConditionalGeneration # для опису зображень
# from chromadb.utils.data_loaders import ImageLoader
# from langchain.embeddings import HuggingFaceEmbeddings # для embeddings - лише текстової

USER_AGENT environment variable not set, consider setting it to identify your requests.


## 1. Завантаження статей та зображень
Витягуємо текст і картинку

In [3]:
# url = "https://www.deeplearning.ai/the-batch/google-upgrades-its-ai-music-tools-for-professional-use/"
urls = ["https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/","https://www.deeplearning.ai/the-batch/ai-co-scientist-an-agent-that-generates-research-hypotheses-aiding-drug-discovery/","https://www.deeplearning.ai/the-batch/ai-and-data-center-boom-challenges-big-techs-emissions-targets/"]


далі треба якось спарсити все те


In [4]:
contents = []
images = []
for url in urls:
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'lxml')
    initial_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
    images.append([url for url in initial_images if 'gif' not in url.lower() and 'wordpress' not in url.lower() and 'svg+xml' not in url.lower() and 'batch-logo' not in url.lower()])
    elements = soup.select(".prose--styled") # вибираєсмо класи, записується в зворотньому порядку
    print(f"{url} - Знайдено contents {len(elements)} елементs")
    print(f"{url} - Знайдено imgs {len(images)} елементs")
    contents.append(elements[0].get_text(separator=' ')) # нормальний текст
contents
images    

https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/ - Знайдено contents 1 елементs
https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/ - Знайдено imgs 1 елементs
https://www.deeplearning.ai/the-batch/ai-co-scientist-an-agent-that-generates-research-hypotheses-aiding-drug-discovery/ - Знайдено contents 1 елементs
https://www.deeplearning.ai/the-batch/ai-co-scientist-an-agent-that-generates-research-hypotheses-aiding-drug-discovery/ - Знайдено imgs 2 елементs
https://www.deeplearning.ai/the-batch/ai-and-data-center-boom-challenges-big-techs-emissions-targets/ - Знайдено contents 1 елементs
https://www.deeplearning.ai/the-batch/ai-and-data-center-boom-challenges-big-techs-emissions-targets/ - Знайдено imgs 3 елементs


[['/_next/image/?url=https%3A%2F%2Fcharonhub.deeplearning.ai%2Fcontent%2Fimages%2F2025%2F06%2Funnamed---2025-06-04T165349.311-1.png&w=3840&q=75'],
 ['/_next/image/?url=https%3A%2F%2Fcharonhub.deeplearning.ai%2Fcontent%2Fimages%2F2025%2F03%2Funnamed--65--1.png&w=3840&q=75'],
 ['/_next/image/?url=https%3A%2F%2Fcharonhub.deeplearning.ai%2Fcontent%2Fimages%2F2024%2F07%2Funnamed--70--1.jpg&w=3840&q=75']]

## 2. Препроцесінг тексту та зображень

витягнений текст

In [5]:
for content in contents:
    loader = WebBaseLoader(web_paths=urls)
    text_docs = loader.load()
text_docs[0].page_content = content

In [6]:
### перевірка (можна пропустити)
(len(text_docs))

3

збереження зображення

In [7]:
prefix = "https://www.deeplearning.ai"
img_dir = "downloaded_images"
os.makedirs(img_dir, exist_ok=True)

count = 0
for image_urls in images:
    for img_url in image_urls:
        resp = requests.get(prefix+img_url)
        ext_part = img_url.split('.')[-1] # Відокремлюємо частину після останньої крапки, а потім беремо до ? або &
        ext = ext_part.split('?')[0].split('&')[0]  # Обрізаємо параметри
        filename = f"img_{count}.{ext}"
        filepath = os.path.join(img_dir, filename)
        with open(filepath, "wb") as f:
            f.write(resp.content)
        # print(resp)
        count += 1

перетворення зображень на numpy масив

In [8]:
img_dir = "downloaded_images"
numpy_images = []

for filename in os.listdir(img_dir):
    filepath = os.path.join(img_dir, filename)
    with Image.open(filepath) as img:
        img = img.convert("RGB") 
        np_img = np.array(img)
        numpy_images.append(np_img)

print(f"Завантажено та конвертовано {len(numpy_images)} images у numpy")

Завантажено та конвертовано 3 images у numpy


### поділ тексту

використовуємо langchain для поділу тексту на шматки

In [9]:
all_text_splits = []

for text_doc in text_docs:
    text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,  # chunk size (characters)
    chunk_overlap=100,  # chunk overlap (characters) (перекриття між суміжними шматками (50 символів (або слів) з кінця попереднього шматка повторюються на початку наступного)
    add_start_index=True,  # track index in original document
    )

    all_text_splits += text_splitter.split_documents(text_docs)

print(f"Split post into {len(all_text_splits)} sub-documents.")

Split post into 54 sub-documents.


## 3-4 мультимодальний ембединг


In [10]:
embedding_function = OpenCLIPEmbeddingFunction()

  from .autonotebook import tqdm as notebook_tqdm


## 5. 🗃️ Створення мультимодального індексу

- Об’єднуємо текстові + візуальні ембединги.
- Індексування за допомогою FAISS / Chroma / Milvus.

pip install chromadb (я хз чи це треба)
pip install langchain-chroma


### метадані та ід

In [11]:
### id
text_ids = [f"text_{i}" for i in range(len(all_text_splits))] # ід для тексту
image_ids = [f"img_{i}" for i in range(len(numpy_images))] # ід для зображень
# img_description_ids = [f"img_desc_{i}" for i in range(len(numpy_images))] # id опису зображень

text_documents = [doc.page_content for doc in all_text_splits] # для хрома (бо док-лангчеін не їсть)

In [12]:
text_metadatas = []
for split in all_text_splits:
    text_metadatas.append({
        "type": "text",
        "source": split.metadata.get("source", "unknown"),  # якщо є
        "title": split.metadata.get("title", "no_title"),
    })

image_metadatas = []
for text_doc in text_docs:
    image_metadatas.append({
        "type": "image",
        "source": text_doc.metadata.get("source", "unknown"),  # якщо є
        "title": text_doc.metadata.get("title", "no_title"),
        "local_path": img_dir + ''
        
    })    

# img_desc_metadatas = [] - на перспективу
# for text_doc,id_img in zip(text_docs,image_ids):
#     img_desc_metadatas.append({
#         "type": "text",
#         "source": text_doc.metadata.get("source", "unknown"),  # якщо є
#         "title": text_doc.metadata.get("title", "no_title"),
#         "image_ids": id_img,
#     })        

In [19]:
#перевірка (optional)
print(text_metadatas)
print(image_metadatas)
# print(img_desc_metadatas)

[{'type': 'text', 'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/', 'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom'}, {'type': 'text', 'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/', 'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom'}, {'type': 'text', 'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/', 'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom'}, {'type': 'text', 'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/', 'title': 'The Inter

### також додамо опис для зображень (SKIP)

In [None]:
# processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")

# def generate_caption(image):
#     inputs = processor(image, return_tensors="pt")
#     out = model.generate(**inputs)
#     caption = processor.decode(out[0], skip_special_tokens=True)
#     return caption

In [None]:
# image_description = []
# for img in numpy_images:
#     img_pil = Image.fromarray(img)
#     caption = generate_caption(img_pil)
#     image_description.append(caption)    

In [None]:
# print(image_description)

### add до бд

In [13]:
# data_loader = ImageLoader() # для зберігання з uris 
client = chromadb.PersistentClient(path="chroma_langchain_db/") # для збереження локально

collection = client.create_collection(
    name='multimodal_collection',
    embedding_function=embedding_function,
    # data_loader=data_loader,
)

InternalError: Collection [multimodal_collection] already exists

In [21]:
###додаєм до век бд текст
collection.add(
    ids=text_ids, 
    documents=text_documents,
    metadatas=text_metadatas,
               )

In [22]:
###додаєм до век бд зображення
collection.add(
    ids=image_ids,
    images=numpy_images,
    metadatas=image_metadatas,
)

In [None]:
# SKIP
# ###додаєм до век бд описи зображення
# collection.add(
#     ids=img_description_ids,
#     documents=image_description,
#     metadatas=image_metadatas,
# )

In [14]:
# перевірка (optional)
print(collection.count())


NameError: name 'collection' is not defined

## 6. Запит і Ретрівал (тести) (can be skipped)

- Користувач формулює запит (наприклад: “Що нового в архітектурах NVIDIA?”).
- Вивід: текст статті + пов’язане зображення.

In [72]:
#тест на зображенні
results = collection.query(
    query_images=[numpy_images[0]]
)

print((results))

{'ids': [['img_0', 'img_1', 'img_2', 'text_14', 'text_32', 'text_50', 'text_9', 'text_27', 'text_45', 'text_4']], 'embeddings': None, 'documents': [[None, None, None, 'have increased more than four-fold since 2019.Low-emissions energy has reduced Google’s total data-center emissions substantially, but some regions don’t have enough of it to meet demand. Solar, wind, hydro, geothermal, and nuclear energy account for most of the energy consumed by Google’s data centers in Europe, Canada, and South America. However, these sources account for less than 5 percent in Singapore, Qatar, and Saudi Arabia.Countering the trend:\xa0Google is working to reduce its greenhouse gas emissions on several fronts. Its effort to purchase electricity from low-emissions sources cut its net carbon footprint by around 30 percent in 2023. It claims that its owned-and-operated data centers are 1.8 times more energy-efficient than a typical enterprise data center, and its sixth-generation tensor processing units 

In [75]:
docs = results.get('documents', [[]])[0]  # перший список документів

first_non_none_doc = next((doc for doc in docs if doc is not None), None)
print(first_non_none_doc)

have increased more than four-fold since 2019.Low-emissions energy has reduced Google’s total data-center emissions substantially, but some regions don’t have enough of it to meet demand. Solar, wind, hydro, geothermal, and nuclear energy account for most of the energy consumed by Google’s data centers in Europe, Canada, and South America. However, these sources account for less than 5 percent in Singapore, Qatar, and Saudi Arabia.Countering the trend: Google is working to reduce its greenhouse gas emissions on several fronts. Its effort to purchase electricity from low-emissions sources cut its net carbon footprint by around 30 percent in 2023. It claims that its owned-and-operated data centers are 1.8 times more energy-efficient than a typical enterprise data center, and its sixth-generation tensor processing units (TPUs) are 67 percent more efficient than the prior generation. Google has asked its largest hardware partners to match 100 percent of their energy consumption with


In [16]:
#тест на тексті
results = collection.query(
    query_texts=["How is AI growth impacting tech companies' carbon goals and data center emissions?"]
)

print((results))

NameError: name 'collection' is not defined

In [78]:
results['metadatas']

[[{'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom',
   'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/',
   'type': 'text'},
  {'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/',
   'type': 'text',
   'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom'},
  {'title': 'The International Energy Agency Examines The Energy Costs and Potential Savings of the AI Boom',
   'source': 'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/',
   'type': 'text'},
  {'type': 'text',
   'title': "AI and Data Center Boom Challenges Big Tech's Emissions Targets",
   'source': 'https://www.deeplearning.ai/the-batch/ai-and-dat

In [80]:
first_metadata = results['metadatas'][0][0]
first_source = first_metadata.get('source')
first_source

'https://www.deeplearning.ai/the-batch/the-international-energy-agency-examines-the-energy-costs-and-potential-savings-of-the-ai-boom/'

In [None]:
results_imgs = collection.get(
    where={
        "$and": [
            {"source": first_source},
            {"type": "image"}
        ]
    }
)


вивести зображення

In [64]:
def find_file_by_prefix(prefix, folder):
    for filename in os.listdir(folder):
        if filename.startswith(prefix):
            return os.path.join(folder, filename)
    return None

for img_id in results_imgs['ids']:
    filepath = find_file_by_prefix(img_id, img_dir)
    if filepath and os.path.exists(filepath):
        img = Image.open(filepath)
        img.show()
    else:
        print(f"Файл для {img_id} не знайдено")
filepath        

'downloaded_images\\img_0.png'

### LLM

лише текст

In [69]:
client = genai.Client(api_key=api_key)

with open(filepath, 'rb') as f:
      image_bytes = f.read()
query = "How is AI growth impacting tech companies' carbon goals and data center emissions?"

response = client.models.generate_content(
    model="gemini-2.5-flash", 
    contents=[
        types.Part(text=(
            "You are given a piece of text and an image. "
            "Based on both, provide a clear, structured, and factual response to the following query.\n\n"
            "Context:\n"
            f"{results['documents'][0][0]}\n\n"
            "Query:\n"
            f"{query}\n"
            "Use only the information available in the context and image. If you cannot answer based on that, say so honestly."
        )),
        types.Part.from_bytes(
            data=image_bytes,
            mime_type='image/jpeg',
        ),
    ]
)
print(response.text)

Based on the provided context and image:

*   **Data Center Emissions:** The image demonstrates that as AI models increase in size and complexity (from "Very small LM" to "Large reasoning model"), their inference electricity consumption significantly rises, ranging from approximately 0.1 Wh to 9.0 Wh for text-generation tasks. This indicates that AI growth, particularly through the deployment of larger and more sophisticated models, will lead to increased electricity consumption by data centers. The text notes that data centers and cloud computing currently account for 1% of the world's energy-related greenhouse gas emissions.
*   **Tech Companies' Carbon Goals:** The provided information does not directly state how AI growth impacts specific tech companies' carbon goals. However, the text suggests that AI is viewed as a powerful tool that "stands to create huge benefits relative to the climate impact of its emissions" and can help "develop low-carbon energy sources and boost energy ef

## 7. ✅ Підсумок і оцінка

- Якість відповідей
- Наскільки релевантні результати

In [20]:
client = chromadb.PersistentClient(path="chroma_langchain_db")
collection= client.get_collection(name="multimodal_collection")

In [21]:
results = collection.query(
    query_texts=["How is AI growth impacting tech companies' carbon goals and data center emissions?"]
)

print((results))

{'ids': [['text_4', 'text_22', 'text_40', 'text_16', 'text_34', 'text_52', 'text_13', 'text_31', 'text_49', 'text_20']], 'embeddings': None, 'documents': [['relieved to note that, for now, data centers and cloud computing are responsible for\xa0 1 percent \xa0of the world’s energy-related greenhouse gas emissions; a drop in the bucket compared to transportation, construction, or agriculture. Moreover, we believe that AI stands to create huge benefits relative to the climate impact of its emissions, and AI is one of the most powerful tools we have to develop low-carbon energy sources and boost energy efficiency throughout society. Continuing to improve the technology will help us develop lower-carbon energy sources and efficient ways to harness them.', 'relieved to note that, for now, data centers and cloud computing are responsible for\xa0 1 percent \xa0of the world’s energy-related greenhouse gas emissions; a drop in the bucket compared to transportation, construction, or agriculture.

In [22]:
from abc import ABC, abstractmethod
from typing import Any, Dict
from utils import file_by_prefix
class AbstractContextFetcher(ABC):
    def __init__(self, collection: Any):
        self.collection = collection

    @abstractmethod
    def build_context(self, query_result: Dict) -> Dict:
        """Повертає пов'язаний контекст"""
        pass

class TextContextFetcher(AbstractContextFetcher):
    """пов’язанні(((в однині))) зображення(((в однині))) на основі текстового запиту"""
    def build_context(self, query_result: Dict) -> Dict:
        first_metadata = query_result['metadatas'][0][0]
        results_imgs = self.collection.get(
            where={
                "$and": [
                    {"source": first_metadata.get('source')},
                    {"type": "image"}
                    ]
            }
        )

        filepath = file_by_prefix(results_imgs['ids'][0]) # на перспективу з можливістю більшої кількості зображень
        text = query_result['documents'][0][0] if query_result['documents'][0] else None
   
        return {"image_path": filepath, "text": text}

In [24]:
t = TextContextFetcher(collection)
t.build_context(results)

{'image_path': 'downloaded_images\\img_0.png',
 'text': 'relieved to note that, for now, data centers and cloud computing are responsible for\xa0 1 percent \xa0of the world’s energy-related greenhouse gas emissions; a drop in the bucket compared to transportation, construction, or agriculture. Moreover, we believe that AI stands to create huge benefits relative to the climate impact of its emissions, and AI is one of the most powerful tools we have to develop low-carbon energy sources and boost energy efficiency throughout society. Continuing to improve the technology will help us develop lower-carbon energy sources and efficient ways to harness them.'}