# Data Prep

Code authored by: Shaw Talebi 

[Blog link](https://medium.com/towards-data-science/multimodal-rag-process-any-file-type-with-ai-e6921342c903) 
| [Video link](https://youtu.be/Y7pNmocrmi8)

### Imports

In [None]:
from bs4 import BeautifulSoup
import os
from utility_functions import *

from PIL import Image
from transformers import CLIPProcessor, CLIPModel

from torch import cat, save

### Extract text and images

In [2]:

def _download_image(image_url, image_dir='images', timeout=30):
    """
    Download a single image and store it under ``image_dir``.

    Args:
        image_url (str): Value of an <img> tag's ``src`` attribute
        image_dir (str): Directory the file is written to (created if missing)
        timeout (float): Seconds to wait for the server before giving up

    Returns:
        str: Local path of the saved file, or '' if the download failed
    """
    # NOTE(review): `requests` is not imported explicitly in this file; it is
    # presumably brought into scope by `from utility_functions import *`.
    try:
        response = requests.get(image_url, timeout=timeout)
    except requests.RequestException:
        # Timeouts, unreachable hosts and URLs requests cannot fetch (relative
        # paths, data: URIs) are treated like a non-200 response: skip the image
        # instead of aborting the whole extraction.
        return ''
    if response.status_code != 200:
        return ''

    # Create images directory if it doesn't exist
    os.makedirs(image_dir, exist_ok=True)

    # Derive the filename from the last URL segment; default to .jpg when the
    # segment carries no extension
    image_filename = os.path.basename(image_url)
    if "." not in image_filename:
        image_filename = f"{image_filename}.jpg"

    # '*' is not allowed in filenames on every platform
    image_filename = image_filename.replace('*', '_')
    local_image_path = os.path.join(image_dir, image_filename)

    print(local_image_path)

    # Save the raw image bytes to the local file path
    with open(local_image_path, 'wb') as f:
        f.write(response.content)

    return local_image_path


def parse_html_images(html_content):
    """
    Parse HTML content and extract images with their captions.

    Every <img> with a ``src`` is downloaded into the local ``images``
    directory. Images without a ``src``, or whose download fails, are skipped.

    Args:
        html_content (str): Raw HTML content to parse

    Returns:
        list: List of dictionaries with the keys 'article_title', 'section',
            'image_path' (local file path) and 'caption'
    """
    # Parse HTML
    soup = BeautifulSoup(html_content, 'html.parser')

    # Get article title
    title_tag = soup.find('title')
    article_title = title_tag.get_text().strip() if title_tag else "Untitled"

    structured_content = []
    current_section = "Main"  # Default section if no headers found

    # Walk headers and images in document order so each image is attributed
    # to the most recent header seen before it
    for element in soup.find_all(['h1', 'h2', 'h3', 'img']):
        if element.name != 'img':
            current_section = element.get_text().strip()
            continue

        image_url = element.get('src', '')
        if not image_url:
            # No source: nothing to download. Skipping here also guarantees a
            # path left over from a previous image is never reused.
            continue

        image_path = _download_image(image_url)
        if not image_path:
            continue

        # Try to get caption from alt text, falling back to the figure caption
        caption = element.get('alt', '')
        if not caption and element.parent.name == 'figure':
            figcaption = element.parent.find('figcaption')
            if figcaption:
                caption = figcaption.get_text().strip()

        structured_content.append({
            'article_title': article_title,
            'section': current_section,
            'image_path': image_path,
            'caption': caption or "No caption available"
        })

    return structured_content


In [3]:

def parse_html_content(html_content):
    """
    Split an HTML document into text chunks labelled with their section.

    Headers (h1-h3) update the current section name; each paragraph or list
    (p, ul, ol) whose stripped text has at least 30 characters becomes one
    record.

    Args:
        html_content (str): Raw HTML content to parse

    Returns:
        list: List of dictionaries with the keys 'article_title', 'section'
            and 'text'
    """
    heading_tags = ['h1', 'h2', 'h3']
    body_tags = ['p', 'ul', 'ol']
    min_chars = 30  # shorter snippets are dropped

    document = BeautifulSoup(html_content, 'html.parser')

    # Fall back to a placeholder when the page has no <title>
    title_tag = document.find('title')
    if title_tag:
        article_title = title_tag.get_text().strip()
    else:
        article_title = "Untitled"

    records = []
    section_name = "Main"  # used until the first header is encountered

    for node in document.find_all(heading_tags + body_tags):
        if node.name in heading_tags:
            section_name = node.get_text().strip()
            continue

        # Everything that is not a header is a paragraph or a list
        body = node.get_text().strip()
        if len(body) < min_chars:
            continue

        records.append({
            'article_title': article_title,
            'section': section_name,
            'text': body
        })

    return records


In [4]:
# Collect the files in the raw directory (expected to be saved HTML pages).
# os.listdir returns entries in arbitrary order and includes sub-directories,
# so sort for reproducible output and keep regular files only.
filename_list = [
    "raw/"+f
    for f in sorted(os.listdir('raw'))
    if os.path.isfile("raw/"+f)
]

# Accumulate the text chunks and image records of every file
text_content_list = []
image_content_list = []
for filename in filename_list:

    with open(filename, 'r', encoding='utf-8') as file:
        html_content = file.read()

    text_content_list.extend(parse_html_content(html_content))
    image_content_list.extend(parse_html_images(html_content))

images\0_YE-Q-OuWnrgrUrQw.jpg
images\1_yvfu8VAp1UgCw4SVvUe77Q.png
images\1_Nwc-ZhRFKH17LWWmsNhbdA.png
images\1_pyqGh5Cbrk_EMlPYtrfrQw.png
images\1_lvX8Mut8SQ1vDhsaewLQ_g.jpeg
images\1_IqUoZEX2CYOsX6oFIVeuIw.jpeg
images\1_PRSGngwjIVW01cLHK41lNg.jpeg
images\1_a6BF-kEeo8rd7OW2a3JYGA.png
images\1_jpmC6Kx7DxVeikEr15vooA.png
images\1_5d3HBNjNIXLy0oMIvJjxWw.png
images\1_AGHBVjzwjXapJSe4aUPrjg.png
images\1_2X1aT8fzFsgbqn23zXmmAA.png
images\1_Nzo536sqahqm1Q24Ms2vmA.png
images\1_4wnqr5p_7N3QD5EkXIQeew.png
images\1_tIY3_ONQQT_cracAPWm8NQ.png
images\1_4wnqr5p_7N3QD5EkXIQeew.png
images\1_Nzo536sqahqm1Q24Ms2vmA.png


In [5]:
# number of extracted text chunks, then number of extracted images
print(len(text_content_list), len(image_content_list), sep="\n")

86
17


In [6]:
# Build one string per text chunk: "<section>: <paragraph text>", capped at
# 256 characters in total. Slicing the joined string (rather than slicing the
# paragraph with 256-len(prefix)) stays correct when the section prefix alone
# is longer than the cap, where the old stop index went negative.
# NOTE(review): 256 is a character budget; CLIP's own limit is counted in
# tokens, so this is only a rough cap - confirm it suits the checkpoint used.
text_list = [
    (content['section'] + ": " + content['text'])[:256]
    for content in text_content_list
]

# Open every downloaded image from its local path
image_list = [
    Image.open(content['image_path'])
    for content in image_content_list
]

In [7]:
# number of text inputs, then number of image inputs
print(len(text_list), len(image_list), sep="\n")

86
17


### Compute embeddings using CLIP

In [8]:
# load the pretrained CLIP model
model = CLIPModel.from_pretrained(
    "openai/clip-vit-base-patch16",
)

# load the matching processor (handles text tokenization and image preprocessing)
processor = CLIPProcessor.from_pretrained(
    "openai/clip-vit-base-patch16",
)

In [9]:
# pre-process text and images
# padding=True pads every text to the longest one in the batch;
# truncation=True cuts any text that exceeds the tokenizer's maximum length so
# an unusually long chunk cannot break the forward pass (texts that already
# fit are unaffected).
inputs = processor(
    text=text_list,
    images=image_list,
    return_tensors="pt",
    padding=True,
    truncation=True,
)

In [10]:
# compute embeddings with CLIP
# Inference only: disable autograd so no computation graph is kept in memory
# and the resulting embeddings do not require gradients.
from torch import no_grad

with no_grad():
    outputs = model(**inputs)

In [11]:
# pull the text and image embedding tensors out of the model output
text_embeddings, image_embeddings = outputs.text_embeds, outputs.image_embeds

In [12]:
# shape of the text embeddings, then shape of the image embeddings
print(text_embeddings.shape, image_embeddings.shape, sep="\n")

torch.Size([86, 512])
torch.Size([17, 512])


### Save Data

In [13]:
# save content list as JSON
# Make sure the output directory exists first (the images directory is created
# the same way). NOTE(review): save_to_json is defined outside this file and
# may already do this - the call is harmless either way.
os.makedirs('data', exist_ok=True)
save_to_json(text_content_list, output_file='data/text_content.json')
save_to_json(image_content_list, output_file='data/image_content.json')

In [14]:
# save embeddings to file
# torch's save does not create missing parent directories, so ensure the
# output directory exists before writing.
os.makedirs('data', exist_ok=True)
save(text_embeddings, 'data/text_embeddings.pt')
save(image_embeddings, 'data/image_embeddings.pt')