In [None]:
!pip install selenium
!pip install pymilvus
!pip install pytesseract
!pip install langchain-community


In [2]:
import os
import time
import re
import json
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# === Configuration ===
# Root folder for everything the scraper writes; one sub-folder is created
# per scraped page by create_page_folder().
OUTPUT_DIR = "./zomato_scraped_data"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Zomato "order online" pages to scrape, visited in this order.
URLS = [
    "https://www.zomato.com/lucknow/punjab-grill-gomti-nagar/order",
    "https://www.zomato.com/lucknow/royal-cafe-royal-inn-sapru-marg/order",
    "https://www.zomato.com/lucknow/barkaas-indo-arabic-restaurant-1-aliganj/order",
    "https://www.zomato.com/lucknow/hazratganj-social-hazratganj/order",
    "https://www.zomato.com/lucknow/cafe-hons-house-of-no-sugar-gomti-nagar/order",
    "https://www.zomato.com/lucknow/kake-da-hotel-since-1931-jankipuram/order",
    "https://www.zomato.com/lucknow/cafe-delhi-heights-sadar-bazaar/order",
    "https://www.zomato.com/lucknow/mcdonalds-2-hazratganj/order",
    "https://www.zomato.com/lucknow/grand-patio-hotel-savvy-grand-gomti-nagar/order",
    "https://www.zomato.com/lucknow/abongzaa-multi-cuisine-cafe-restaurant-gomti-nagar/order"
]

# Setup Chrome for Selenium
# Each add_argument() call forwards one command-line switch to the Chrome
# process that Selenium launches.
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--disable-gpu")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument('--disable-dev-shm-usage')
# NOTE(review): the debugging port is fixed, so a second browser started with
# the same options at the same time would presumably clash on it - confirm.
chrome_options.add_argument('--remote-debugging-port=9222')
# Override the browser's user-agent string with a fixed desktop Chrome value.
chrome_options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36"
)

# Module-level browser session shared by scrape_page(); it is started as soon
# as this cell runs and is closed by the driver.quit() call in the entry guard.
driver = webdriver.Chrome(options=chrome_options)

def create_page_folder(title):
    """Create (if needed) and return the output folder for one scraped page.

    The folder name is derived from ``title`` by collapsing every run of
    characters other than ASCII letters and digits into a single underscore
    and trimming underscores from both ends.

    Args:
        title: Page title used to derive the folder name.

    Returns:
        Path of the folder inside ``OUTPUT_DIR``; it exists when this returns.
    """
    folder_name = re.sub(r'[^a-zA-Z0-9]+', '_', title).strip('_')
    if not folder_name:
        # A title without any ASCII letter or digit (empty, punctuation only,
        # non-Latin script) would otherwise resolve to OUTPUT_DIR itself, and
        # every such page would overwrite the same files.
        folder_name = 'untitled'
    folder_path = os.path.join(OUTPUT_DIR, folder_name)
    os.makedirs(folder_path, exist_ok=True)
    return folder_path

def save_file(content, path, binary=False):
    """Write ``content`` to ``path``, replacing any existing file.

    Args:
        content: ``bytes`` when ``binary`` is true, otherwise ``str``.
        path: Destination file path.
        binary: Write in binary mode when true; otherwise write text encoded
            as UTF-8.
    """
    if binary:
        with open(path, 'wb') as out:
            out.write(content)
    else:
        with open(path, 'w', encoding='utf-8') as out:
            out.write(content)

def extract_markdown(html):
    """Return the text content of an HTML document.

    The markup is parsed with BeautifulSoup's ``html.parser`` and the
    extracted text pieces are joined with newlines. Despite the function
    name, no Markdown formatting is generated.

    Args:
        html: HTML source as a string.

    Returns:
        The extracted text.
    """
    parsed = BeautifulSoup(html, 'html.parser')
    page_text = parsed.get_text(separator='\n')
    return page_text

def _find_restaurant_ld_json(soup):
    """Return the last JSON-LD object whose ``@type`` is ``Restaurant``, or None."""
    restaurant = None
    for script in soup.find_all('script', type='application/ld+json'):
        raw = script.string
        if not raw or not raw.strip():
            # Empty script tags carry no data; skip them instead of reporting
            # a parse error.
            continue
        try:
            data = json.loads(raw.strip())
        except Exception as e:
            print("Error parsing ld+json:", e)
            continue
        # A JSON-LD block may hold a single object or a list of objects.
        candidates = data if isinstance(data, list) else [data]
        for candidate in candidates:
            if isinstance(candidate, dict) and candidate.get('@type') == 'Restaurant':
                restaurant = candidate
    return restaurant


def _parse_price(price_text):
    """Convert displayed price text (e.g. "₹1,250") to a number; 0 if none found.

    Whole amounts are returned as ``int``; amounts with a fractional part are
    returned as ``float`` instead of having the decimal point stripped.
    """
    match = re.search(r"\d[\d,]*(?:\.\d+)?", price_text or "")
    if not match:
        return 0
    value = float(match.group(0).replace(",", ""))
    return int(value) if value.is_integer() else value


def _parse_menu_item(item):
    """Build the dictionary describing one menu card element."""
    # NOTE(review): the class names below look auto-generated; presumably they
    # change when the site is rebuilt - verify against a fresh page.
    name_tag = item.find("h4", class_="sc-cGCqpu chKhYc")
    name = name_tag.text.strip() if name_tag else "No name available"

    bestseller_tag = item.find("div", class_="sc-2gamf4-0 fSJGVb")
    is_bestseller = "BESTSELLER" in bestseller_tag.text.upper() if bestseller_tag else False

    # The veg / non-veg marker is an <svg> whose markup references an icon id.
    dish_type = "unknown"
    type_svg = item.find('svg', class_="sc-eOnLuU dlDRKy")
    if type_svg:
        svg_str = str(type_svg)
        if "#veg-icon" in svg_str:
            dish_type = "veg"
        elif "#non-veg-icon" in svg_str:
            dish_type = "non-veg"

    price_tag = item.find("span", class_="sc-17hyc2s-1 cCiQWA")
    price = _parse_price(price_tag.text) if price_tag else 0

    desc_tag = item.find("p", class_="sc-gsxalj jqiNmO")
    description = desc_tag.text.strip() if desc_tag else "No description available"

    return {
        "name": name,
        "isBestseller": is_bestseller,
        "price": price,
        "priceCurrency": "INR",
        "description": description,
        # Kept as the strings "veg" / "non-veg" / "unknown" (not a boolean)
        # for compatibility with data that was already written.
        "isVeg": dish_type,
    }


def extract_structured_data(html):
    """Extract restaurant metadata and menu items from a rendered page.

    Restaurant metadata comes from ``<script type="application/ld+json">``
    blocks; menu items come from the menu card ``<div>`` elements. Cards that
    fail to parse are reported on stdout and skipped.

    Args:
        html: Page source as a string.

    Returns:
        ``{"restaurant": <dict or None>, "menu": {"@type": "Menu",
        "hasMenuItem": [<item dict>, ...]}}``.
    """
    soup = BeautifulSoup(html, 'html.parser')

    restaurant_data = _find_restaurant_ld_json(soup)

    menu_divs = soup.find_all("div", class_="sc-iAVDmT bWpTfk")
    print(f"Found {len(menu_divs)} menu items.")

    menu_items = []
    for item in menu_divs:
        try:
            menu_items.append(_parse_menu_item(item))
        except Exception as e:
            print("Error extracting item:", e)

    # If no menu items were found, log that the extraction failed
    if not menu_items:
        print("No menu items were extracted.")

    return {
        "restaurant": restaurant_data,
        "menu": {
            "@type": "Menu",
            "hasMenuItem": menu_items,
        },
    }


def scrape_page(url):
    """Load ``url`` in the shared browser and save the page in three forms.

    Writes ``page.html`` (raw page source), ``page.md`` (extracted text) and
    ``structured_data.json`` (restaurant and menu data) into a folder named
    after the page title.

    Args:
        url: Address of the page to scrape.
    """
    print(f"Scraping: {url}")
    driver.get(url)
    time.sleep(5)  # wait for page to load JS content

    html = driver.page_source
    soup = BeautifulSoup(html, 'html.parser')

    # Tag.string is None when <title> is empty or contains nested markup, so
    # guard before calling .strip(); blank titles fall back to 'untitled' too.
    title_text = soup.title.string if soup.title else None
    title = (title_text or '').strip() or 'untitled'
    folder = create_page_folder(title)

    # Save raw HTML
    save_file(html, os.path.join(folder, 'page.html'))

    # Save plain markdown-style content
    md_content = extract_markdown(html)
    save_file(md_content, os.path.join(folder, 'page.md'))

    # Extract and save structured data
    structured_data = extract_structured_data(html)
    save_file(json.dumps(structured_data, indent=2), os.path.join(folder, 'structured_data.json'))

if __name__ == '__main__':
    try:
        for link in URLS:
            # One failing page must not stop the remaining pages.
            try:
                scrape_page(link)
            except Exception as err:
                print(f"Error scraping {link}: {err}")
    finally:
        # Close the browser even when the loop is interrupted (for example by
        # stopping the cell), so no headless Chrome process is left behind.
        driver.quit()


Scraping: https://www.zomato.com/lucknow/punjab-grill-gomti-nagar/order
Found 88 menu items.
Scraping: https://www.zomato.com/lucknow/royal-cafe-royal-inn-sapru-marg/order
Found 208 menu items.
Scraping: https://www.zomato.com/lucknow/barkaas-indo-arabic-restaurant-1-aliganj/order
Found 170 menu items.
Scraping: https://www.zomato.com/lucknow/hazratganj-social-hazratganj/order
Found 174 menu items.
Scraping: https://www.zomato.com/lucknow/cafe-hons-house-of-no-sugar-gomti-nagar/order
Found 181 menu items.
Scraping: https://www.zomato.com/lucknow/kake-da-hotel-since-1931-jankipuram/order
Found 176 menu items.
Scraping: https://www.zomato.com/lucknow/cafe-delhi-heights-sadar-bazaar/order
Found 201 menu items.
Scraping: https://www.zomato.com/lucknow/mcdonalds-2-hazratganj/order
Found 205 menu items.
Scraping: https://www.zomato.com/lucknow/grand-patio-hotel-savvy-grand-gomti-nagar/order
Found 99 menu items.
Scraping: https://www.zomato.com/lucknow/abongzaa-multi-cuisine-cafe-restaurant-g

In [3]:
import os
import json

# Folder scanned by process_price_range(); each sub-folder is expected to
# contain a structured_data.json file.
OUTPUT_DIR = "./zomato_scraped_data"

def process_price_range(output_dir=None):
    """Add a ``priceRange`` string to every restaurant's structured data.

    For each sub-folder containing ``structured_data.json``, the lowest and
    highest positive menu prices are formatted as ``"₹<min> - ₹<max>"`` and
    stored under ``restaurant.priceRange`` (an empty string when there is no
    positive price). The file is then rewritten in place.

    Args:
        output_dir: Folder to scan. Defaults to the module-level
            ``OUTPUT_DIR``, looked up at call time.
    """
    base_dir = OUTPUT_DIR if output_dir is None else output_dir

    for restaurant_dir in os.listdir(base_dir):
        json_path = os.path.join(base_dir, restaurant_dir, 'structured_data.json')

        # Skip if the JSON file doesn't exist
        if not os.path.exists(json_path):
            continue

        # Load existing data
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            print(f"Error loading {json_path}: {e}")
            continue

        if not isinstance(data, dict):
            print(f"Error loading {json_path}: top-level JSON value is not an object")
            continue

        # "menu" or "hasMenuItem" may be missing or null; treat both as empty.
        menu_items = (data.get('menu') or {}).get('hasMenuItem') or []
        prices = []
        for item in menu_items:
            if not isinstance(item, dict):
                continue
            price = item.get('price', 0)
            # bool is a subclass of int, so exclude it explicitly.
            if isinstance(price, bool) or not isinstance(price, (int, float)):
                continue
            if price > 0:  # Only consider valid positive prices
                prices.append(price)

        # Calculate price range
        price_range = ""
        if prices:
            price_range = f"₹{min(prices)} - ₹{max(prices)}"

        # Update restaurant data (left untouched when missing, null or empty)
        restaurant_data = data.get('restaurant')
        if isinstance(restaurant_data, dict) and restaurant_data:
            restaurant_data['priceRange'] = price_range

        # Save updated data back to file
        try:
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2, ensure_ascii=False)
            print(f"Updated {json_path}")
        except Exception as e:
            print(f"Error saving {json_path}: {e}")

# Entry point: update the priceRange field of every scraped restaurant.
if __name__ == '__main__':
    process_price_range()

Updated ./zomato_scraped_data\Abongzaa_Multi_Cuisine_Cafe_Restaurant_Gomti_Nagar_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Barkaas_Indo_Arabic_Restaurant_Aliganj_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Cafe_Delhi_Heights_Sadar_Bazaar_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Cafe_Hons_House_Of_No_Sugar_Gomti_Nagar_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Grand_Patio_Hotel_Savvy_Grand_Gomti_Nagar_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Hazratganj_SOCIAL_Hazratganj_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Kake_Da_Hotel_Since_1931_Jankipuram_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\McDonald_s_Hazratganj_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Punjab_Grill_Gomti_Nagar_order_online_Zomato\structured_data.json
Updated ./zomato_scraped_data\Royal_Cafe_Roya

In [10]:
import json
import logging
import os
import re
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

# === Configuration ===
# Folder holding the scraped restaurants; can be overridden through the
# ZOMATO_DATA_DIR environment variable.
OUTPUT_DIR = os.getenv("ZOMATO_DATA_DIR", "./zomato_scraped_data")

# Pre-compile regex patterns once
# Each pattern captures, in group 1, the text that follows an introducing
# phrase ("contains", "made with", "ingredients") up to the next '.' or ';'
# or the end of the string.
INGREDIENTS_PATTERNS = [
    re.compile(r'(?:contains|made with|ingredients)[:\s]*(.*?)(?:[.;]|$)', re.IGNORECASE),
]
# Captures a single digit 0-5 after "spice level", optionally followed by "/5".
SPICE_LEVEL_PATTERN = re.compile(r'spice level[:\s]*([0-5])(?:/5)?', re.IGNORECASE)

# Dietary and category configurations
# Maps a dietary tag to the keywords that trigger it.
DIETARY_KEYWORDS: Dict[str, List[str]] = {
    'vegetarian': ['veg', 'plant based', 'no meat'],
    'vegan': ['vegan', 'dairy free', 'no animal'],
    'gluten-free': ['gluten-free', 'gf', 'no gluten'],
    'spicy': ['spicy', 'hot', 'chili'],
}

# Maps a menu category to its trigger keywords. classify_menu_category()
# walks this dict in insertion order and returns the first category that
# matches, so the order of the entries is significant.
MENU_CATEGORIES: Dict[str, List[str]] = {
    'appetizer': ['platter', 'starter', 'soup', 'salad', 'bruschetta'],
    'main_course': ['curry', 'rice', 'noodles', 'burger', 'pizza', 'pasta'],
    'dessert': ['ice cream', 'cake', 'sweet', 'pastry', 'pie'],
    'beverage': ['juice', 'coffee', 'tea', 'smoothie'],
}

# Set up logging
# Messages go to preprocessing.log rather than to the cell output.
# NOTE: logging.basicConfig() does nothing when the root logger already has
# handlers configured, in which case this file is not created.
logging.basicConfig(
    filename="preprocessing.log",
    level=logging.INFO,
    format="%(asctime)s %(levelname)s:%(message)s",
)

def normalize_text(text: str) -> str:
    """Return ``text`` lower-cased with punctuation removed and ends trimmed.

    Every character that is neither a word character nor whitespace is
    deleted (not replaced by a space), so "non-veg" becomes "nonveg". Falsy
    input such as ``None`` yields an empty string.
    """
    source = text if text else ""
    without_punctuation = re.sub(r'[^\w\s]', '', source)
    lowered = without_punctuation.lower()
    return lowered.strip()

def _dietary_keyword_matches(keyword: str, normalized_desc: str) -> bool:
    """Tell whether ``keyword`` occurs at the start of a word in the text."""
    # The description has had its punctuation deleted, so the keyword must be
    # normalized the same way or entries such as 'gluten-free' could never
    # match. The hyphen-as-space variant also catches "gluten free".
    variants = {normalize_text(keyword), normalize_text(keyword.replace('-', ' '))}
    for variant in variants:
        words = variant.split()
        if not words:
            continue
        # Require the keyword to begin a word: 'veg' must not fire on
        # "nonveg" nor 'hot' on "shot". Endings stay open so that inflected
        # forms such as "veggie" or "chilies" still match.
        pattern = r'(?<!\w)' + r'\s+'.join(re.escape(word) for word in words)
        if re.search(pattern, normalized_desc):
            return True
    return False


def extract_dietary_tags(description: str) -> List[str]:
    """Return the sorted dietary tags whose keywords occur in ``description``.

    The description is normalized with ``normalize_text`` and every tag in
    ``DIETARY_KEYWORDS`` is reported when at least one of its keywords starts
    a word of the normalized text.

    Known limitation: "non veg" written as two separate words still matches
    the 'veg' keyword.

    Args:
        description: Free-text dish description; ``None`` or empty is allowed.

    Returns:
        Alphabetically sorted list of matching tag names (possibly empty).
    """
    desc = normalize_text(description)
    tags = {
        tag for tag, keywords in DIETARY_KEYWORDS.items()
        if any(_dietary_keyword_matches(kw, desc) for kw in keywords)
    }
    return sorted(tags)

def classify_menu_category(name: str, description: str) -> str:
    """Return the first menu category whose keyword occurs in the dish text.

    The normalized name and description are joined with a space and searched
    by plain substring containment. Categories are tried in the insertion
    order of ``MENU_CATEGORIES``; ``'other'`` is returned when none matches.
    """
    haystack = normalize_text(name) + " " + normalize_text(description)
    for label, triggers in MENU_CATEGORIES.items():
        for trigger in triggers:
            if trigger in haystack:
                return label
    return 'other'

def extract_ingredients(description: str) -> List[str]:
    """Return the ingredient names listed in ``description``.

    The first pattern in ``INGREDIENTS_PATTERNS`` that matches decides the
    result: its captured text is split on commas and on " and ", and each
    non-blank piece is lower-cased and trimmed. An empty list is returned
    when no pattern matches.
    """
    text = description if description else ""
    for matcher in INGREDIENTS_PATTERNS:
        found = matcher.search(text)
        if found is None:
            continue
        ingredients = []
        for piece in re.split(r',\s*| and ', found.group(1)):
            if piece.strip():
                ingredients.append(piece.lower().strip())
        return ingredients
    return []

def extract_spice_level(description: str) -> Optional[int]:
    """Return the spice level stated in ``description``, if any.

    Looks for text matching ``SPICE_LEVEL_PATTERN`` (for example
    "Spice level: 3/5") and returns the captured digit as an integer.

    Args:
        description: Free-text dish description; ``None`` or empty is allowed.

    Returns:
        The spice level as an ``int``, or ``None`` when the description does
        not state one. ``None`` (rather than 0) keeps "not stated" distinct
        from an explicit level of 0.
    """
    if m := SPICE_LEVEL_PATTERN.search(description or ""):
        return int(m.group(1))
    return None

def process_menu_items(data: Dict[str, Any]) -> None:
    """Enrich every menu item of ``data`` in place with derived fields.

    Adds ``dietary_tags``, ``menu_category``, ``normalized_name``,
    ``ingredients``, ``spice_level`` and ``processed_at`` to each entry of
    ``data['menu']['hasMenuItem']``. All items processed by one call share
    the same UTC timestamp.
    """
    timestamp = datetime.now(timezone.utc).isoformat()
    menu_entries = data.get('menu', {}).get('hasMenuItem', [])
    for entry in menu_entries:
        # Missing or null fields are treated as empty strings.
        desc_text = entry.get('description', '') or ""
        dish_name = entry.get('name', '') or ""
        derived = {
            'dietary_tags': extract_dietary_tags(desc_text),
            'menu_category': classify_menu_category(dish_name, desc_text),
            'normalized_name': normalize_text(dish_name),
            'ingredients': extract_ingredients(desc_text),
            'spice_level': extract_spice_level(desc_text),
            'processed_at': timestamp,
        }
        entry.update(derived)

def process_all_restaurants(output_dir: str = OUTPUT_DIR) -> None:
    """Enrich the menu items of every restaurant found under ``output_dir``.

    Each ``<output_dir>/<restaurant>/structured_data.json`` is loaded, passed
    through ``process_menu_items`` and written back in place. A failure on
    one restaurant is logged with its traceback and does not stop the others.

    Args:
        output_dir: Folder containing one sub-folder per restaurant.
    """
    for rest in os.listdir(output_dir):
        path = os.path.join(output_dir, rest, 'structured_data.json')
        if not os.path.isfile(path):
            continue

        try:
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            process_menu_items(data)

            # Serialize before opening the file for writing, so a
            # serialization error cannot leave a truncated file behind.
            serialized = json.dumps(data, indent=2, ensure_ascii=False)
            with open(path, 'w', encoding='utf-8') as f:
                f.write(serialized)

            # Count with the same tolerant lookup process_menu_items uses, so
            # a file without a menu is not reported as failed after it was
            # written successfully.
            item_count = len(data.get('menu', {}).get('hasMenuItem', []))
            logging.info(f"Processed {item_count} items for {rest}")

        except Exception as e:
            logging.error(f"Failed {rest}: {e}", exc_info=True)

# Entry point: enrich the menu items of every restaurant under OUTPUT_DIR.
if __name__ == "__main__":
    process_all_restaurants()


In [None]:
import os

from pymilvus import connections, utility

# Read the connection settings from the environment so that real credentials
# never have to be typed into (and saved with) the notebook. The placeholder
# strings remain as defaults for anyone who still edits them by hand.
MILVUS_URI = os.getenv("MILVUS_URI", "## Enter your uri")
TOKEN = os.getenv("MILVUS_TOKEN", "## enter your token")

connections.connect("default", uri=MILVUS_URI, token=TOKEN)
collection_name = "knowledge_base"
dim = 384  # Dimension of the embedding model

# Check and reset collection if it already exists
# WARNING: this drops the existing collection, so re-running this cell
# discards everything previously stored under `collection_name`.
check_collection = utility.has_collection(collection_name)
if check_collection:
    utility.drop_collection(collection_name)

In [None]:
from pymilvus import FieldSchema, DataType, CollectionSchema, Collection
# Collection layout: an auto-generated integer primary key, the chunk text,
# and its embedding vector. `dim` and `collection_name` come from the
# connection cell above.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    # NOTE(review): chunks longer than max_length would presumably be rejected
    # on insert - confirm the unit (characters vs. bytes) in the Milvus docs.
    FieldSchema(name="content", dtype=DataType.VARCHAR, max_length=10000),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=dim),
]
schema = CollectionSchema(fields, description="Knowledge base embeddings")
# Handle used by the later insert / index / load calls.
collection = Collection(name=collection_name, schema=schema)

In [None]:
import os
from PIL import Image
import pytesseract
from bs4 import BeautifulSoup
from langchain.docstore.document import Document

# OCR for images
def process_image(image_path):
    """Run OCR on an image file and return the recognized text.

    Args:
        image_path: Path of the image to read.

    Returns:
        The recognized text with surrounding whitespace removed, or an empty
        string when the image cannot be opened or processed (the error is
        printed, not raised).
    """
    try:
        # Use the image as a context manager so its file handle is released
        # as soon as OCR finishes instead of staying open.
        with Image.open(image_path) as image:
            text = pytesseract.image_to_string(image)
        return text.strip()
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        return ""

# Text extraction for HTML
def process_html(html_path):
    """Return the text content of an HTML file.

    The file is read as UTF-8 and parsed with BeautifulSoup's
    ``html.parser``; text pieces are stripped and joined with newlines.

    Args:
        html_path: Path of the HTML file.

    Returns:
        The extracted text, or an empty string when the file cannot be read
        or parsed (the error is printed, not raised).
    """
    try:
        with open(html_path, "r", encoding="utf-8") as handle:
            markup = handle.read()
            parsed = BeautifulSoup(markup, "html.parser")
            page_text = parsed.get_text(separator="\n", strip=True)
        return page_text
    except Exception as e:
        print(f"Error processing HTML {html_path}: {e}")
        return ""

# Folder processing function
def process_folder(folder_path):
    """Build one ``Document`` per sub-folder of ``folder_path``.

    Every sub-folder is walked recursively. The stripped raw text of each
    ``.json`` file and the extracted text of each ``.html`` file are
    collected; other files are ignored. The pieces are joined with newlines
    into a single document whose ``source`` metadata is the sub-folder name.
    Sub-folders that yield no text produce no document.

    Args:
        folder_path: Folder whose immediate sub-folders are processed.

    Returns:
        List of ``Document`` objects, possibly empty.
    """
    documents = []

    for entry_name in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry_name)
        if os.path.isdir(entry_path):
            pieces = []

            for current_dir, _, filenames in os.walk(entry_path):
                for filename in filenames:
                    full_path = os.path.join(current_dir, filename)

                    try:
                        text = ""
                        if filename.endswith(".json"):
                            with open(full_path, "r", encoding="utf-8") as handle:
                                text = handle.read().strip()
                        elif filename.endswith(".html"):
                            text = process_html(full_path)

                        if text:
                            pieces.append(text)

                    except Exception as e:
                        print(f"Error reading file {full_path}: {e}")

            if pieces:
                joined = "\n".join(pieces)
                documents.append(Document(page_content=joined, metadata={"source": entry_name}))

    return documents


In [None]:
!pip install langchain-community
!pip install langchain_community

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings  # updated import

import pytesseract
from PIL import Image

# Build one document per restaurant folder, then cut the documents into
# overlapping chunks (500 characters with 100 characters of overlap).
documents = process_folder("./zomato_scraped_data")
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
chunks = splitter.split_documents(documents)

# Embed the text of every chunk.
# NOTE(review): the vector size must equal the `dim` used for the collection
# schema (384, set in the connection cell) - confirm for this model.
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
contents = [chunk.page_content for chunk in chunks]
embeddings = embedding_model.embed_documents(contents)

# Column-wise insert: one list per schema field except the auto-generated
# "id", in schema order (content, embedding).
collection.insert([contents, embeddings])
collection.flush()

# Create an index for efficient vector search
index_params = {"index_type": "AUTOINDEX", "metric_type": "IP", "params": {}}
collection.create_index("embedding", index_params)
# Load the collection so it can be searched.
collection.load()


