In [4]:
import asyncio
import json
import re
from urllib.parse import urljoin

import nest_asyncio
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString
from playwright.async_api import async_playwright
from playwright.async_api import TimeoutError


In [5]:

# Patch asyncio to allow nested event loops; needed because Jupyter/Colab
# already run a loop of their own (per nest_asyncio's documented purpose).
nest_asyncio.apply()

async def load_and_click_return_html(url: str):
    """Render ``url`` in headless Chromium, click the availability button, return the HTML.

    The page is opened, the first ``button.btn.btn-info.btn-block`` whose text
    contains "Vérifier la disponibilité" is clicked, and after a fixed
    2-second pause the current DOM is serialized.

    Args:
        url: Address of the page to load.

    Returns:
        The page HTML as a string. If no matching button appears within
        10 seconds, the HTML of the page as loaded (without any click) is
        returned so the caller can still parse it.

    Raises:
        Any Playwright error raised while launching the browser or navigating
        to ``url`` (including a navigation timeout) propagates to the caller.
    """
    button_selector = 'button.btn.btn-info.btn-block'

    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            try:
                await page.wait_for_selector(button_selector, timeout=10000)
            except TimeoutError:
                # No availability button on this page: hand back what was
                # rendered instead of aborting the caller's whole scrape.
                return await page.content()

            # Several buttons can share these classes; pick the one by label.
            for btn in await page.query_selector_all(button_selector):
                text = await btn.text_content()
                # text_content() may return None, which does not support `in`.
                if text and "Vérifier la disponibilité" in text:
                    await btn.click()
                    break

            # Give content loaded by the click time to appear.
            await page.wait_for_timeout(2000)

            return await page.content()
        finally:
            # Close the browser on every exit path, including errors.
            await browser.close()




In [6]:
def _dict_or_empty(value):
    """Return ``value`` when it is a dict, otherwise an empty dict."""
    return value if isinstance(value, dict) else {}


def _find_hotel_json_ld(soup):
    """Return the first JSON-LD object typed 'Hotel' found in ``soup``, or None."""
    for tag in soup.find_all('script', type='application/ld+json'):
        try:
            data = json.loads(tag.string)
        except (json.JSONDecodeError, TypeError):
            # TypeError covers tag.string being None; skip unparsable scripts.
            continue

        # A JSON-LD script may hold one object, a list of objects, or an
        # object that wraps its nodes in '@graph'.
        if isinstance(data, dict):
            graph = data.get('@graph')
            candidates = [data] + (graph if isinstance(graph, list) else [])
        elif isinstance(data, list):
            candidates = data
        else:
            candidates = []

        for node in candidates:
            if not isinstance(node, dict):
                continue
            node_type = node.get('@type')
            if node_type == 'Hotel' or (isinstance(node_type, list) and 'Hotel' in node_type):
                return node
    return None


def extract_hotel_info(html):
    """Parse a hotel page and return its fields as a flat list.

    Args:
        html: HTML source of the hotel page.

    Returns:
        ``[url, name, stars_rating, avis, location, latitude, longitude,
        services, price]`` where:
          * url          -- ``href`` of the canonical ``<link>``, or None
          * name         -- ``content`` of the ``og:title`` meta tag, or None
          * stars_rating -- first digit of the ``og:category`` meta content as int, or None
          * avis         -- ``aggregateRating.ratingValue`` from the Hotel JSON-LD, or None
          * location     -- ``address.addressLocality`` from the Hotel JSON-LD, or None
          * latitude / longitude -- ``geo`` values from the Hotel JSON-LD, or None
          * services     -- list of texts of the ``col`` divs in the services grid (may be empty)
          * price        -- first number in the selected price div as float, or None

        Any field whose source element or attribute is missing is None
        (``services`` is an empty list) instead of raising.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # URL
    url_tag = soup.find('link', rel='canonical')
    url = url_tag.get('href') if url_tag else None

    # Name
    name_tag = soup.find('meta', property='og:title')
    name = name_tag.get('content') if name_tag else None

    # Star rating: first digit found in the og:category content
    stars_rating = None
    stars_tag = soup.find('meta', property='og:category')
    if stars_tag:
        match = re.search(r'(\d)', stars_tag.get('content') or '')
        stars_rating = int(match.group(1)) if match else None

    # Fields sourced from the Hotel JSON-LD object
    avis = latitude = longitude = location = None
    hotel = _find_hotel_json_ld(soup)
    if hotel is not None:
        # Nested values can be null or non-objects; treat those as empty.
        avis = _dict_or_empty(hotel.get('aggregateRating')).get('ratingValue')
        geo = _dict_or_empty(hotel.get('geo'))
        latitude = geo.get('latitude')
        longitude = geo.get('longitude')
        locality = _dict_or_empty(hotel.get('address')).get('addressLocality')
        location = f"{locality}" if locality else None

    # Services: one entry per 'col' div inside the services grid
    services = []
    services_div = soup.find('div', class_='row row-cols-2 row-cols-lg-4 g-2 g-lg-3 mb-3')
    if services_div:
        services = [div.get_text(strip=True) for div in services_div.find_all('div', class_='col')]

    # Price: first integer or decimal number in the selected price div
    price = None
    price_div = soup.find('div', class_='text-end col-md-2 col-5 price font-weight-600 fs-6 selected')
    if price_div:
        match = re.search(r'(\d+(?:\.\d+)?)', price_div.get_text())
        if match:
            price = float(match.group(1))

    return [url, name, stars_rating, avis, location, latitude, longitude, services, price]


In [7]:
# Collect the detail-page link of every hotel listed for each destination.

# Listing page for each destination city.
destinations = {
    "Tunis": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-tunis/",
    "Hammamet": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-hammamet/",
    "Korba": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-korba/",
    "Nabeul": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-nabeul/",
    "Korbous": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-korbous/",
    "Sousse": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-sousse/",
    "Monastir": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-monastir/",
    "Mahdia": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-mahdia/",
    "Tabarka": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-tabarka/",
    "Ain Draham": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-ain-draham/",
    "Tozeur": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-tozeur/",
    "Djerba": "https://www.traveltodo.com/sejours-en-tunisie/hotels/hotel-djerba/",
    # Add more cities and their URLs as needed
}

base_url = "https://www.traveltodo.com"  # Site root used to resolve relative hotel links

all_hotels = []  # Absolute URLs of hotel detail pages, in scrape order

# Loop through each destination
for city, url in destinations.items():
    print(f"Scraping {city}...")
    try:
        # A timeout keeps one unresponsive page from hanging the cell forever.
        response = requests.get(url, timeout=30)
        # Report HTTP errors as failures instead of parsing an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # Each hotel is listed as an <h3 class="text-capitalize"> wrapping its link.
        for block in soup.find_all("h3", class_="text-capitalize"):
            a_tag = block.find("a")
            href = a_tag.get("href") if a_tag else None
            if not href:
                # No usable link: skip rather than queue a placeholder that
                # the browser could not navigate to later.
                continue
            # urljoin handles both site-relative and already-absolute hrefs.
            all_hotels.append(urljoin(base_url, href))

    except Exception as e:
        print(f"Failed to scrape {city}: {e}")




Scraping Tunis...
Scraping Hammamet...
Scraping Korba...
Scraping Nabeul...
Scraping Korbous...
Scraping Sousse...
Scraping Monastir...
Scraping Mahdia...
Scraping Tabarka...
Scraping Ain Draham...
Scraping Tozeur...
Scraping Djerba...


In [8]:
# all_hotels

In [10]:
# Collect all data rows here
all_data = []

# x = 4
# stop_after_x = True


print(len(all_hotels))


z=237
for i in range(237,len(all_hotels)):
    html = await load_and_click_return_html(all_hotels[i])
    data = extract_hotel_info(html)
    all_data.append(data)
    print(data)
    z=z+1
    print(z)
    # if i == x and stop_after_x == True:
    #     break
    

# Define column names
columns = [
    "url", "name", "stars_rating", "avis",
    "location", "latitude", "longitude", "services","price"
]

# Create DataFrame
df = pd.DataFrame(all_data, columns=columns)

# Optionally convert 'services' list to string for CSV
df['services'] = df['services'].apply(lambda x: ', '.join(x) if isinstance(x, list) else '')

# Save to CSV
df.to_csv("hotels_data.csv", index=False, encoding="utf-8-sig")


278
['https://www.traveltodo.com/hotels-tunisie/djerba/club-med-djerba-la-douce-1236.html', 'Club Med Djerba la Douce', 4, None, 'Djerba', 33.780565, 11.04851, ['Bar', 'Restaurant', 'Plage', 'Café', 'Discothèque', 'Salle de réunion', 'Piscine', 'Télévision', 'Chaînes câblées', 'Tennis', 'Sports nautiques', 'Change'], None]
238
['https://www.traveltodo.com/hotels-tunisie/djerba/alkantara-thalasso-djerba-1349.html', 'Alkantara Thalasso Djerba', 4, '12', 'Djerba', None, None, ['Climatisation', 'Sèche-cheveux', 'Bar', 'Restaurant', 'Plage', 'Salle de réunion', 'Piscine', 'Télévision', 'Sports nautiques'], None]
239
['https://www.traveltodo.com/hotels-tunisie/djerba/aldiana-djerba-atlantide-2143.html', 'Aldiana Djerba Atlantide', 4, None, 'Djerba', None, None, [], 655.5]
240
['https://www.traveltodo.com/hotels-tunisie/djerba/tui-magic-life-penelope-1325.html', 'Tui Magic Life Penelope', 4, '15', 'Djerba', 33.8457648, 10.993065, ['Climatisation', "Garderie d'enfants", 'Téléphone avec ligne d

In [None]:
# import folium
# import pandas as pd

# def pin_all_hotels_on_map(df, map_file='hotels_map.html'):
#     # Center the map around the average location
#     center_lat = df['latitude'].mean()
#     center_lon = df['longitude'].mean()

#     hotel_map = folium.Map(location=[center_lat, center_lon], zoom_start=10)

#     # Loop through DataFrame and add a marker for each hotel
#     for _, row in df.iterrows():
#         if pd.notnull(row['latitude']) and pd.notnull(row['longitude']):
#             popup_text = f'<b>{row["name"]}</b><br>{row["stars_rating"]}⭐<br>{row["location"]}'
#             folium.Marker(
#                 location=[row['latitude'], row['longitude']],
#                 popup=folium.Popup(popup_text, max_width=250),
#                 tooltip=row['name']
#             ).add_to(hotel_map)

#     hotel_map.save(map_file)
#     print(f"Map saved as {map_file}")


In [None]:
 #pin_all_hotels_on_map(df, map_file='hotels_map.html')