In [30]:
import csv
import json
import re
import time
from http.client import RemoteDisconnected
from urllib.parse import unquote, urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

In [31]:
# Category path segments substituted into the first slot of ``base_url``.
categories_path = ["Immobilier", "ImmoNeuf/Immobilier%20Neuf"]

# Listing-index URL template: first slot is the category path, second the page number.
base_url = "https://www.tayara.tn/ads/c/{}/?page={}"

def get_all_soup(last_page: int = 149, timeout: float = 20):
    """Download and parse the listing index pages of every category.

    For each entry of ``categories_path``, pages ``1..last_page`` of
    ``base_url`` are requested and parsed with BeautifulSoup's
    ``html.parser``.

    Args:
        last_page: Highest page number requested per category (inclusive).
            The default keeps the previous behaviour of pages 1 to 149.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        A list of ``BeautifulSoup`` objects, one per page that could be
        downloaded, in crawl order.
    """
    soups = []
    for category in categories_path:
        for page in range(1, last_page + 1):
            url = base_url.format(category, page)
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                # One unreachable page must not discard the pages already fetched.
                print(f"Error fetching {url}: {exc}")
                continue
            soups.append(BeautifulSoup(response.text, 'html.parser'))
    return soups

In [32]:
# Returns the links of all the properties found on one listing index page.
# Example of a property page you can inspect:
# https://www.tayara.tn/item/66eae8a467b755ba922a2a58/Immobilier%20Neuf/Ariana/Ghazela/Appartement_en_S2_de_12880_m_A41_au_4me_tage/
def get_all_links(soup):
    """Collect the absolute URLs of the property pages linked from ``soup``.

    Only the first ``<a>`` of each ``<article class="mx-0">`` is considered,
    and only when its ``href`` contains ``/item/``.

    Args:
        soup: Parsed listing index page (anything exposing ``find_all``).

    Returns:
        A list of absolute property URLs, in document order.
    """
    property_links = []
    for article in soup.find_all('article', class_="mx-0"):
        anchor = article.find('a')
        # Articles without an anchor, or anchors without href, are skipped
        # instead of raising and aborting the whole page.
        href = anchor.get('href') if anchor is not None else None
        if not href or '/item/' not in href:
            continue
        # urljoin leaves already-absolute links untouched and resolves
        # relative ones against the site root.
        property_links.append(urljoin("https://www.tayara.tn", href))
    return property_links

In [33]:
import re
import json
import requests
from bs4 import BeautifulSoup

def _normalize_transaction(val: str):
    if not val:
        return None
    v = str(val).strip().lower()
    if "louer" in v or "location" in v or "à louer" in v:
        return "Location"
    if "vendre" in v or "vente" in v or "à vendre" in v:
        return "Vente"
    return str(val).strip()

def _int_or_none(x):
    if x is None:
        return None
    if isinstance(x, (int, float)):
        return int(x)
    s = str(x)
    m = re.search(r"\d+", s)
    return int(m.group(0)) if m else None

def _parse_city_region_from_url(url: str):
    try:
        parts = [p for p in url.strip('/').split('/') if p]
        if 'item' in parts:
            i = parts.index('item')
            governorate = parts[i+2] if len(parts) > i+2 else None
            delegation = parts[i+3] if len(parts) > i+3 else None
            city = governorate.replace('-', ' ').title() if governorate else None
            region = delegation.replace('-', ' ').title() if delegation else None
            return city, region
    except Exception:
        pass
    return None, None

def _extract_ad_details(soup):
    """Return the ``props.pageProps.adDetails`` dict embedded in the page, or None."""
    for script in soup.find_all('script', attrs={'type': 'application/json'}):
        txt = script.string or script.get_text() or ''
        if 'adDetails' not in txt:
            continue
        try:
            data = json.loads(txt)
        except (ValueError, RecursionError):
            # A malformed payload must not prevent trying the remaining scripts.
            continue
        page_props = data
        for key in ('props', 'pageProps'):
            page_props = page_props.get(key) if isinstance(page_props, dict) else None
            if page_props is None:
                break
        if isinstance(page_props, dict) and isinstance(page_props.get('adDetails'), dict):
            return page_props['adDetails']
    return None


def _find_ad_param(ad_params, label_variants):
    """Return the ``value`` of the ad parameter whose ``label`` best matches.

    Exact label matches are tried for every variant before any substring
    match, and variants are tried in the order given, so a specific label is
    never shadowed by a loose match on an earlier parameter.
    """
    entries = []
    for param in ad_params if isinstance(ad_params, list) else []:
        if not isinstance(param, dict):
            continue
        label = str(param.get('label') or '').strip().lower()
        entries.append((label, param.get('value')))

    for variant in label_variants:
        for label, value in entries:
            if label == variant:
                return value
    for variant in label_variants:
        for label, value in entries:
            if variant in label:
                return value
    return None


def _parse_html_criteria(soup):
    """Read transaction/surface/bathrooms/rooms from the list following the "Critères" heading.

    Returns a dict holding only the fields that were found and parsed to a
    non-None value.
    """
    found = {}
    crit_h2 = soup.find('h2', string=lambda s: isinstance(s, str) and 'Critères' in s)
    ul = crit_h2.find_next('ul') if crit_h2 else None
    if not ul:
        return found
    for li in ul.find_all('li'):
        container = li.find('span', class_=lambda c: isinstance(c, str) and 'flex' in c and 'flex-col' in c)
        if not container:
            continue
        children = container.find_all('span', recursive=False)
        if len(children) < 2:
            continue
        label_text = children[0].get_text(strip=True).lower()
        value_text = children[1].get_text(strip=True)
        if 'type de transaction' in label_text or label_text in ('transaction', 'type'):
            field, value = 'transaction', _normalize_transaction(value_text)
        elif 'superficie' in label_text or 'surface' in label_text:
            field, value = 'surface', _int_or_none(value_text)
        elif 'salles de bains' in label_text or 'salle de bain' in label_text or 'sdb' in label_text:
            field, value = 'bathrooms', _int_or_none(value_text)
        elif 'chambres' in label_text or 'chambre' in label_text or 'pièces' in label_text or 'pieces' in label_text:
            field, value = 'rooms', _int_or_none(value_text)
        else:
            continue
        # An unparsable value must not erase one found earlier.
        if value is not None:
            found[field] = value
    return found


def crawl_property_page(url: str):
    """Download one property page and extract its main attributes.

    Extraction order:

    1. the ``adDetails`` JSON embedded in an ``application/json`` script;
    2. HTML fallbacks for title, price and description when still missing;
    3. the "Critères" list of the page, whose successfully parsed values
       replace the JSON-derived transaction/surface/bathrooms/rooms;
    4. an ``S+<n>`` pattern in the title for ``rooms`` when still missing;
    5. the URL path for ``city``/``region`` when still missing.

    Args:
        url: Absolute URL of the property page.

    Returns:
        A dict with the keys ``link``, ``title``, ``price``, ``transaction``,
        ``city``, ``region``, ``description``, ``surface``, ``bathrooms`` and
        ``rooms``; fields that could not be determined are ``None``.

    Raises:
        requests.RequestException: if the download fails or the server
            answers with an error status (``raise_for_status``).
    """
    resp = requests.get(url, timeout=20)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    item_info = {
        'link': url,
        'title': None,
        'price': None,
        'transaction': None,
        'city': None,
        'region': None,
        'description': None,
        'surface': None,
        'bathrooms': None,
        'rooms': None,
    }

    # 1. Embedded JSON.
    ad_details = _extract_ad_details(soup)
    if ad_details is not None:
        item_info['title'] = ad_details.get('title') or None
        item_info['description'] = ad_details.get('description') or None
        item_info['price'] = _int_or_none(ad_details.get('price')) or None

        loc = ad_details.get('location')
        if not isinstance(loc, dict):
            loc = {}
        gov = loc.get('governorate')
        delg = loc.get('delegation')
        item_info['city'] = (gov.strip() if isinstance(gov, str) else '') or None
        item_info['region'] = (delg.strip() if isinstance(delg, str) else '') or None

        ad_params = ad_details.get('adParams')
        item_info['transaction'] = _normalize_transaction(_find_ad_param(ad_params, [
            'type de transaction', 'transaction', 'type'
        ])) or None
        item_info['surface'] = _int_or_none(_find_ad_param(ad_params, ['superficie', 'surface'])) or None
        item_info['bathrooms'] = _int_or_none(
            _find_ad_param(ad_params, ['salles de bains', 'salle de bain', 'sdb'])
        ) or None
        item_info['rooms'] = _int_or_none(
            _find_ad_param(ad_params, ['chambres', 'chambre', 'pièces', 'pieces'])
        ) or None

    # 2. Fallbacks from HTML.
    if item_info['title'] is None:
        h1 = soup.find('h1')
        item_info['title'] = h1.get_text(strip=True) if h1 else None
    if item_info['price'] is None:
        price_data = soup.find('data')
        if price_data and price_data.has_attr('value'):
            item_info['price'] = _int_or_none(price_data['value'])
        else:
            span_price = soup.find('span', class_='mr-1')
            item_info['price'] = _int_or_none(span_price.get_text()) if span_price else None
    if item_info['description'] is None:
        desc_h2 = soup.find('h2', string=lambda s: isinstance(s, str) and 'Description' in s)
        if desc_h2:
            desc_p = desc_h2.find_next('p')
            item_info['description'] = desc_p.get_text(" ", strip=True) if desc_p else None

    # 3. Criteria list.
    item_info.update(_parse_html_criteria(soup))

    # 4. Room count written in the title, e.g. "S+2".
    if item_info['rooms'] is None and item_info['title']:
        m = re.search(r"\bS\s*\+\s*(\d)\b", item_info['title'], re.IGNORECASE)
        if m:
            item_info['rooms'] = int(m.group(1))

    # 5. Location segments of the URL.
    if not item_info['city'] or not item_info['region']:
        city_url, region_url = _parse_city_region_from_url(url)
        item_info['city'] = item_info['city'] or city_url
        item_info['region'] = item_info['region'] or region_url

    return item_info

In [34]:
# Gather the property URLs of every index page. dict.fromkeys drops repeated
# URLs while keeping first-seen order, so a listing that shows up on several
# pages or in several categories is crawled and saved only once.
property_links = list(dict.fromkeys(
    page_link
    for page_soup in get_all_soup()
    for page_link in get_all_links(page_soup)
))

In [35]:
def save_to_csv(data, filename="immobiliers.csv"):
    """Write scraped property records to a UTF-8 CSV file.

    Args:
        data: Iterable of dict records; falsy entries are skipped and keys
            outside the fixed column list are ignored.
        filename: Destination path; the file is overwritten.

    Nothing is written when ``data`` is empty. Any error raised while
    writing is reported on stdout rather than propagated.
    """
    if not data:
        print("Aucune donnée à sauvegarder.")
        return

    # Output columns, in the order they appear in the file.
    columns = [
        'link', 'title', 'price', 'transaction', 'city', 'region',
        'description', 'surface', 'bathrooms', 'rooms',
    ]
    usable_rows = (record for record in data if record)

    try:
        with open(filename, mode='w', newline='', encoding='utf-8') as handle:
            out = csv.DictWriter(handle, fieldnames=columns, extrasaction='ignore')
            out.writeheader()
            out.writerows(usable_rows)
        print(f"Données sauvegardées dans {filename}")
    except Exception as e:
        print(f"Erreur lors de l'enregistrement : {e}")

In [36]:
# Scrape data
# Connection resets and timeouts are retried a few times with a growing pause
# before the listing is given up; any other error is reported once and the
# listing is skipped.
MAX_ATTEMPTS = 3
RETRY_DELAY_SECONDS = 2

data = []
for link in property_links:
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            property_data = crawl_property_page(link)
        except (requests.ConnectionError, requests.Timeout) as e:
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS * attempt)
                continue
            print(f"Error scraping {link}: {e}")
        except Exception as e:
            print(f"Error scraping {link}: {e}")
        else:
            if property_data:  # Only add if not None
                data.append(property_data)
        break

# Save to CSV
save_to_csv(data, "final_scrapped_immobiliers.csv")

Error scraping https://www.tayara.tn/item/terrains-et-fermes/ben-arous/borj-cedria/terrain-de-500-m-borj-cedria/675ac8a4b5e4ab395e78d5e2/: ('Connection aborted.', ConnectionResetError(10054, 'Une connexion existante a dû être fermée par l’hôte distant', None, 10054, None))
Error scraping https://www.tayara.tn/item/terrains-et-fermes/sfax/route-el-afrane/-vendre-terrain-sfax/694d2c2ccd04d08be750d17a/: HTTPSConnectionPool(host='www.tayara.tn', port=443): Read timed out. (read timeout=20)
Error scraping https://www.tayara.tn/item/appartements/tunis/lac-2/a-louer-un-appartement-en-s3-au-lac-2/6945237f8914daae519deb77/: HTTPSConnectionPool(host='www.tayara.tn', port=443): Read timed out. (read timeout=20)
Données sauvegardées dans final_scrapped_immobiliers.csv


In [37]:
# Optional smoke test: crawl a single listing and print the extracted fields.
TEST_URL = "https://www.tayara.tn/item/appartements/ben-arous/ezzahra/duplex-s2-meuble-a-boukornine/694e8cd48914daae51a1dc1d/"
res = crawl_property_page(TEST_URL)
print({
    field: res.get(field)
    for field in (
        "title", "price", "transaction", "city", "region",
        "description", "surface", "bathrooms", "rooms", "link",
    )
})

{'title': 'DUPLEX S2 MEUBLE A BOUKORNINE', 'price': 700, 'transaction': 'Location', 'city': 'Ben Arous', 'region': 'Ezzahra', 'description': 'nous mettons pour la location un duplex s+2 meublé pour longue durée situé à proximité les commodités à boukornine.\n\n700d/mois\n\n☎:      98 176 369', 'surface': 100, 'bathrooms': 1, 'rooms': 3, 'link': 'https://www.tayara.tn/item/appartements/ben-arous/ezzahra/duplex-s2-meuble-a-boukornine/694e8cd48914daae51a1dc1d/'}
