In [65]:
import re
import csv
from bs4 import BeautifulSoup
from requests.sessions import Session

In [166]:
class DuPropioCrawler():
    """Crawl the DuProprio listing search and export one CSV row per listing.

    The crawler walks the paginated search results at ``uri``, follows the
    link of every ``<li id="listing-N">`` entry, scrapes the detail page and
    writes the extracted fields with ``csv.DictWriter``.

    Attributes:
        uri: URL of the paginated search-results page.
        parser: Parser name handed to ``BeautifulSoup``. It is named
            explicitly so results do not depend on which optional parser
            happens to be installed.
        session: ``requests`` session carrying a browser-like User-Agent.
        timeout: Per-request timeout in seconds.
    """

    uri = "https://duproprio.com/fr/rechercher/liste"
    parser = "html.parser"

    # Matches the ``id`` attribute of listing entries on a result page.
    _LISTING_ID = re.compile(r'^listing-\d+$')

    # One pattern per description field. They are applied to the description
    # text with ALL whitespace removed, hence the missing spaces. Group 1 is
    # always the number to convert.
    _DESC_PATTERNS = {
        'nb_etage': re.compile(r'(\d+)étage'),
        'nb_chambre': re.compile(r'(\d+)chambre'),
        'nb_sdb': re.compile(r'(\d+)salles?debain'),
        # Accept both the typographic and the ASCII apostrophe.
        'nb_sde': re.compile(r"(\d+)salles?d[’']eau"),
        'aire_ha': re.compile(r'Airehabitable\(s-solexclu\)([\d,]+)pi²'),
        'aire_terr': re.compile(r'Tailleduterrain([\d,]+)pi²'),
    }

    def __init__(self, timeout=3):
        """Create the HTTP session.

        Args:
            timeout: Seconds to wait for each HTTP request (default 3).
        """
        self.session = Session()
        self.session.headers.update({'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"})
        self.timeout = timeout

    def crawl(self, start=1, end=627, output = './results.csv'):
        """Scrape result pages ``start`` .. ``end - 1`` into a CSV file.

        Args:
            start: First result page number to fetch.
            end: Page number to stop before (exclusive, as in ``range``).
            output: Path of the CSV file; it is overwritten.

        A result page that cannot be fetched or has no results container is
        skipped. A result page that was reached through an HTTP redirect
        stops the crawl (presumably the site redirects page numbers past the
        last page -- confirm against the site). Listings whose detail page
        cannot be fetched, or was redirected, are skipped.
        """
        fields = [
            'address', 'price', 'city', 'region', 'nb_etage', 'nb_chambre',
            'nb_sdb', 'nb_sde', 'aire_ha', 'aire_terr',
            'tx_municipales', 'tx_scolaires', 'electricite', 'tx_eau',
            'assurances', 'url',
        ]
        # newline='' is what the csv module requires to avoid blank rows on
        # some platforms; utf-8 keeps accented place names portable.
        with open(output, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fields)
            writer.writeheader()
            for page in range(start, end):
                soup = self.fetch(
                    self.uri,
                    {'search':'true', 'with_builders':1, 'parent':1, 'pageNumber':page, 'sort':'-published_at'}
                )

                if soup is None:
                    continue
                if soup is False:
                    break

                results_box = soup.find('div', {'class':'search-results-listings'})
                if results_box is None:
                    print("Page", page, "has no results, skipped")
                    continue

                for item in results_box.find_all('li', {'id': self._LISTING_ID}):
                    _id = item.attrs.get('id')
                    link = item.a
                    href = link.attrs.get('href') if link is not None else None
                    if not href:
                        continue

                    detail_page = self.fetch(href, {})
                    # fetch() yields None on failure and False on redirect;
                    # neither can be parsed.
                    if detail_page is None or detail_page is False:
                        continue
                    data = self.parse_detail(detail_page, href)
                    writer.writerow(data)
                    print("Item", _id.split('-')[-1], "saved")
                print("Page", page, "parsed ...")
                print('-------------------------------')
        print("FIN !!!")

    def fetch(self, url, params) -> "BeautifulSoup | bool | None":
        """GET ``url`` and return the parsed document.

        Returns:
            * a ``BeautifulSoup`` document when the page was served directly;
            * ``False`` when the response was reached through a redirect;
            * ``None`` when the request raised or returned an HTTP error
              status (the error is printed).
        """
        try:
            resp = self.session.get(
                url,
                params=params,
                timeout=self.timeout
            )
        except Exception as e:
            # Best effort: a single failed request must not abort the crawl.
            print(e)
            return None

        if resp.history:
            return False
        if not resp.ok:
            print("HTTP", resp.status_code, "for", url)
            return None
        return BeautifulSoup(resp.content, self.parser)

    def parse_detail(self, soup : "BeautifulSoup", href) -> dict:
        """Extract one CSV row from a listing detail page.

        Args:
            soup: Parsed detail page.
            href: URL of the page, stored as-is in the ``url`` field.

        Returns:
            A dict with every CSV field. Anything not found on the page is
            ``None``.
        """
        price = soup.find('div', {'class':'listing-price__amount'})
        if price is not None:
            price = self._clean_amount(price.get_text())

        addr, city, reg = self._split_location(
            soup.find('div', {'class':'listing-location__address'})
        )

        desc_box = soup.find('div', {'data-label':'#description'})
        # Drop every whitespace character (including non-breaking spaces) so
        # the description patterns can match across the page layout.
        chain = re.sub(r'\s+', '', desc_box.get_text()) if desc_box is not None else ''
        description = self.parse_desc(chain)

        taxes = self.get_taxes(soup)

        data = {
            'price':price,
            'address':addr,
            'city':city,
            'region':reg,
            'url':href
        }
        data.update(description)
        data["tx_municipales"] = taxes.get("Taxes municipales")
        data["tx_scolaires"] = taxes.get("Taxes scolaires")
        data["electricite"] = taxes.get("Électricité")
        data["tx_eau"] = taxes.get("Taxes d'eau")
        data["assurances"] = taxes.get("Assurances")

        return data

    def parse_desc(self, chain : str) -> dict:
        """Pull the numeric characteristics out of a whitespace-free description.

        Args:
            chain: Description text with all whitespace removed.

        Returns:
            Dict with keys ``nb_etage``, ``nb_chambre``, ``nb_sdb``,
            ``nb_sde``, ``aire_ha`` and ``aire_terr``; each value is an
            ``int`` or ``None`` when the characteristic is absent.
        """
        chain = chain or ''
        found = {}
        for field, pattern in self._DESC_PATTERNS.items():
            match = pattern.search(chain)
            found[field] = self._to_int(match.group(1)) if match else None
        return found

    def get_taxes(self, soup : "BeautifulSoup") -> dict:
        """Read the yearly-costs table of a detail page.

        Returns:
            Mapping of the row label (e.g. ``"Taxes municipales"``) to the
            yearly amount as a digit string. Empty when the page has no
            costs table.
        """
        taxes = {}
        table = soup.find(
            "div", {"class": "mortgage-data__table mortgage-data__table--costs"}
        )
        if table is None:
            return taxes

        for row in table.find_all("div", {"class": "mortgage-data__table__row"}):
            key = row.find(
                "div",
                {
                    "class": "mortgage-data__table__row__item mortgage-data__table__row__item--name"
                },
            )
            value = row.find(
                "div",
                {
                    "class": "mortgage-data__table__row__item mortgage-data__table__row__item--yearly-costs"
                },
            )
            if key is not None and value is not None:
                taxes[key.get_text().strip()] = self._clean_amount(
                    value.get_text(), drop_commas=True
                )
        return taxes

    @staticmethod
    def _clean_amount(text, drop_commas=False):
        """Strip spaces, non-breaking spaces and the trailing ``$`` from an amount."""
        if drop_commas:
            text = text.replace(',', '')
        return (
            text.replace(" ", "")
                .replace('\xa0', '')
                .strip()
                .removesuffix("$")
                .strip()
        )

    @staticmethod
    def _split_location(box):
        """Return ``(address, city, region)`` from the location element.

        The element's non-blank lines are taken in order. Missing trailing
        parts are ``None`` and lines beyond the third are ignored, so an
        unexpected layout yields a partial row instead of an exception.
        """
        if box is None:
            return None, None, None
        parts = [line.strip() for line in box.get_text().splitlines() if line.strip()]
        parts += [None] * (3 - len(parts))
        return tuple(parts[:3])

    @staticmethod
    def _to_int(raw):
        """Convert a captured number such as ``'1,200'`` to ``int``; ``None`` if no digits."""
        digits = raw.replace(',', '')
        return int(digits) if digits else None


In [106]:
# Fetch one known listing by hand so parse_detail can be tried on a single
# page before running a full crawl.
s = Session()
s.headers.update({'User-Agent':"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36"})
# Without a timeout a stalled connection would block the kernel indefinitely.
r = s.get('https://duproprio.com/fr/mauricie/mrc-de-mekinac/terrain-a-vendre/hab-st-joseph-de-mekinac-56734', timeout=3)
# Name the parser explicitly so the result does not depend on which optional
# parser is installed.
soup = BeautifulSoup(r.text, "html.parser")

In [167]:
# Instantiate the crawler; this creates its HTTP session with a browser-like
# User-Agent header.
crawler = DuPropioCrawler()

In [145]:
# Parse the sample listing fetched above; the second argument is only stored
# in the 'url' field of the returned record.
data = crawler.parse_detail(soup, "https://duproprio.com/fr/mauricie/mrc-de-mekinac/terrain-a-vendre/hab-st-joseph-de-mekinac-56734")

In [146]:
# Show the record extracted from the sample listing.
print(data)

{'price': '45000', 'address': 'St-Joseph de Mékinac', 'city': 'MRC de Mékinac', 'region': 'Mauricie', 'url': 'https://duproprio.com/fr/mauricie/mrc-de-mekinac/terrain-a-vendre/hab-st-joseph-de-mekinac-56734', 'nb_etage': None, 'nb_chambre': None, 'nb_sdb': None, 'nb_sde': None, 'aire_ha': None, 'aire_terr': None, 'tx_municipales': None, 'tx_scolaires': None, 'electricite': None, 'tx_eau': None, 'assurances': None}


In [169]:
# Smoke run: `end` is exclusive, so this scrapes result pages 1 to 3 and
# writes them to the default ./results.csv.
crawler.crawl(start=1, end=4)

Item 1026709 saved
Item 1025710 saved
Item 1026707 saved
Item 991533 saved
Item 1026749 saved
Item 1025964 saved
Item 980904 saved
Item 1021735 saved
Item 1026863 saved
Item 1026227 saved
Item 1026910 saved
Page 1 parsed ...
-------------------------------
Item 1026857 saved
Item 1026660 saved
Item 1011340 saved
Item 1026738 saved
Item 1027067 saved
Item 1026739 saved
Item 1026347 saved
Item 1026118 saved
Item 1020560 saved
Item 1024545 saved
Item 1009800 saved
Page 2 parsed ...
-------------------------------
Item 1024591 saved
Item 1022022 saved
Item 1026697 saved
Item 1026327 saved
Item 1026619 saved
Item 1026021 saved
Item 1021695 saved
Item 1026262 saved
Item 1026344 saved
Item 1026849 saved
Item 1016066 saved
Page 3 parsed ...
-------------------------------
FIN !!!


In [None]:
# Full run: overwrites ./results.csv. The loop stops before page 1000, or
# earlier as soon as a result page request is answered with a redirect.
crawler.crawl(start=1, end=1000)