In [499]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import re

# Scraping a property page
This is a discovery notebook. It contains a script that scrapes a property page from the website seloger.com.\
From a page we can extract several features of the flat:

- Title
- URL (used as a primary key)
- nom du quartier
- arrondissement/ville
- loyer (all inclusive)
- taxes and fees
- caution
- evolution des prix (si disponible)
- nb de pieces
- nb de m2
- "L'avis du professionnel"
- Description de l'appart (general, interieur et autres)
- Diagnostic de performance energetique (note et kWhEP/m².an)

In [500]:
# Listing URL under study; the query string is the search context the listing
# was opened from.
url = (
    "https://www.seloger.com/annonces/locations/appartement/paris-11eme-75/"
    "nation-alexandre-dumas/155711407.htm"
    "?projects=1&types=1"
    "&places=[{ci:750105}|{ci:750111}|{ci:940067}|{ci:940080}"
    "|{idq:133102}|{idq:133103}|{idq:133104}|{idq:133105}|{idq:133106}"
    "|{idq:133107}|{idq:133108}|{idq:133109}|{idq:133110}|{idq:133111}"
    "|{idq:133112}|{idq:133113}|{idq:133114}|{idq:133115}|{idq:133764}]"
    "&surface=20/NaN&rooms=1&sort=d_dt_crea&picture=15&qsVersion=1.0&bd=ListToDetail"
)

In [501]:
# Browser-like request headers: without them the site answers with a captcha
# page instead of the listing.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36",
    "referer": "https://www.google.com/",
}
# The dict used to be bound to `header` while the request passed `headers`,
# which raised NameError on a fresh kernel; keep the old name as an alias.
header = headers

# A timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, timeout=30)
# Fail fast on HTTP errors rather than parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

#### Comment:
Sending browser-like headers avoids being detected as a bot when querying the webpage. When this website is queried without headers, the response contains a captcha.

In [502]:
# Print only a preview of <main>: dumping the whole prettified section floods
# the notebook output. Raise the limit to inspect more of the page.
MAIN_PREVIEW_CHARS = 3000
print(soup.main.prettify()[:MAIN_PREVIEW_CHARS])

<main>
 <div class="SubHeaderstyled__SubHeaderWrapper-sc-1s8qndx-0 htLyZj">
  <a class="SubHeaderstyled__Link-sc-1s8qndx-3 SubHeaderstyled__ReturnToResultsLink-sc-1s8qndx-4 iwDGaU" href="https://www.seloger.com/list.htm?projects=1&amp;types=1&amp;places=[%7Bci:750105%7D%7C%7Bci:750111%7D%7C%7Bci:940067%7D%7C%7Bci:940080%7D%7C%7Bidq:133102%7D%7C%7Bidq:133103%7D%7C%7Bidq:133104%7D%7C%7Bidq:133105%7D%7C%7Bidq:133106%7D%7C%7Bidq:133107%7D%7C%7Bidq:133108%7D%7C%7Bidq:133109%7D%7C%7Bidq:133110%7D%7C%7Bidq:133111%7D%7C%7Bidq:133112%7D%7C%7Bidq:133113%7D%7C%7Bidq:133114%7D%7C%7Bidq:133115%7D%7C%7Bidq:133764%7D]&amp;surface=20/NaN&amp;rooms=1&amp;sort=d_dt_crea&amp;picture=15&amp;qsVersion=1.0&amp;bd=ListToDetail">
   <div class="Icon__IconElement-sc-1xnn3zd-0 jGTLkV" data-test="sl-ui.icon">
    <svg viewbox="0 0 512 512">
     <path d="M483.8 241.7L67.3 239l85.4-93.4c6.4-7 5.9-18-1.1-24.4-7-6.4-18-5.9-24.4 1.1l-111.7 122c-2.9 3.2-4.5 7.3-4.5 11.5 0 .2.1.3.1.5 0 2 .4 4 1.1 5.9.4.8.9 1.6 1.4 2.4

# Navigate the soup
Most of the key information is in the **main** section of the HTML

## Title

In [503]:
# The <title> text summarises the listing (type, city, rooms, surface, rent).
soup.title.string

'Location Studio Paris 11ème - Appartement F1/T1/1 pièce 21,37m² 1015€/mois - SeLoger'

## URL

In [504]:
# The "description" <meta> tag carries the listing text; the first <link> that
# follows it in the document holds the listing URL without the query string.
for val in soup.find_all('meta'):
    if val.get('name') == 'description':
        print(val.get('content'))
        # find_next() walks the document in order, so it finds the <link>
        # whether or not the parser nested it inside the unclosed <meta> tag.
        # `val.link` only works in the nested case and is None otherwise.
        link_tag = val.find_next('link')
        print(link_tag.get('href') if link_tag is not None else None)

- Studio refait à neuf de 21.37 m² 
- 303 rue du Faubourg Saint-Antoine
- 5ème étage avec ascenseur
- Balcon exposé Sud
- Rangements
- Parquet
- Four combiné
- Cuisine intégrée
- Plaque vitrocéramique
- Réfrigérateur / congélateur
-
https://www.seloger.com/annonces/locations/appartement/paris-11eme-75/nation-alexandre-dumas/155711407.htm


## Nom du quartier, arrondissement, ville
### Option 1 -  Use summary block

In [505]:
# `main_tag` (the main block of the page) is needed here but was only created
# in a later cell; define it first so the notebook runs top-to-bottom on a
# fresh kernel.
main_tag = soup.find_all('div', class_='app__CWrapMain-aroj7e-3 FfHMD')[0]
summary_block = main_tag.find_all('div', class_='Summarystyled__MainWrapper-tzuaot-1 erajXw')[0]

In [506]:
# The address line of the summary holds the neighbourhood and the
# arrondissement/city in a single <div>.
summary_block.find_all('div', class_='Summarystyled__Address-tzuaot-5 fLUFm')

[<div class="Summarystyled__Address-tzuaot-5 fLUFm">Quartier <!-- -->Nation-Alexandre Dumas<!-- -->, <!-- -->Paris 11ème</div>]

### Option 2 - Use main block

In [507]:
# First <div> carrying the main-wrapper class; the later lookups start from it.
main_tag = (
    soup
    .find_all('div', class_='app__CWrapMain-aroj7e-3 FfHMD')
)[0]

In [508]:
# Sanity check: the lookup returned a bs4 Tag.
type(main_tag)

bs4.element.Tag

In [509]:
# Address sentence ("L’appartement est situé …") naming the city, postcode and
# neighbourhood.
quartier_block = main_tag.find('p', class_="Map__AddressLine-sc-6i077b-2 eCDRsF")

In [510]:
# List the child nodes of the sentence; the `<!-- -->` comment nodes of the
# page show up as ' ' entries.
quartier_block.contents

['L’appartement est situé',
 ' ',
 ' ',
 <strong>à <!-- -->Paris<!-- --> (<!-- -->75011<!-- -->)</strong>,
 ', dans le quartier ',
 ' ',
 'Nation-Alexandre Dumas',
 ' ',
 '.']

In [511]:
# The <strong> child of the sentence holds the city and the postcode.
ville = quartier_block.strong

In [512]:
# Show the raw pieces (text and comment placeholders) of the city fragment.
print(ville.contents)

['à\xa0', ' ', 'Paris', ' ', ' (', ' ', '75011', ' ', ')']


In [513]:
# Skip the leading 'à' and its placeholder, then join the rest. The `<!-- -->`
# comment nodes are ' ' entries, hence the extra spaces in the result.
''.join(ville.contents[2:])

'Paris  ( 75011 )'

In [514]:
# The neighbourhood name is the third node from the end: the last two nodes are
# a comment placeholder and the final '.'.
quartier_block.contents[-3]

'Nation-Alexandre Dumas'

## Rent price

In [515]:
# Container of the price section, matched on both its id and its class.
price_block = main_tag.find_all(
    'div',
    id='a-propos-de-ce-prix',
    class_='Pricestyled__Container-uc7t2j-1 djoCgs',
)[0]

In [516]:
# Sanity check: the lookup returned a bs4 Tag.
type(price_block)

bs4.element.Tag

In [517]:
# The rent is the first child of the first nested <div> of the price element.
current_price = (
    price_block
    .find_all('div', class_='Pricestyled__Price-uc7t2j-0 jWsTKU')[0]
    .find('div')
    .contents[0]
)

In [518]:
# Display the extracted rent (still a raw string with spaces and the € sign).
current_price

'1 015 €'

## Taxes and fees

In [519]:
# Panel of the price section that details the charges.
taxes_and_fees_block = (
    price_block
    .find_all('div', class_='Pricestyled__Panel-uc7t2j-4 OLjZF')
)[0]

In [520]:
# Guard: the panel must be the "charges included" one before its rows are read.
charges_label = taxes_and_fees_block.strong.contents[0]
assert charges_label == 'Charges comprises'

In [521]:
# Each nested <div> is one charge line: a label followed by the amount in
# <strong>.
taxes_and_fees_block.find_all('div')

[<div>- Complément de loyer :<!-- --> <strong>268 €</strong></div>,
 <div>- Provisions pour charges avec regularisation annuelle :<!-- --> <strong>45 €</strong></div>]

In [522]:
# TODO: wrap the price-fluctuation lookup in try/except (price history is only
# shown when available).

In [523]:
# Price-history container inside the price section; indexing [0] raises
# IndexError when the listing has none.
price_fluctuations_block = (
    price_block
    .find_all('div', class_='PriceHistorystyled__Container-sc-18jhpbr-0 iSZUrX')
)[0]

In [524]:
# Text of the first bold amount shown in the price-history block.
price_fluctuations_block.find_all(
    'span',
    class_='PriceHistorystyled__BoldDisplayAmount-sc-18jhpbr-4 kdmxHw global-styles__TextNoWrap-sc-1aeotog-6 dVzJN',
)[0].contents[0]

'45 €'

In [525]:
# Deposit ("garantie") element of the price section.
deposit_block = (
    price_block
    .find_all('div', class_='rentHelper__Garantie-sc-1x3dozo-0 AJvvM')
)[0]

In [526]:
# The deposit amount is the text of the block's <strong> tag.
deposit_block.strong.contents[0]

'970 €'

## Dimensions

In [527]:
# Icon tags of the summary (rooms, surface, ...): take the tags wrapper first,
# then collect every tag container inside it.
tags_wrapper = summary_block.find_all('div', class_='Summarystyled__TagsWrapper-tzuaot-18 cgUaLi')[0]
dimensions_block = tags_wrapper.find_all('div', class_='TagsWithIcon__TagContainer-j1x9om-1 eiaFim')

In [528]:
# Inspect the tag containers: each one is an icon <div> followed by a text
# <div>.
dimensions_block

[<div class="TagsWithIcon__TagContainer-j1x9om-1 eiaFim"><div class="TagsWithIcon__CurrentIcon-j1x9om-0 jGcqmv Icon__IconElement-sc-1xnn3zd-0 jbPwtO" data-test="sl-ui.icon"><svg viewbox="0 0 512 512"><g><path d="M368.6 27.6H132.1c-1.1-.1-2.2-.2-3.2-.2-13.3 0-24.1 10.8-24.1 24.1v335.3c0 9.4 5.4 17.9 13.9 21.9l149.9 69.8c12.1 5.6 26.4.4 32-11.7 1.5-3.2 2.2-6.6 2.2-10.1v-38.4h65.8c19.9 0 36.1-16.2 36.1-36.1V63.7c.1-19.9-16.1-36.1-36.1-36.1zm-89.8 429l-149.9-69.9V51.5l149.9 75v330.1zM380.7 382c0 6.6-5.4 12-12 12h-65.8V126.4c0-9.1-5.1-17.5-13.3-21.5L183.2 51.7h185.5c6.6 0 12 5.4 12 12V382z"></path><path d="M255.4 311.9c6.6 0 12-5.4 12.1-12v-48.2c0-6.7-5.4-12-12-12s-12 5.4-12 12v48.2c-.2 6.6 5.2 12 11.9 12z"></path></g></svg></div><div>1 pièce</div></div>,
 <div class="TagsWithIcon__TagContainer-j1x9om-1 eiaFim"><div class="TagsWithIcon__CurrentIcon-j1x9om-0 jGcqmv Icon__IconElement-sc-1xnn3zd-0 jbPwtO" data-test="sl-ui.icon"><svg viewbox="0 0 512 512"><path d="M382 493.6H44.6c-7.2 0-13-5.8-

In [529]:
# The first tag is the number of rooms.
rooms = dimensions_block[0]

In [530]:
# The last nested <div> holds the label text; the first one is the icon.
rooms.find_all('div')[-1].contents

['1 pièce']

In [531]:
# The second tag is the surface area.
size = dimensions_block[1]

In [532]:
# The last nested <div> holds the label text; the first one is the icon.
size.find_all('div')[-1].contents

['21,37m²']

## L'avis du professionnel

In [533]:
# All titled-description content blocks of the page, in document order; the
# later cells pick them by position.
description_block = main_tag.find_all(
    'div',
    class_='TitledDescription__TitledDescriptionContent-sc-1r4hqf5-1 koqVoo',
)

In [534]:
# The first titled-description block is the agent's free-text description.
avis_du_pro = description_block[0]

In [535]:
# Inspect the raw block: the text is split into <p> tags, followed by a contact
# button.
avis_du_pro

<div class="TitledDescription__TitledDescriptionContent-sc-1r4hqf5-1 koqVoo"><div class=""><div class="ShowMoreText__UITextContainer-sc-5ggbbc-0 hCeOyd"><p>- Studio refait à neuf de 21.37 m² 
</p><p>- 303 rue du Faubourg Saint-Antoine
</p><p>- 5ème étage avec ascenseur
</p><p>- Balcon exposé Sud
</p><p>- Rangements
</p><p>- Parquet
</p><p>- Four combiné
</p><p>- Cuisine intégrée
</p><p>- Plaque vitrocéramique
</p><p>- Réfrigérateur / congélateur
</p><p>- Hotte
</p><p>- Four
</p><p>- Lave-linge séchant
</p><p>- Salle de douche
</p><p>- Métro Nation 200 m
</p><p>- Loyer de base: 702 euros 
</p><p>- Complément de loyer: 268 euros 
</p><p>- Charges: 45 euros
</p><p>- Aucun frais d'agence.</p><button class="ContactLink__StyledButton-sc-1ex39dt-0 jPagdq">Demander plus d'informations à l'agence<div class="ContactLink__Chevron-sc-1ex39dt-1 gYbUwK Icon__IconElement-sc-1xnn3zd-0 fFMQXr" color="#e00034" data-test="contact-link-chevron" style="margin-left:5px"><svg viewbox="0 0 512 512"><path d="M

In [536]:
# Keep only the <p> tags, one per line of the description.
avis_du_pro.find_all('p')

[<p>- Studio refait à neuf de 21.37 m² 
 </p>, <p>- 303 rue du Faubourg Saint-Antoine
 </p>, <p>- 5ème étage avec ascenseur
 </p>, <p>- Balcon exposé Sud
 </p>, <p>- Rangements
 </p>, <p>- Parquet
 </p>, <p>- Four combiné
 </p>, <p>- Cuisine intégrée
 </p>, <p>- Plaque vitrocéramique
 </p>, <p>- Réfrigérateur / congélateur
 </p>, <p>- Hotte
 </p>, <p>- Four
 </p>, <p>- Lave-linge séchant
 </p>, <p>- Salle de douche
 </p>, <p>- Métro Nation 200 m
 </p>, <p>- Loyer de base: 702 euros 
 </p>, <p>- Complément de loyer: 268 euros 
 </p>, <p>- Charges: 45 euros
 </p>, <p>- Aucun frais d'agence.</p>]

## Amenities

In [537]:
# General characteristics: the <ul> inside the third titled-description block.
general = (
    description_block[2]
    .find_all('ul', class_='GeneralList__List-sc-9gtpjm-0 BAyYz')
)[0]

In [538]:
# Display the general list (surface, floors, rooms, construction year, ...).
general

<ul class="GeneralList__List-sc-9gtpjm-0 BAyYz"><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Surface de 21,37m²</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Bâtiment de 6 étages</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">1 Pièce</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Année de construction 1885</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Refait à neuf</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Au 5ème étage</li></ul>

In [539]:
# Interior characteristics: the <ul> inside the fourth titled-description block.
inside = (
    description_block[3]
    .find_all('ul', class_='GeneralList__List-sc-9gtpjm-0 BAyYz')
)[0]

In [540]:
# Display the interior list (bathroom, flooring, storage, heating, kitchen).
inside

<ul class="GeneralList__List-sc-9gtpjm-0 BAyYz"><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">1 Salle d'eau</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Parquet</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Rangements</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Chauffage électrique</li><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Cuisine équipée</li></ul>

In [541]:
# Other characteristics: the <ul> inside the fifth titled-description block.
other = (
    description_block[4]
    .find_all('ul', class_='GeneralList__List-sc-9gtpjm-0 BAyYz')
)[0]

In [542]:
# Display the remaining amenities list.
other

<ul class="GeneralList__List-sc-9gtpjm-0 BAyYz"><li class="GeneralList__Item-sc-9gtpjm-1 dsevYN">Digicode</li></ul>

## Diagnostic énergétique

In [543]:
# Diagnostics section, located by its id.
diagnostics_block = (
    main_tag
    .find_all('div', id='diagnostics')
)[0]

In [544]:
# Inspect the raw diagnostics section (title, then one preview bar of A, B,
# C, ... tiles per diagnostic).
diagnostics_block

<div class="global-styles__WhiteBox-sc-1aeotog-2 global-styles__DescriptionWrapper-sc-1aeotog-5 czLMZl" data-test="diagnostics-block" id="diagnostics"><div class="TitledDescription__TitledDescriptionContainer-sc-1r4hqf5-0 fHzMfE"><h3 class="typography__TitleH3-sc-111kv7i-1 jrMlIh">Les diagnostics énergétiques</h3><div class="TitledDescription__TitledDescriptionContent-sc-1r4hqf5-1 koqVoo"><div class="Diagnostics__DiagnosticsContainer-al64ti-2 Oitqq" data-test="diagnostics-content"><div class="Diagnostics__PreviewContainer-al64ti-0 dTGPCg"><p class="Diagnostics__PreviewTitle-al64ti-1 eJGktk" data-test="diagnostics-preview-title">Diagnostic de performance énergétique</p><div class="Preview__PreviewBar-sc-1pa12ii-0 jIpQSL" data-test="diagnostics-preview-bar"><div class="Preview__PreviewTile-sc-1pa12ii-1 gafcOY"><p>A</p></div><div class="Preview__PreviewTile-sc-1pa12ii-1 iGmEzL"><p>B</p></div><div class="Preview__PreviewTile-sc-1pa12ii-1 edGcMP"><p>C</p></div><div class="Preview__PreviewTi

In [493]:
# Exactly two diagnostic containers are expected; unpacking raises ValueError
# otherwise. The first is the energy-performance one, the second presumably the
# greenhouse-gas (GES) one (TODO confirm).
energy_diagnostic, ges_diagnostics = (
    diagnostics_block
    .find_all('div', class_='Diagnostics__DiagnosticsContainer-al64ti-2 Oitqq')
)

In [494]:
# Inspect the energy-performance container: the focused tile carries the rating
# letter and a tooltip with the value.
energy_diagnostic

<div class="Diagnostics__DiagnosticsContainer-al64ti-2 Oitqq" data-test="diagnostics-content"><div class="Diagnostics__PreviewContainer-al64ti-0 dTGPCg"><p class="Diagnostics__PreviewTitle-al64ti-1 eJGktk" data-test="diagnostics-preview-title">Diagnostic de performance énergétique</p><div class="Preview__PreviewBar-sc-1pa12ii-0 jIpQSL" data-test="diagnostics-preview-bar"><div class="Preview__PreviewTile-sc-1pa12ii-1 gafcOY"><p>A</p></div><div class="Preview__PreviewTile-sc-1pa12ii-1 iGmEzL"><p>B</p></div><div class="Preview__PreviewTile-sc-1pa12ii-1 edGcMP"><p>C</p></div><div class="Preview__PreviewTile-sc-1pa12ii-1 Preview__PreviewFocusedTile-sc-1pa12ii-2 hJiFNB"><p>D</p><div class="Preview__PreviewTooltip-sc-1pa12ii-3 hVDlNM"><span class="Preview__PreviewTooltipValue-sc-1pa12ii-4 eGDcBO">152</span><span class="Preview__PreviewTooltipCaption-sc-1pa12ii-5 cizigk">kWhEP/m².an</span></div></div><div class="Preview__PreviewTile-sc-1pa12ii-1 EmyuG"><p>E</p></div><div class="Preview__Previe

In [497]:
# The rating is the letter of the highlighted tile. Match on the 'FocusedTile'
# substring rather than the full class string; `x and ...` skips tags that have
# no class at all.
energy_diagnostic.find_all('div', class_=lambda x: x and 'FocusedTile' in x)[0].p.contents

['D']

In [498]:
# The tooltip of the focused tile holds the numeric value and its unit
# (kWhEP/m².an) in two <span> tags.
energy_diagnostic.find_all('div', class_='Preview__PreviewTooltip-sc-1pa12ii-3 hVDlNM')

[<div class="Preview__PreviewTooltip-sc-1pa12ii-3 hVDlNM"><span class="Preview__PreviewTooltipValue-sc-1pa12ii-4 eGDcBO">152</span><span class="Preview__PreviewTooltipCaption-sc-1pa12ii-5 cizigk">kWhEP/m².an</span></div>]