# Web Scraping using Beautiful Soup

Objective: Scrape apartment-for-sale listings from casa.sapo.pt using Beautiful Soup and collect them into a pandas DataFrame.  
Ref: https://towardsdatascience.com/looking-for-a-house-build-a-web-scraper-to-help-you-5ab25badc83e

In [1]:
import itertools
from random import randint
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from requests import get
# Switch matplotlib to seaborn's default theme for any plots drawn later.
sns.set()

In [2]:
# HTTP headers sent with every request: a browser-like User-Agent string.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}

## Set Website and Test Communication

In [3]:
# Fetch the apartments-for-sale search page once to check that the site answers.
sapo = "https://casa.sapo.pt/Venda/Apartamentos/?sa=11&or=10"
# Without a timeout, requests waits indefinitely on a stalled connection and
# the kernel hangs; give up after 30 seconds instead.
response = get(sapo, headers=headers, timeout=30)
# Printing the Response object shows its status code (200 means success).
print(response)

<Response [200]>


In [4]:
# Preview the first 1000 characters of the returned HTML to confirm the page content.
print(response.text[:1000])



<!DOCTYPE html>

<html lang="pt-PT">
<head><title>
	Casas para Venda, Apartamentos Ofertas recentes no Distrito de Lisboa, CASA SAPO - Portal Nacional de Imobiliário
</title><meta name="author" content="CASA SAPO - Portal Nacional de Imobiliário - Janela Digital SA" />
    <meta name="application-name" content="CASA SAPO - Portal Nacional de Imobiliário" data-copyright="Janela Digital SA" data-generated-time="11/03/2019 01:06" />
    
<meta name="content-language" content="pt-PT" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="description" content="Casas para Venda, Apartamentos Ofertas recentes no Distrito de Lisboa, Casas para Venda, Deseja comprar casa? No maior Portal Imobiliário Nacional temos milhares de apartamentos e moradias em Lisboa, no Porto e por todo o país." />
<meta name="keywords" content="Casas para Venda, Apartamentos Ofertas recentes no Distrito de Lisboa, Casas para Venda, venda, compra, comprar, casas, imóveis

## Create Container

In [5]:
# Parse the fetched HTML, then collect every <div class="searchResultProperty">;
# each such div is treated as one listing.
html_soup = BeautifulSoup(response.text, 'html.parser')
house_containers = html_soup.find_all('div', class_="searchResultProperty")
n_listings = len(house_containers)
print("Number of items in the container :", n_listings)

Number of items in the container : 25


## Data / Variable Extraction

In [6]:
# Take the first listing and display all of its <span> tags to see which one
# holds which field.
first = house_containers[0]
first.find_all('span')

[<span>
                         Apartamento T4, Oeiras e São Julião da Barra, Paço de Arcos e Caxias, Oeiras
                     </span>,
 <span class="btnContactPVPI" id="MC_PropertyInList_repProperties_btnContactPVPINormal_0" onclick="ShowContactForm('a2297877-4394-11e9-b712-060000000057', '13', '5', true); return false;" style="z-index: 9999;">Contacte Anunciante</span>,
 <span>1 800 000 <strong title="Euro">€</strong></span>]

In [8]:
# For this listing the third <span> (index 2) is the one carrying the price.
first.find_all('span')[2]

<span>1 800 000 <strong title="Euro">€</strong></span>

In [9]:
# Price: take the text of the third <span>, drop non-breaking spaces and the
# euro sign, then convert the leading run of digits to an int.
price_span = first.find_all('span')[2]
var_x = price_span.text.replace('\xa0', '').replace('€', '').strip()
leading_digits = itertools.takewhile(str.isdigit, var_x)
var_1 = int(''.join(leading_digits))
print(var_1)
print(type(var_1))

1800000
<class 'int'>


In [10]:
# Location: text of the first <p class="searchPropertyLocation">, sliced from
# character 7 up to the first comma and stripped of surrounding whitespace.
# NOTE(review): the fixed offset 7 presumably skips a constant prefix in the
# paragraph text; confirm against the page markup.
location_tag = first.find_all('p', class_="searchPropertyLocation")[0]
var_y = location_tag.text
first_comma = var_y.find(',')
var_2 = var_y[7:first_comma].strip()
print(var_2)

Oeiras e São Julião da Barra


In [11]:
# Area size: for this listing the eighth <p> (index 7) holds the area text.
paragraphs = first.find_all('p')
var_3 = paragraphs[7].text
print(var_3)

251m²


## Declare Lists for Building the DataFrame

In [12]:
# One list per DataFrame column; the scraping loop appends one entry per listing.
# Fields not collected yet: created, descriptions, urls, thumbnails.
titles, prices, areas, zone, condition = [], [], [], [], []

## Scrape Data by Looping Over Result Pages

In [13]:
%%time

n_pages = 0

for page in range(0,900):
    n_pages += 1
    sapo_url = 'https://casa.sapo.pt/Venda/Apartamentos/?sa=11&lp=10000&or=10'+'&pn='+str(page)
    r = get(sapo_url, headers=headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    house_containers = page_html.find_all('div', class_="searchResultProperty")
    if house_containers != []:
        for container in house_containers:
            
            # Price            
            price = container.find_all('span')[2].text
            if price == 'Contacte Anunciante':
                price = container.find_all('span')[3].text
                if price.find('/') != -1:
                    price = price[0:price.find('/')-1]
            if price.find('/') != -1:
                price = price[0:price.find('/')-1]
            
            price_ = [int(price[s]) for s in range(0,len(price)) if price[s].isdigit()]
            price = ''
            for x in price_:
                price = price+str(x)
            prices.append(int(price))

            # Zone
            location = container.find_all('p', class_="searchPropertyLocation")[0].text
            location = location[7:location.find(',')]
            zone.append(location)

            # Title
            name = container.find_all('span')[0].text
            titles.append(name)

            # Status
            status = container.find_all('p')[5].text
            condition.append(status)

            # Area
            m2 = container.find_all('p')[9].text
            if m2 != '-':
                m2 = m2.replace('\xa0','')
                m2 = float("".join(itertools.takewhile(str.isdigit, m2)))
                areas.append(m2)
                
            else:
                m2 = container.find_all('p')[7].text
                if m2 != '-':
                    m2 = m2.replace('\xa0','')
                    m2 = float("".join(itertools.takewhile(str.isdigit, m2)))
                    areas.append(m2)
                else:
                    areas.append(m2)

        else:
            break
        sleep(ranint(1,2))

print('At least {} pages scrapped, containing {} properties.' .format(n_pages, len(titles)))

At least 1 pages scrapped, containing 25 properties.
Wall time: 2.32 s


In [15]:
# Assemble the scraped lists into a DataFrame, pairing each column label with
# its list: v1=title, v2=price, v3=area, v4=zone, v5=condition.
cols = ['v1','v2','v3','v4','v5']
column_values = [titles, prices, areas, zone, condition]
df = pd.DataFrame(dict(zip(cols, column_values)))
df

Unnamed: 0,v1,v2,v3,v4,v5
0,"\r\n Apartamento T4, Oe...",1800000,590.0,Oeiras e São Julião da Barra,Novo
1,"\r\n Apartamento T4, Pá...",562500,123.0,Páteo Bagatela (São Mamede),Usado
2,"\r\n Apartamento T2, Oe...",700000,244.0,Oeiras e São Julião da Barra,Novo
3,"\r\n Apartamento T3, Oe...",885000,185.0,Oeiras e São Julião da Barra,Novo
4,"\r\n Apartamento T3, Oe...",800000,177.0,Oeiras e São Julião da Barra,Novo
5,"\r\n Apartamento T2, Oe...",660000,136.0,Oeiras e São Julião da Barra,Novo
6,"\r\n Apartamento T3, Oe...",835000,185.0,Oeiras e São Julião da Barra,Novo
7,"\r\n Apartamento T4, Oe...",1090000,250.0,Oeiras e São Julião da Barra,Novo
8,"\r\n Apartamento T4, Oe...",1080000,250.0,Oeiras e São Julião da Barra,Novo
9,"\r\n Apartamento T3, Ja...",325000,169.0,Jardim da Amoreira (Ramada),Usado
