# Web Scraping using Beautiful Soup

Objective: Scrape apartment listings from CASA SAPO with Beautiful Soup and collect them in a pandas DataFrame.  
Ref: https://towardsdatascience.com/looking-for-a-house-build-a-web-scraper-to-help-you-5ab25badc83e

In [1]:
import itertools
from random import randint
from time import sleep

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from bs4 import BeautifulSoup
from requests import get

sns.set()

In [2]:
# Browser-like User-Agent header passed along with every request made below.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
}

## Set Website and Test Communication

In [3]:
# English-language search page listing the most recent apartments for sale.
# NOTE(review): `sa=11` / `lp=10000` presumably select the Lisbon district and a
# 10,000 € minimum price (the page title below says so) - confirm against the site.
sapo = "https://casa.sapo.pt/en_GB/For-sale/Apartments/Most-recent/?sa=11&lp=10000"
# Without a timeout the call can block forever if the server never answers.
response = get(sapo, headers=headers, timeout=30)
# Printing the response object shows the HTTP status code.
print(response)

<Response [200]>


In [4]:
# Show the first 1000 characters of the returned HTML as a quick sanity check.
print(response.text[0:1000])



<!DOCTYPE html>

<html lang="en-GB">
<head><title>
	Houses for sale, Apartments Most recent, from 10,000 € in Distrito de Lisboa, CASA SAPO - Portugal´s Real Estate Portal
</title><meta name="author" content="CASA SAPO - Portugal´s Real Estate Portal - Janela Digital SA" />
    <meta name="application-name" content="CASA SAPO - Portugal´s Real Estate Portal" data-copyright="Janela Digital SA" data-generated-time="10/03/2019 08:57" />
    
<meta name="content-language" content="en-GB" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<meta name="description" content="Houses for sale, Apartments Most recent, from 10,000 € in Distrito de Lisboa, Houses for sale, Do you wish to buy a house In the leading National Real Estate Portal we have thousands of apartments and detached houses in Lisbon, Oporti and all over the country." />
<meta name="keywords" content="Houses for sale, Apartments Most recent, from 10,000 € in Distrito de Lisboa, Houses for s

## Create Container

In [5]:
# Parse the downloaded page with Python's built-in HTML parser.
html_soup = BeautifulSoup(markup=response.text, features='html.parser')
# Select every <div> that carries the "searchResultProperty" class.
house_containers = html_soup.find_all(name='div', attrs={'class': "searchResultProperty"})
print("Number of items in the container :", len(house_containers))

Number of items in the container : 25


## Data / Variable Extraction

In [6]:
# Take the first result as a sample to work out where each field lives.
first = house_containers[0]
# All <span> elements found inside that result.
first.find_all(name='span')

[<span>
                         Apartment 3 Bedrooms, Alfragide, Amadora
                     </span>,
 <span class="btnContactPVPI" id="MC_PropertyInList_repProperties_btnContactPVPINormal_0" onclick="ShowContactForm('9a5c218b-42ea-11e9-ba04-060000000051', '13', '5', false); return false;" style="z-index: 9999;">Contact Promoter</span>,
 <span>225,000 <strong title="Euro">€</strong></span>]

In [7]:
# All <p> elements of the sample result; the cells below pick fields by position in this list.
first.find_all(name='p')

[<p class="searchPropertyTitle">
 <span>
                         Apartment 3 Bedrooms, Alfragide, Amadora
                     </span>
 </p>, <p class="searchPropertyLocation">
                     Alfragide, Amadora
                 </p>, <p>
                                 For sale
                             </p>, <p>
 <span>225,000 <strong title="Euro">€</strong></span>
 </p>, <p>Condition</p>, <p>Used</p>, <p>Net Area</p>, <p>85m²</p>, <p>Floor Area</p>, <p>128m²</p>, <p>Construction Area</p>, <p>-</p>, <p class="searchPropertyDescription">
 </p>]

In [8]:
# Property title: text of the first <span> in the sample result.
# Uses `find_all` rather than the legacy `findAll` alias, matching the other cells.
var1 = first.find_all('span')[0].text.strip()
print(var1)

Apartment 3 Bedrooms, Alfragide, Amadora


In [9]:
# Location: second <p> (class "searchPropertyLocation"), surrounding whitespace removed.
var2 = first.find_all(name='p')[1].get_text().strip()
print(var2)

Alfragide, Amadora


In [10]:
# Listing type (e.g. "For sale"): third <p>, surrounding whitespace removed.
var3 = first.find_all(name='p')[2].get_text().strip()
print(var3)

For sale


In [11]:
# Price: fourth <p>, e.g. "225,000 €". Remove the comma separator and the euro
# sign, then keep only the leading run of digits and convert it to an integer.
varx = (
    first.find_all(name='p')[3]
    .get_text()
    .replace(',', '')
    .replace('€', '')
    .strip()
)
var4 = int(''.join(itertools.takewhile(str.isdigit, varx)))
print(var4)
print(type(var4))

225000
<class 'int'>


In [12]:
# Condition: the <p> at index 5 holds the value; index 4 is its "Condition" label.
var5 = first.find_all(name='p')[5].get_text().strip()
print(var5)

Used


In [13]:
# Net area: the <p> at index 7 holds the value; index 6 is its "Net Area" label.
var6 = first.find_all(name='p')[7].get_text().strip()
print(var6)

85m²


In [14]:
# Floor area: the <p> at index 9 holds the value; index 8 is its "Floor Area" label.
var7 = first.find_all(name='p')[9].get_text().strip()
print(var7)

128m²


In [15]:
# Construction area: the <p> at index 11 holds the value ("-" when not given);
# index 10 is its "Construction Area" label.
var8 = first.find_all(name='p')[11].get_text().strip()
print(var8)

-


## Declare Lists for the DataFrame Columns

In [16]:
# One empty list per scraped field; the scraping loop appends one value per listing to each.
titles, locations, ptypes, prices = [], [], [], []
conditions, net_areas, floor_areas, construct_areas = [], [], [], []

## Scrape Data by Looping over Result Pages

In [17]:
%%time
n_pages = 0

for page in range(0,100):
    n_pages +=1
    sapo_url = 'https://casa.sapo.pt/Venda/Apartamentos/?sa=11&lp=10000&or=10'+'&pn='+str(page)
    #sapo_url = 'https://casa.sapo.pt/en_GB/For-sale/Apartments/Most-recent/?sa=11&lp=10000&pn='+str(page)
    r = get(sapo_url, headers = headers)
    page_html = BeautifulSoup(r.text, 'html.parser')
    containers = page_html.find_all('div', class_="searchResultProperty")
    
    if containers != []:
        for container in containers:
            
            #titles = []
            title = container.findAll("span")[0].text.strip()
            titles.append(title)
            
            #locations = []
            location = container.find_all('p')[1].text.strip()
            locations.append(location)
            
            #ptypes = []
            ptype = container.find_all('p')[2].text.strip()
            ptypes.append(ptype)
            
            #prices = []
            varx = container.find_all('p')[3].text.replace(',','').replace('€','').strip()
            price = int(''.join(itertools.takewhile(str.isdigit, varx)))
            prices.append(price)
            
            #conditions = []
            condition = container.find_all('p')[5].text.strip()
            conditions.append(condition)
            
            #net_areas = []
            net_area = container.find_all('p')[7].text.strip()
            net_areas.append(net_area)
            
            #floor_areas = []
            floor_area = container.find_all('p')[9].text.strip()
            floor_areas.append(floor_area)
            
            #construct_areas = []
            construct_area = container.find_all('p')[11].text.strip()
            construct_areas.append(construct_area)
        else:
            break
        sleep(ranint(1,2))

print('At least {} pages scrapped, containing {} properties.' .format(n_pages, len(titles)))

At least 1 pages scrapped, containing 25 properties.
Wall time: 1.79 s


In [18]:
# Column labels, in the same order as the scraped lists they are paired with below:
# title, location, type, price, condition, net area, floor area, construction area.
cols = ['v1','v2','v3','v4','v5','v6','v7','v8']
df = pd.DataFrame(dict(zip(cols, [titles, locations, ptypes, prices,
                                  conditions, net_areas, floor_areas, construct_areas])))
df

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8
0,,"Alfragide, Amadora",Venda,225,Usado,85m²,128m²,-
1,"Estúdio, Santa Maria Maior, Lisboa","Santa Maria Maior, Lisboa",Venda,150,Usado,15m²,16m²,-
2,"Apartamento T4, Conquinha (São Pedro e Santiag...","Conquinha (São Pedro e Santiago), S.P., Santia...",Venda,269,Recuperado,170m²,185m²,-
3,"Apartamento T2, Quinta das Flores (Santo Antón...",Quinta das Flores (Santo António de Cavaleiros...,Venda,160,Usado,80m²,86m²,-
4,"Apartamento T2, Estrela (Lapa), Estrela, Lisboa","Estrela (Lapa), Estrela, Lisboa",Venda,495,Recuperado,115m²,-,-
5,"Apartamento T2, Centro (Bobadela), Santa Iria ...","Centro (Bobadela), Santa Iria de Azoia, São Jo...",Venda,239,Renovado,-,95m²,-
6,"Apartamento T2, Moinhos da Funcheira (São Brás...","Moinhos da Funcheira (São Brás), Mina de Água,...",Venda,120,Usado,78m²,85m²,-
7,"Apartamento T2, Ericeira, Mafra","Ericeira, Mafra",Venda,190,Usado,-,-,-
8,"Apartamento T2, Pontinha e Famões, Odivelas","Pontinha e Famões, Odivelas",Venda,149,Usado,63m²,63m²,63m²
9,"Apartamento T1, Santa Maria Maior, Lisboa","Santa Maria Maior, Lisboa",Venda,285,Usado,40m²,40m²,40m²
