# Web parsing with Python, Beautiful Soup and Selenium

### 1. Get any HTML

It's very easy to extract the source code of a web page in Python.

In [None]:
import requests

In [None]:
# a very lightweight website
url = 'https://lite.cnn.com/en'

In [None]:
# Let's render it here (I love Jupyter)
from IPython.display import IFrame
# height is passed as a plain pixel count ('250ps' is not a valid unit)
IFrame(src=url, width='100%', height=250)

In [None]:
answer = requests.get(url)

In [None]:
# what could we do with an answer
print(answer.url)
print(answer.status_code)
print(answer.reason)

In [None]:
print(answer.content)

**That looks like a lot of things. We have to somehow navigate through HTML**

### 2. Use BS

In [None]:
from bs4 import BeautifulSoup

In [None]:
soup = BeautifulSoup(answer.content, 'html.parser')

In [None]:
# now we can recognize some structure
print(soup.prettify())

In [None]:
soup.title

In [None]:
# let's find the links
soup.find_all('a')[:10]

In [None]:
# and get the title of one
soup.find_all('a')[5].get_text()

Now to serious business!

### 3. Scrape Airbnb page

Let's get to the website and look for some apartments

In [None]:
# Let's plan a trip to the Austrian Alps
airbnb_url = 'https://www.airbnb.com/s/Mayrhofen--Austria/homes?tab_id=home_tab&refinement_paths%5B%5D=%2Fhomes&date_picker_type=calendar&query=Mayrhofen%2C%20Austria&place_id=ChIJbzLYLzjdd0cRDtGuTzM_vt4&checkin=2021-02-06&checkout=2021-02-13&adults=4&source=structured_search_input_header&search_type=autocomplete_click'

In [None]:
soup = BeautifulSoup(requests.get(airbnb_url).content, 'html.parser')

In [None]:
print(soup.prettify())

### 4. Inspect elements

Press F12 ;)

### 5. Scrape 1 element

In [None]:
soup.find_all('div', '_gig1e7')

In [None]:
# we can also extract its child tag
soup.find_all('div', '_8s3ctt')

In [None]:
listings = soup.find_all('div', '_8s3ctt')

In [None]:
listings[0]

In [None]:
listings[0].find_all('a')[0].get('href')

In [None]:
listings[0].get_text()

### 6. Inspect all data elements on search page

**smithio.medium.com**

<img src='https://miro.medium.com/max/700/1*GLNHp0QOf5qZiHa1bnaRvg.png'>

In [None]:
# url: tag=a, get=href
# name: tag=div, class=_hxt6u1e, get=aria-label
# header: tag=div, class=_b14dlit

### 7. Write a scraping function

In [None]:
# First Generation :)
def extract_basic_features(listing_html):
    """Collect the link, name and header text of a single listing.

    The link is the ``href`` of the first ``a`` tag, the name is the
    ``aria-label`` of the ``_hxt6u1e`` div, and the header is the text of the
    ``_b14dlit`` div.

    A missing tag is not handled here: if ``find`` returns ``None`` the
    attribute access that follows raises ``AttributeError``.

    Returns:
        dict with the keys 'url', 'name' and 'header'.
    """
    link_tag = listing_html.find('a')
    link = link_tag.get('href')

    name_tag = listing_html.find("div", {"class": "_hxt6u1e"})
    label = name_tag.get('aria-label')

    header_tag = listing_html.find("div", {"class": "_b14dlit"})
    header_text = header_tag.get_text()

    return {'url': link, 'name': label, 'header': header_text}

In [None]:
extract_basic_features(listings[0])

In [None]:
# what if the tag is not found?
listings[0].find('b').get_text()

In [None]:
# Second Generation :)
def extract_basic_features(listing_html):
    """Extract url, name and header from one listing, tolerating missing tags.

    Args:
        listing_html: parsed listing element exposing ``find``.

    Returns:
        dict with the keys 'url', 'name' and 'header'. A feature whose tag
        cannot be found is set to the string 'empty'.
    """
    features_dict = {}

    # When find() returns None for a missing tag, the chained attribute access
    # raises AttributeError; that is the only failure treated as "not found".
    try:
        # the link lives on the <a> tag (url: tag=a, get=href)
        url = listing_html.find('a').get('href')
    except AttributeError:
        url = 'empty'
    try:
        name = listing_html.find("div", {"class": "_hxt6u1e"}).get('aria-label')
    except AttributeError:
        name = 'empty'
    try:
        header = listing_html.find("div", {"class": "_b14dlit"}).text
    except AttributeError:
        header = 'empty'

    features_dict['url'] = url
    features_dict['name'] = name
    features_dict['header'] = header

    return features_dict

In [None]:
extract_basic_features(listings[0])

In [None]:
# too many separate extractions
# Extraction rules for one listing on the search page, keyed by feature name.
# Each rule dict is interpreted by extract_element():
#   'tag'   - tag name searched with find_all
#   'class' - optional, passed as the second argument to find_all
#   'get'   - optional attribute to read; without it the element's text is used
#   'order' - optional index of the match to take (defaults to 0)
RULES_SEARCH_PAGE = {
    'url': {'tag': 'a', 'get': 'href'},
    'name': {'tag': 'div', 'class': '_hxt6u1e', 'get': 'aria-label'},
    'header': {'tag': 'div', 'class': '_b14dlit'},
    'rooms': {'tag': 'div', 'class': '_kqh46o'},
    # same tag/class as 'rooms'; 'order': 1 selects the second match
    'facilities': {'tag': 'div', 'class': '_kqh46o', 'order': 1},
    'badge': {'tag': 'div', 'class': '_17bkx6k'},
    'rating_n_reviews': {'tag': 'span', 'class': '_18khxk1'},
    'price': {'tag': 'span', 'class': '_1p7iugi'},
    'superhost': {'tag': 'div', 'class': '_ufoy4t'},
}

In [None]:
# Third Generation :)
def extract_element(listing_html, params):
    """Pull a single value out of a listing according to one rule dict.

    Args:
        listing_html: parsed element exposing ``find_all``.
        params: rule dict. ``'tag'`` is required; ``'class'`` (optional) is
            forwarded as the second argument to ``find_all``; ``'order'``
            (optional, default 0) is the index of the match to use; ``'get'``
            (optional) names an attribute to read instead of the text.

    Returns:
        The value of the ``'get'`` attribute when that key is present,
        otherwise the result of ``get_text()`` on the chosen element.

    Indexing the matches is unguarded, so a missing match surfaces as the
    sequence's own indexing error.
    """
    # Build the find_all arguments: the tag, plus the class when a rule has one.
    lookup = [params['tag']]
    if 'class' in params:
        lookup.append(params['class'])
    matches = listing_html.find_all(*lookup)

    # Select the requested match (first one unless 'order' says otherwise).
    picked = matches[params.get('order', 0)]

    if 'get' not in params:
        return picked.get_text()
    return picked.get(params['get'])

In [None]:
print(extract_element(listings[0], RULES_SEARCH_PAGE['header']))
print(extract_element(listings[0], RULES_SEARCH_PAGE['url']))

In [None]:
for feature in RULES_SEARCH_PAGE:
    print(f"{feature}: {extract_element(listings[0], RULES_SEARCH_PAGE[feature])}")

In [None]:
for feature in RULES_SEARCH_PAGE:
    try:
        print(f"{feature}: {extract_element(listings[0], RULES_SEARCH_PAGE[feature])}")
    except Exception:
        # A feature that cannot be extracted is reported as "empty". Catching
        # Exception rather than using a bare except keeps KeyboardInterrupt
        # and SystemExit from being swallowed.
        print(f"{feature}: empty")

YAY!!! We've extracted all the features from one listing!

### 8. Explore pagination

<img src='https://miro.medium.com/max/564/1*Q9iBSu5nniBwc8Wt2-8Ujw.png'>

In [None]:
airbnb_url

In [None]:
# let's finally write this function
def get_listings(search_page):
    """Download a search results page and return its listing elements.

    Fetches ``search_page`` with ``requests.get``, parses the response body
    with the ``'html.parser'`` backend and returns whatever
    ``find_all('div', '_8s3ctt')`` yields for the parsed page.
    """
    response = requests.get(search_page)
    parsed_page = BeautifulSoup(response.content, 'html.parser')
    return parsed_page.find_all('div', '_8s3ctt')

In [None]:
# it works
len(get_listings(airbnb_url))

In [None]:
# let's try next page
new_url = airbnb_url + '&items_offset=20'
len(get_listings(new_url))

In [None]:
# checking the content, if the data is there
print(extract_element(get_listings(airbnb_url)[0], RULES_SEARCH_PAGE['name']))
print(extract_element(get_listings(new_url)[0], RULES_SEARCH_PAGE['name']))

### 9. Collect all urls

In [None]:
# let's iterate through all 15 pages
# accumulates the listing elements returned for every requested page
all_listings = []
for i in range(15):
    # items_offset grows by 20 per iteration: 0, 20, 40, ...
    offset = 20 * i
    new_url = airbnb_url + f'&items_offset={offset}'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)
    
    # let's check if it's scraping
    # (prints the running total, so an unchanged number means the page added nothing)
    print(len(all_listings))

In [None]:
# why? maybe Airbnb tries to prevent scraping
# let's wait a couple of seconds after every iteration
import time

# start again from an empty list
all_listings = []
for i in range(15):
    offset = 20 * i
    # this variant also appends section_offset=3 to the query string
    new_url = airbnb_url + f'&items_offset={offset}&section_offset=3'
    new_listings = get_listings(new_url)
    all_listings.extend(new_listings)
    
    # let's check if it's scraping
    print(len(all_listings))

    # pause for 2 seconds before requesting the next page
    time.sleep(2)

Not perfect but some improvement

In [None]:
# another random check, if the data is there
print(extract_element(all_listings[113], RULES_SEARCH_PAGE['name']))

### 10. Scrape all search pages

1. build all urls
2. iteratively scrape them

In [None]:
# 1. build all urls
def build_urls(main_url, listings_per_page=20, pages_per_location=15):
    """Return the paginated variants of a search URL.

    Args:
        main_url: base search URL; ``&items_offset=<n>`` is appended to it
            verbatim.
        listings_per_page: step between consecutive offsets.
        pages_per_location: number of URLs to generate.

    Returns:
        list of ``pages_per_location`` URLs with offsets
        ``0, listings_per_page, 2 * listings_per_page, ...``.
    """
    return [
        main_url + f'&items_offset={listings_per_page * page_number}'
        for page_number in range(pages_per_location)
    ]

In [None]:
# safe function to extract all features from one page
def extract_page_features(soup, rules):
    """Apply every extraction rule to one parsed element and collect the results.

    Args:
        soup: parsed element handed to ``extract_element``
            (``process_search_pages`` passes a single listing here).
        rules: mapping of feature name to the rule dict understood by
            ``extract_element``.

    Returns:
        dict mapping each feature name in ``rules`` to the extracted value, or
        to the string ``'empty'`` when extracting that feature failed.
    """
    features_dict = {}
    for feature in rules:
        try:
            params = rules[feature]
            features_dict[feature] = extract_element(soup, params)
        except Exception:
            # Best effort: one unextractable feature must not abort the rest.
            # Catching Exception rather than using a bare except lets
            # KeyboardInterrupt and SystemExit through, so a long scrape can
            # still be stopped.
            features_dict[feature] = 'empty'

    return features_dict

In [None]:
# 2. Iteratively scrape pages
def process_search_pages(url_list):
    """Scrape every search page in ``url_list`` and return the listing features.

    For each URL the listings are fetched with ``get_listings``, and each
    listing is turned into a feature dict by ``extract_page_features`` using
    ``RULES_SEARCH_PAGE``. The dicts are returned in one flat list, in page
    order and then listing order.
    """
    collected = []
    for page_url in url_list:
        collected.extend(
            extract_page_features(item, RULES_SEARCH_PAGE)
            for item in get_listings(page_url)
        )
    return collected

In [None]:
# build a list of URLs
url_list = build_urls(airbnb_url)

In [None]:
url_list

In [None]:
# try for one page
base_features = process_search_pages(url_list[:1])

In [None]:
base_features

### 11. Look at it

https://github.com/x-technology/airbnb-analytics/blob/main/Part%201%20-%20Web%20Scraping/data_sample.csv

# All imports in one cell (just in case)

In [None]:
# all imports
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

import json
import time

import pandas as pd

from multiprocessing import Pool

import os