## Scraping with Selenium
`! pip install selenium` <br>
`! pip install webdriver-manager`

In [6]:
import pathlib
import pickle
import time

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [7]:
website= "https://spainhomes.com/real-estate/malaga"

In [8]:
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(website)

In [5]:
# NOTE: the browser session must stay open here: the following cells query
# `driver` for the ads on the page. Quitting at this point (a leftover from an
# earlier interactive run) breaks Restart & Run All. The session is closed
# once the page-1 results have been collected.

## Get ads from one page

In [9]:
ads = driver.find_elements(By.XPATH,'//div[@class="right"]')

In [10]:
len(ads)

12

In [11]:
ads[0]

<selenium.webdriver.remote.webelement.WebElement (session="28667f91da2a3e185fe9522d72203775", element="f.13D6EB19068A64AE4545DE01014076E3.d.204EC8684978BA57762D9D5013A1CBBF.e.274")>

## Get info from one ad
We want to get: <br>
1. Title 
2. Description
3. Number of bedrooms
4. Number of bathrooms
5. Price
6. URL for more details

In [12]:
title = ads[0].find_element(By.XPATH, './/span[@class="title"]').text
title

'Modern Sea View Apartments with Spacious Terraces in Fuengirola'

In [13]:
desc = ads[0].find_element(By.XPATH, './/span[@class="desc"]').text
desc

'The sea-view apartments with large terraces are situated in a prestigious community in Fuengirola, Costa del Sol. The gated and secured complex has amazing facilities and social and sports clubs.'

In [14]:
price = ads[0].find_element(By.XPATH, './/span[@class="fiyat"]').text
price

'FROM\n€540.000'

In [15]:
details = ads[0].find_element(By.XPATH, './/div[@class="other row between-xs middle-xs nowrap"]')

In [16]:
details = details.find_elements(By.XPATH, './/span')
[el.text for el in details]

['FUENGIROLA - MÁLAGA', '1, 2, 3', '1, 2']

In [17]:
url = ads[0].find_element(By.XPATH, './/a[@class="main-emlak-link"]')
url

<selenium.webdriver.remote.webelement.WebElement (session="28667f91da2a3e185fe9522d72203775", element="f.13D6EB19068A64AE4545DE01014076E3.d.204EC8684978BA57762D9D5013A1CBBF.e.1779")>

In [18]:
url.get_attribute("href")

'https://spainhomes.com/ad/agp-0732-new-build-apartments-with-sea-views-in-prime-area-of-fuengirola'

## Get info from all ads in one page

In [19]:
def get_info(ad):
    """Extract the fields of a single listing card into a plain dict.

    Args:
        ad: Element for one ad card (a Selenium WebElement in this notebook);
            all lookups are relative XPath queries below it.

    Returns:
        dict with keys, in this order:
            "title"   - text of the ``span.title`` element
            "desc"    - text of the ``span.desc`` element
            "price"   - text of the ``span.fiyat`` element, unparsed
            "details" - list with the text of every ``span`` inside the
                        "other row ..." div (in the sample output: location,
                        then two comma-separated number lists, presumably
                        bedrooms and bathrooms; verify against the site)
            "url"     - ``href`` of the ``a.main-emlak-link`` element

    Any missing element makes the underlying ``find_element`` call raise.
    """
    def text_of(xpath):
        # Text content of the first descendant matching `xpath`.
        return ad.find_element(By.XPATH, xpath).text

    title_text = text_of('.//span[@class="title"]')
    desc_text = text_of('.//span[@class="desc"]')
    price_text = text_of('.//span[@class="fiyat"]')

    details_row = ad.find_element(
        By.XPATH, './/div[@class="other row between-xs middle-xs nowrap"]'
    )
    detail_texts = [span.text for span in details_row.find_elements(By.XPATH, './/span')]

    link = ad.find_element(By.XPATH, './/a[@class="main-emlak-link"]')

    return {
        "title": title_text,
        "desc": desc_text,
        "price": price_text,
        "details": detail_texts,
        "url": link.get_attribute("href"),
    }

In [20]:
page_1_info = [get_info(ad) for ad in ads]

In [21]:
page_1_info

[{'title': 'Modern Sea View Apartments with Spacious Terraces in Fuengirola',
  'desc': 'The sea-view apartments with large terraces are situated in a prestigious community in Fuengirola, Costa del Sol. The gated and secured complex has amazing facilities and social and sports clubs.',
  'price': 'FROM\n€540.000',
  'details': ['FUENGIROLA - MÁLAGA', '1, 2, 3', '1, 2'],
  'url': 'https://spainhomes.com/ad/agp-0732-new-build-apartments-with-sea-views-in-prime-area-of-fuengirola'},
 {'title': 'BREEAM-Certified Flats in a Prime Area in Fuengirola Málaga',
  'desc': 'These flats are situated in a BREEAM-certified complex in Fuengirola, Málaga. The complex features rich amenities like swimming pools, parking, a wellness area, and a business center.',
  'price': 'FROM\n€720.000',
  'details': ['FUENGIROLA - MÁLAGA', '2, 3', '2'],
  'url': 'https://spainhomes.com/ad/agp-0909-sustainable-flats-in-a-complex-with-rich-amenities-in-fuengirola'},
 {'title': 'Spacious Villa with Panoramic Sea Views

In [22]:
driver.quit()

In [23]:
# making a directory with results
results_dir = pathlib.Path("results")
results_dir.mkdir(parents=True, exist_ok=True)

In [24]:
# Persist the page-1 results; the context manager closes the file even if
# pickling raises part-way through.
with open(results_dir/'malaga_page_1.pickle', 'wb') as file:
    pickle.dump(page_1_info, file)

### Clicking on the cookie banner

In [25]:
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(website)

In [27]:
def click_accept_cookies(driver, timeout=10):
    """Dismiss the cookie-consent banner by clicking its "yes" link.

    Waits until the link is clickable instead of looking it up immediately,
    so the call no longer depends on a preceding ``time.sleep`` being long
    enough for the banner to render.

    Args:
        driver: Selenium WebDriver with the target page already loaded.
        timeout: Maximum number of seconds to wait for the banner link.

    Returns:
        True if the link was found and clicked; False if it did not become
        clickable within ``timeout`` seconds (for example because no banner
        was shown), in which case nothing is clicked.
    """
    cookie_xpath = '//div/a[@class="yes"]'
    try:
        button = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.XPATH, cookie_xpath))
        )
    except TimeoutException:
        # Best effort: accepting cookies is not required to read the ads,
        # so a missing banner should not abort the scrape.
        return False
    button.click()
    return True

In [28]:
click_accept_cookies(driver)

### Getting other pages

In [37]:
def dump_page(i, page_info, results_dir=results_dir):
    """Pickle the scraped ads of one results page to disk.

    The file is written to ``<results_dir>/malaga_page_<i>.pickle`` and its
    path is printed as a progress marker.

    Args:
        i: Page number, used only to build the file name.
        page_info: Picklable object to store (in this notebook, the list of
            dicts produced by ``get_info``).
        results_dir: Target directory, as a ``pathlib.Path`` or a string.
            Defaults to the notebook-level ``results_dir`` as it was when
            this function was defined. It is created if it does not exist.
    """
    target_dir = pathlib.Path(results_dir)
    target_dir.mkdir(parents=True, exist_ok=True)
    filename = target_dir / 'malaga_page_{}.pickle'.format(i)
    # The context manager closes the file even if pickling fails.
    with open(filename, 'wb') as file:
        pickle.dump(page_info, file)
    print(filename)

In [29]:
def get_url(i):
    """Return the URL of results page ``i`` of the Málaga listings."""
    # NOTE(review): the ".htm" suffix ends up inside the `page` query value
    # (e.g. "page=2.htm"). The notebook run returned 12 ads per page with it,
    # but confirm the site really serves page `i` rather than falling back
    # to the first page.
    return f"https://spainhomes.com/real-estate/malaga?page={i}.htm"

In [30]:
i = 2
website = get_url(i)
website

'https://spainhomes.com/real-estate/malaga?page=2.htm'

In [31]:
driver.quit()

In [32]:
driver = webdriver.Chrome()
driver.maximize_window()
driver.get(website)
time.sleep(3)

In [33]:
click_accept_cookies(driver)

In [34]:
ads = driver.find_elements(By.XPATH,'//div[@class="right"]')
len(ads)

12

In [35]:
page_i_info = [get_info(ad) for ad in ads]

In [52]:
#page_i_info 

In [38]:
dump_page(i, page_i_info)

results/malaga_page_2.pickle


In [39]:
driver.quit()

### Automating the scrape (with lots of sleeps)

In [40]:
def scrape_a_page_for_ads(website):
    """Open ``website`` in a new Chrome session and collect its ad elements.

    Args:
        website: URL of the listings page to load.

    Returns:
        Tuple ``(ads, driver)``: the elements matching
        ``//div[@class="right"]`` and the still-open WebDriver. The caller
        is responsible for calling ``driver.quit()`` once it has finished
        reading the ads.

    Raises:
        Whatever Selenium or ``click_accept_cookies`` raises. In that case
        the browser session is closed before the exception propagates, so a
        failed page does not leave a Chrome window behind.
    """
    driver = webdriver.Chrome()
    try:
        driver.maximize_window()
        driver.get(website)
        time.sleep(3)  # fixed pause before looking for the cookie banner
        click_accept_cookies(driver)
        time.sleep(2)  # fixed pause after dismissing the banner
        ads = driver.find_elements(By.XPATH,'//div[@class="right"]')
    except Exception:
        # On success ownership of the driver passes to the caller; on failure
        # nobody else holds a reference, so release it here.
        driver.quit()
        raise
    return ads, driver

In [41]:
# Scrape result pages 3..14, one fresh browser session per page.
for i in range(3, 15):
    print(i)
    website = get_url(i)
    ads, driver = scrape_a_page_for_ads(website)
    try:
        time.sleep(4)
        # Read the ads while the session that produced them is still open.
        page_info = [get_info(ad) for ad in ads]
        dump_page(i, page_info)
        time.sleep(1)
    finally:
        # Always close the browser, even if extracting or dumping fails,
        # so a bad page does not leave an orphaned Chrome process.
        driver.quit()

3
results/malaga_page_3.pickle
4
results/malaga_page_4.pickle
5
results/malaga_page_5.pickle
6
results/malaga_page_6.pickle
7
results/malaga_page_7.pickle
8
results/malaga_page_8.pickle
9
results/malaga_page_9.pickle
10
results/malaga_page_10.pickle
11
results/malaga_page_11.pickle
12
results/malaga_page_12.pickle
13
results/malaga_page_13.pickle
14
results/malaga_page_14.pickle
