In [30]:
# WARNING: this script takes a couple of minutes to run. Run all three cells in order:
# the next cell only defines the function that the third cell calls.

import math
import os

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By


In [31]:
def _pick_detail(details, *keywords):
    """Return the one detail that mentions any of ``keywords``, else ``None``.

    Matching is a case-insensitive substring test against lowercase keywords.
    The detail is returned stripped. If zero or several details match, the
    result is ``None`` so an ambiguous highlight line never yields a guess.
    """
    matches = [
        detail.strip()
        for detail in details
        if any(keyword in detail.lower() for keyword in keywords)
    ]
    return matches[0] if len(matches) == 1 else None


def get_listings(driver):
    """Extract one dict per ``<article>`` element on the page currently loaded.

    Uses the ``find_element(By..., ...)`` locator API, which is available in
    both Selenium 3 and Selenium 4; the ``find_element_by_*`` shortcuts were
    removed in Selenium 4.3.

    Args:
        driver: object exposing ``find_elements(by, value)`` whose results
            expose ``find_element(by, value)``, ``.text`` and
            ``get_attribute(name)`` (a Selenium WebDriver in this notebook).

    Returns:
        list[dict]: one dict per article with the keys ``title``, ``address``,
        ``price``, ``url``, ``type``, ``bedrooms``, ``rooms``, ``bathrooms``
        and ``surface``. The last five are taken from the "•"-separated
        highlights text and are ``None`` unless exactly one segment matches.

    Raises:
        Whatever ``find_element`` raises when a required element is missing
        from an article; nothing is caught here.
    """
    listings = []

    for article in driver.find_elements(By.TAG_NAME, "article"):
        body = article.find_element(By.CSS_SELECTOR, ".item-body")

        listing = {
            "title": body.find_element(By.TAG_NAME, "a").text,
            "address": body.find_element(By.CSS_SELECTOR, ".item-location").text,
            "price": article.find_element(By.CSS_SELECTOR, "span.title-price").text,
        }

        link = article.find_element(By.CSS_SELECTOR, "a.link.listing-title.stretched-link")
        listing["url"] = link.get_attribute("href")

        # The highlights paragraph is a single "•"-separated line of details.
        highlights = body.find_element(By.CSS_SELECTOR, "p.item-highlights").text
        details = highlights.split("•")

        listing["type"] = _pick_detail(details, "piso", "casa")
        listing["bedrooms"] = _pick_detail(details, "habitación/es")
        # "habit." (with the dot) does not occur inside "habitación/es", so the
        # two lookups do not overlap.
        listing["rooms"] = _pick_detail(details, "habit.")
        listing["bathrooms"] = _pick_detail(details, "baño")
        listing["surface"] = _pick_detail(details, "m²")

        listings.append(listing)

    return listings

In [32]:
# Scrape every results page of the search below and save the listings as CSV.

# Site-specific values used by this cell.
RESULTS_PER_PAGE = 20  # divisor for the page count; presumably listings per results page — TODO confirm
MXN_BUTTON_INDEX = 44  # position of the currency button clicked below; presumably Mexican pesos — breaks if the site reorders its list

scrapped_url = "https://www.properstar.es/mexico/ciudad-de-mexico-loc/alquiler/piso-casa"
final_data_for_df = []

# NOTE(review): recent Selenium 4 releases reportedly no longer accept the
# driver path as a positional argument (a Service object is expected instead)
# — confirm against the installed Selenium version.
driver = webdriver.Chrome("chromedriver.exe")
try:
    driver.get(scrapped_url)

    print("Changing currency")
    driver.find_element(By.CSS_SELECTOR, ".btn.btn-dropdown-toggle").click()
    region = driver.find_element(By.CSS_SELECTOR, ".regional-panel")
    # Second button of the regional panel; presumably opens the currency picker.
    currency_button = region.find_elements(By.TAG_NAME, "button")[1]
    currency_button.click()
    currencies_group = driver.find_elements(By.CSS_SELECTOR, ".currencies-group")[1]
    mexican_pesos = currencies_group.find_elements(By.TAG_NAME, "button")[MXN_BUTTON_INDEX]
    mexican_pesos.click()

    print("Finding and calculating total results")
    total_results_element = driver.find_element(By.CSS_SELECTOR, "div.total-results")
    total_results = int(total_results_element.text.replace(" listados", ""))
    total_results_pages = math.ceil(total_results / RESULTS_PER_PAGE)
    # Page 1 is the URL already loaded; later pages are addressed with ?p=N.
    listing_urls = [scrapped_url + f"?p={i}" for i in range(2, total_results_pages + 1)]
    print(f"Total results are {total_results}")
    print(f"Total results pages are {total_results_pages}")

    print("Getting page results for page 1")
    final_data_for_df.append(get_listings(driver))

    for page_number, listing_url in enumerate(listing_urls, start=2):
        print(f"Getting page results for page {page_number}")
        driver.get(listing_url)
        final_data_for_df.append(get_listings(driver))
finally:
    # quit() ends the whole WebDriver session, and the finally clause makes
    # sure that happens even when a lookup above raises.
    driver.quit()
print("finished scrapping")

# final_data_for_df holds one list per page; flatten it to one row per listing.
flat_final_data_for_df = [listing for listings in final_data_for_df for listing in listings]
listings_df = pd.DataFrame(flat_final_data_for_df)

# to_csv does not create missing directories, so make sure the target exists.
os.makedirs("output", exist_ok=True)
listings_df.to_csv("output/scrapped_data.csv")

print("finished")

# Keep the preview as the cell's last expression so the notebook renders it.
listings_df.head()

Changing currency
Finding and calculating total results
Total results pages are 10
Getting page results for page 1
Getting page results for page 2
Getting page results for page 3
Getting page results for page 4
Getting page results for page 5
Getting page results for page 6
Getting page results for page 7
Getting page results for page 8
Getting page results for page 9
Getting page results for page 10
finished scrapping
finished
