In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains
import time
import pandas as pd
import numpy as np
import math

start = time.time()

path = r"D:\Zenor\Python\chromedriver_win32\chromedriver.exe"

### Starting Webdriver

In [2]:
options = webdriver.ChromeOptions()
# options.add_argument("--start-minimized")

driver = webdriver.Chrome(path, options=options)
# driver.minimize_window()

driver.get("https://www.imot.bg/pcgi/imot.cgi")

### Accepting Cookies and getting to search page

In [3]:
accept_cookies = driver.find_element(By.CLASS_NAME, value="fc-button-label")
accept_cookies.click()

select_sofia = driver.find_element(By.ID, value="BG-23")
select_sofia.click()

sales = driver.find_element(By.CLASS_NAME, value="mapBtnProdajbi")
sales.click()

In [4]:
wait = WebDriverWait(driver, 10)

### Setting Apartment parameters

In [5]:
# Apartment parameters (rooms)
rooms = ["vi3", # 3 rooms
         "vi4", # 4 rooms
         "vi5"] # multiple rooms

for room in rooms:
    room_element = driver.find_element(By.ID, value=room)
    room_element.click()

# Apartment parameters (size & price)
search_params = ["f26",  # from_size in sqm
                 #"f27",  # to_size in sqm
                 "f28",  # from_total price
                 "f29"]  # to_total price

search_values = [90,     # from_size in sqm
                 #96,     # to_size in sqm
                 150000, # from_total price
                 300000] # to_total price

for p, v in zip(search_params, search_values):
    params = driver.find_element(By.NAME, value=p)
    params.send_keys(v)
    
# Apartment parameters (location)    
neighbourhoods = ["Зона Б-5",
                  "Зона Б-5-3",
                  "Център",
                  "Медицинска академия",
                  "Докторски паметник", 
                  "Оборище",
                  "Лозенец", 
                  "Иван Вазов",
                  "Яворов"]

for n in neighbourhoods:
    neighbourhood = driver.find_element(By.XPATH, value=f'//option[@value="{n}"]')
    actions = ActionChains(driver)
    actions.double_click(neighbourhood).perform()
    
# Apartment parameters (additional tickboxes)
extra_params = ["f62", # construction type
                "fe1"] # having images
    
for p in extra_params:
    tick_element = driver.find_element(By.NAME, value=p)
    tick_element.click()   
    
# Apartment parameters (year of construction)    
# construction_year = driver.find_element(By.NAME, value="f52")
# construction_year.send_keys("2015")

### Getting to Search results

In [6]:
search_button = driver.find_element(By.XPATH, value='//input[@type="button"]')
search_button.click()

### Getting first page results links

In [7]:
first_page_link = driver.current_url

In [8]:
first_page_link

'https://www.imot.bg/pcgi/imot.cgi?act=3&slink=9uya1a&f1=1'

### Getting all result page links

In [9]:
total_nbr_pages = int(driver.find_element(By.CLASS_NAME, "pageNumbersInfo").text[-2:])
total_nbr_pages

16

In [10]:
pages = [first_page_link]

In [11]:
for i in range(2, total_nbr_pages + 1):
    next_page_link = first_page_link[-1:].replace("1", first_page_link[:-1] + f"{i}")
    pages.append(next_page_link)

In [12]:
len(pages)

16

### Creating lists for storing values

In [13]:
# Creating empty list lith links
listings = []

# Creating empty lists for storing values for the df
ids = []
prices = []
sizes = []
years_construction = []
descriptions = []
agencies = []
locations = []
titles = []

### Opening each page and the listing links

In [14]:
for page in pages:
    driver.get(page)
    time.sleep(3)
    wait.until(EC.presence_of_element_located((By.CLASS_NAME, "lnk1")))
    listing_links = driver.find_elements(By.CLASS_NAME, "lnk1")
    
    for i in listing_links:
        link = i.get_attribute("href")
        listings.append(link)
        
print(len(listings))

612


### Opening listing and exctrating data

In [15]:
for i in listings:
    driver.get(i)
    time.sleep(3)
    
    # getting listing ids
    try:
        listing_id_element = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'input[type="hidden"][name="adv"]')))
        ids.append(listing_id_element.get_attribute("value"))
    except Exception:
        ids.append(None)

    # getting prices
    try:
        price_element = wait.until(EC.presence_of_element_located((By.ID, "cena")))
        prices.append(price_element.text)
    except Exception:
        prices.append(None)
    
    # getting apartment size, year of construction
    ad_params_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "adParams")))
    
    try:
        size_element = ad_params_element.find_element(By.XPATH, "./div[contains(., 'Площ: ')]/strong")
        sizes.append(size_element.text)
    except Exception:
        sizes.append(None)
    
    try:
        construction_year_element = ad_params_element.find_element(By.XPATH, "./div[contains(., 'Строителство: ')]/strong")
        years_construction.append(construction_year_element.text)
    except Exception:
        years_construction.append(None)
    
    # getting description    
    try:
        description_element = wait.until(EC.presence_of_element_located((By.ID, "description_div")))
        descriptions.append(description_element.text)
    except Exception:
        descriptions.append(None)
    
    # getting agency name
    try:
        agency_element = wait.until(EC.presence_of_element_located((By.TAG_NAME, "b")))
        agencies.append(agency_element.text)
    except Exception:
        agencies.append(None)
    
    # getting location
    try:
        location_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "location")))
        locations.append(location_element.text)
    except Exception:
        locations.append(None)

    # getting listing title
    try:
        title_element = wait.until(EC.presence_of_element_located((By.CLASS_NAME, "advHeader")))
        titles.append(title_element.text)
    except Exception:
        titles.append(None)
        
driver.quit()

### Creating a DataFrame

In [16]:
data = {
        'id':                 [i for i in ids],
        'price':              [i for i in prices],
        'size':               [i for i in sizes],
        'construction_year':  [i for i in years_construction],
        'description':        [i for i in descriptions],
        'agency':             [i for i in agencies],
        'location':           [i for i in locations],
        'title':              [i for i in titles],
        'link':               [i for i in listings],
        'date':               [pd.Timestamp.now().date() for i in titles]
}

df = pd.DataFrame(data)       

df.head()

Unnamed: 0,id,price,size,construction_year,description,agency,location,title,link,date
0,1c168303044134720,237 990 EUR,111 m2,"Тухла, 2022 г.",ПРЕКРАСЕН ТРИСТАЕН АПАРТАМЕНТ! ТОП ЛОКАЦИЯ!\n\...,Брокер: Йорданка Петрова,"град София, Зона Б-5","Продава 3-СТАЕН\nград София, Зона Б-5\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15
1,1c169279714284408,283 000 EUR,125 m2,"Тухла, 1998 г.","Комуникативно местоположение, ул. Димитър Хадж...",Брокер: Мариела Китанова,"град София, Иван Вазов","Продава 3-СТАЕН\nград София, Иван Вазов\nВиж к...",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15
2,1c169714589935846,215 000 EUR,90 m2,"Тухла, 1997 г.","10 минути пеш до НДК, 2-стаен преустроен на тр...",Брокер: Лилия Йорданова,"град София, Лозенец","Продава 3-СТАЕН\nград София, Лозенец\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15
3,1c169702525323979,215 000 EUR,90 m2,"Тухла, 1995 г.",ТОП ОФЕРТА !!! Продаваме прекрасен тристаен ап...,Агенция: НАЙС ХОУМ,"град София, Лозенец, ул. Крум Попов","Продава 3-СТАЕН\nград София, Лозенец, ул. Крум...",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15
4,1c169506678922472,246 000 EUR,90 m2,"Тухла, 1980 г.",Панорамен апартамент до водната кула. Жилището...,Агенция: АВЕКС,"град София, Лозенец","Продава 3-СТАЕН\nград София, Лозенец\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15


### Editing the DataFrame

###### Price

In [17]:
df['price'] = df['price'].str.replace(' EUR', '')
df['price'] = df['price'].str.replace(' ', '')
df['price'] = df['price'].astype('int32')

In [18]:
df['price'][:3]

0    237990
1    283000
2    215000
Name: price, dtype: int32

###### Size

In [19]:
df['size'] = df['size'].str.replace(' m2', '')
df['size'] = df['size'].astype('int16')

In [20]:
df['size'][:3]

0    111
1    125
2     90
Name: size, dtype: int16

###### Constructed

In [21]:
df['construction_year'] = df['construction_year'].str.replace('^Тухла$', '0', regex=True)
df['construction_year'] = df['construction_year'].str.extract(r"(\d{4}) г") #.astype('int16') 

In [22]:
df['construction_year'][:3]

0    2022
1    1998
2    1997
Name: construction_year, dtype: object

###### Agency

In [23]:
df['agency'] = df['agency'].str.split(": ", n=1, expand=True)[1]

In [24]:
df['agency'][:3]

0    Йорданка Петрова
1    Мариела Китанова
2     Лилия Йорданова
Name: agency, dtype: object

###### Location

In [25]:
df['location'] = df['location'].str.split(", ", n=1, expand=True)[1]

In [26]:
df['location'][:3]

0      Зона Б-5
1    Иван Вазов
2       Лозенец
Name: location, dtype: object

###### Type of apartment (nbr of rooms)

In [27]:
df['nbr_rooms'] = df['title'].str.split("\n", n=1, expand=True)[0]
df['nbr_rooms'] = df['nbr_rooms'].str.replace('Продава ', '')

In [28]:
df['nbr_rooms'][:3]

0    3-СТАЕН
1    3-СТАЕН
2    3-СТАЕН
Name: nbr_rooms, dtype: object

###### Date of scraping

In [29]:
date_scraped = df['date'].groupby(by=df['date']).max().iloc[0]

In [30]:
file_name = f"{date_scraped}" + "_imot.bg.xlsx"
file_path = "D:\\Zenor\\Python\\Webscraping_Housing\\" + file_name
file_path

'D:\\Zenor\\Python\\Webscraping_Housing\\2023-10-15_imot.bg.xlsx'

In [31]:
df.to_excel(file_path, index=False)

In [32]:
df.head()

Unnamed: 0,id,price,size,construction_year,description,agency,location,title,link,date,nbr_rooms
0,1c168303044134720,237990,111,2022,ПРЕКРАСЕН ТРИСТАЕН АПАРТАМЕНТ! ТОП ЛОКАЦИЯ!\n\...,Йорданка Петрова,Зона Б-5,"Продава 3-СТАЕН\nград София, Зона Б-5\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15,3-СТАЕН
1,1c169279714284408,283000,125,1998,"Комуникативно местоположение, ул. Димитър Хадж...",Мариела Китанова,Иван Вазов,"Продава 3-СТАЕН\nград София, Иван Вазов\nВиж к...",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15,3-СТАЕН
2,1c169714589935846,215000,90,1997,"10 минути пеш до НДК, 2-стаен преустроен на тр...",Лилия Йорданова,Лозенец,"Продава 3-СТАЕН\nград София, Лозенец\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15,3-СТАЕН
3,1c169702525323979,215000,90,1995,ТОП ОФЕРТА !!! Продаваме прекрасен тристаен ап...,НАЙС ХОУМ,"Лозенец, ул. Крум Попов","Продава 3-СТАЕН\nград София, Лозенец, ул. Крум...",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15,3-СТАЕН
4,1c169506678922472,246000,90,1980,Панорамен апартамент до водната кула. Жилището...,АВЕКС,Лозенец,"Продава 3-СТАЕН\nград София, Лозенец\nВиж карта",https://www.imot.bg/pcgi/imot.cgi?act=5&adv=1c...,2023-10-15,3-СТАЕН


In [33]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 612 entries, 0 to 611
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   id                 612 non-null    object
 1   price              612 non-null    int32 
 2   size               612 non-null    int16 
 3   construction_year  509 non-null    object
 4   description        610 non-null    object
 5   agency             605 non-null    object
 6   location           612 non-null    object
 7   title              612 non-null    object
 8   link               612 non-null    object
 9   date               612 non-null    object
 10  nbr_rooms          612 non-null    object
dtypes: int16(1), int32(1), object(9)
memory usage: 1.5 MB


In [34]:
driver.quit()

### Check for errors in data (if result = 1, ok to proceed)

In [35]:
pd.Series(listings).nunique() / pd.Series(listings).count()

1.0

###  Time elapsed

In [36]:
end = time.time()

In [37]:
print(
    round((end - start) / 60, 2),
)

39.67
