In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re

In [3]:
def initialize_property_dict():
    """Create a blank record for a single listing.

    Returns:
        dict: A new dictionary (a separate object on every call) holding
        the eight scraped fields, each preset to the placeholder string
        'Not Available' so fields that are never found stay recognisable.
    """
    # Order matters: it becomes the column order of the resulting DataFrame.
    field_names = (
        'price_total',
        'location',
        'property_type',
        'square_meters',
        'construction_type',
        'construction_year',
        'floor_number',
        'floor_type',
    )
    return dict.fromkeys(field_names, 'Not Available')

In [5]:
def process_vip_params(listing, data):
    """Copy the parameters of a VIP listing into ``data``.

    VIP listings expose their parameters as ``<span class="ads-params-multi">``
    elements whose ``title`` attribute names the parameter. The value is the
    span text after the first ':' (or the whole text when there is no ':').

    Square meters and construction year are reduced to their digits, the
    same clean-up ``process_regular_params`` performs, so both listing kinds
    produce comparable values. When no digits are found the raw text is kept.

    Args:
        listing: Parsed listing element; only ``find_all`` is used on it.
        data: Record dictionary to update in place. Only keys for parameters
            actually present in the listing are overwritten.

    Returns:
        None. ``data`` is mutated.
    """
    param_map = {
        'Вид на имота': 'property_type',
        'Квадратура': 'square_meters',
        'Вид строителство': 'construction_type',
        'Година на строителство': 'construction_year',
        'Номер на етажа': 'floor_number',
        'Етаж': 'floor_type'
    }
    # Fields that should contain only a number, and the pattern extracting it.
    numeric_patterns = {
        'square_meters': r'(\d+)',
        'construction_year': r'(\d{4})',
    }
    for param in listing.find_all('span', class_='ads-params-multi'):
        key = param_map.get(param.get('title'))
        if key is None:
            # Span without a title, or a parameter we do not collect.
            continue
        # Drop an optional "Label:" prefix and keep only the value part.
        value = param.text.strip().split(':', 1)[-1].strip()
        pattern = numeric_patterns.get(key)
        if pattern is not None:
            match = re.search(pattern, value)
            if match:
                value = match.group(1)
        data[key] = value

def process_regular_params(listing, data):
    """Copy the parameters of a non-VIP listing into ``data``.

    These listings hold their parameters in ``div.ads-params-table`` blocks
    made of ``div.ads-params-row`` rows. Each row has a title cell
    (``div.ads-param-title``) and a value cell (``div.ads-params-cell``).

    Rows lacking a title cell or a value cell, and rows whose title is not
    one of the collected parameters, are skipped. Skipping a row with no
    value cell (rather than raising) keeps one malformed row from aborting
    the parsing of the whole page.

    Square meters are reduced to the first run of digits and the construction
    year to the first four-digit run; if no digits match, the raw text is kept.

    Args:
        listing: Parsed listing element; only ``find_all`` is used on it.
        data: Record dictionary to update in place.

    Returns:
        None. ``data`` is mutated.
    """
    param_map = {
        'Вид на имота': 'property_type',
        'Квадратура': 'square_meters',
        'Вид строителство': 'construction_type',
        'Година на строителство': 'construction_year',
        'Номер на етажа': 'floor_number',
        'Етаж': 'floor_type'
    }
    # Titles whose value should contain only a number, with the pattern
    # used to extract it.
    numeric_patterns = {
        'Квадратура': r'(\d+)',
        'Година на строителство': r'(\d{4})',
    }
    for table in listing.find_all('div', class_='ads-params-table'):
        for row in table.find_all('div', class_='ads-params-row'):
            title_div = row.find('div', class_='ads-param-title')
            if title_div is None:
                continue
            # Titles are rendered like "Квадратура:"; drop the colon.
            param_title = title_div.text.strip().replace(':', '')
            if param_title not in param_map:
                continue

            value_div = row.find('div', class_='ads-params-cell')
            if value_div is None:
                # Title present but no value cell: nothing to record.
                continue
            param_value = value_div.text.strip()

            pattern = numeric_patterns.get(param_title)
            if pattern is not None:
                match = re.search(pattern, param_value)
                if match:
                    param_value = match.group(1)
            data[param_map[param_title]] = param_value

In [7]:
def scrape_property_page(url, timeout=30):
    """Download one results page and build a record for every listing on it.

    Listings are the ``div`` elements with class ``listtop-item`` or
    ``listvip-item``. For each one a fresh record is created, the location
    and price are filled in when found, and the remaining parameters are
    read by ``process_vip_params`` (VIP listings) or
    ``process_regular_params`` (all others).

    This is best-effort: any failure while fetching or parsing the page is
    printed and an empty list is returned, so a caller looping over many
    pages keeps going.

    Args:
        url: Address of the results page to fetch.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection fails instead of blocking forever.

    Returns:
        list[dict]: One record per listing, or ``[]`` on any error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # requests applies no timeout unless one is given explicitly.
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        listings = soup.find_all('div', class_=['listtop-item', 'listvip-item'])
        data = []
        for listing in listings:
            property_data = initialize_property_dict()
            is_vip = 'listvip-item' in listing.get('class', [])

            location_div = listing.find('div', class_=re.compile('list(top|vip)-item-address'))
            if location_div:
                location_text = location_div.get_text(' ', strip=True)
                # Strip any leading characters that are not Cyrillic/Latin letters.
                location_text = re.sub(r'^[^а-яА-Яa-zA-Z]*', '', location_text)
                property_data['location'] = location_text.strip()

            # The first "nowrap" span in the listing is taken as the price.
            price_spans = listing.find_all('span', class_='nowrap')
            if price_spans:
                property_data['price_total'] = price_spans[0].text.strip()

            if is_vip:
                process_vip_params(listing, property_data)
            else:
                process_regular_params(listing, property_data)
            data.append(property_data)
        return data
    except Exception as e:
        # Deliberately broad: report the problem and let the caller continue.
        print(f"Error scraping {url}: {str(e)}")
        return []

In [9]:
# Results URL without the page number; the number is appended per request.
base_url = "https://www.alo.bg/obiavi/imoti-prodajbi/apartamenti-stai/?region_id=22&location_ids=4342%40page%3D2%40page%3D3A%40page%3D2&page="

# Accumulates the records from every page, in page order.
all_properties = []

# Walk result pages 2 through 269 (the end of range() is exclusive).
for page_num in range(2, 270):
    page_url = base_url + str(page_num)
    print(f"Scraping page {page_num}...")
    page_data = scrape_property_page(page_url)
    # A failed page yields an empty list, so this simply adds nothing.
    all_properties += page_data
    # Pause between requests.
    time.sleep(2)

print("Scraping completed!")

Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
Scraping page 51...
Scraping

In [71]:
# One row per scraped listing; columns come from the record keys.
df = pd.DataFrame(data=all_properties)
# Write without the index column, encoded as UTF-8.
df.to_csv(path_or_buf="sofia_real_estate.csv", encoding="utf-8", index=False)
print("Saved as sofia_real_estate.csv")

Saved as sofia_real_estate.csv


In [73]:
# Preview the first rows of the frame.
# NOTE(review): the saved output below includes an 'Unnamed: 0' column, which
# a frame built directly from all_properties would not have - confirm that
# `df` here is the frame created in the cell above and not one reloaded from
# a CSV in a cell that is no longer in the notebook.
df.head()

Unnamed: 0,price_total,location,property_type,square_meters,construction_type,construction_year,floor_number,floor_type
0,246255,"Малинова Долина, София",Многостаен апартамент в София,141,Тухла,2025,1 етаж,Първи жилищен
1,236305,"Бояна, София",Мезонет,134,Тухла,Not Available,Not Available,Not Available
2,239865,"Бояна, София",Тристаен апартамент в София,113,Тухла,Not Available,2 етаж,Not Available
3,160000,"Зона Б18, София",Двустаен апартамент в София,57,Тухла,2005,7 етаж,Непоследен
4,240000,"Дружба 1, София",Тристаен апартамент в София,87,ЕПК/ПК,1987,9 етаж,Непоследен
