In [1]:
# Render matplotlib figures inline, directly below the cell that produces them.
%matplotlib inline

In [2]:
import random
import time
from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Data scraping 

I scrape Mercedes-Benz GLE listings from [mobile.bg](https://www.mobile.bg). For each listing I extract the following fields:
- title - short description of the offer.
- price - listed price of the car, in Bulgarian leva (лв.).
- manufacturing_year - month and year of manufacture.
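- mileage - odometer reading (km).
- color - exterior color.
- fuel - fuel type (petrol, diesel, hybrid, electric, plug-in hybrid).
- hp - engine power (horsepower).
- eurostandard - Euro emission standard.
- cubic_capacity - engine displacement (cc).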
- gearbox - always "автоматична" (automatic), because the search is filtered to automatic gearboxes.
- type - body type of the car (e.g. "Джип" - SUV, "Седан", "Купе").
- region - seller location.
- url - link to the current offer.

In [156]:
# Column order of the resulting DataFrame (also the key order of every listing dict).
_LISTING_COLUMNS = [
    "title",
    "price",
    "manufacturing_year",
    "mileage",
    "color",
    "fuel",
    "hp",
    "eurostandard",
    "cubic_capacity",
    "gearbox",
    "type",
    "region",
    "url",
]

# Lower-case substrings used to recognise which parameter a <span> holds.
_COLOR_KEYWORDS = (
    "бял", "черен", "сив", "син", "червен", "бордо", "зелен", "графит",
    "сребърен", "кафяв", "хамелеон", "металик", "перла", "бежов",
)
_FUEL_KEYWORDS = ("бензинов", "дизелов", "хибриден", "електрически", "plug-in хибрид")
_BODY_TYPE_KEYWORDS = ("джип", "седан", "купе", "стреч лимузина", "хечбек")


def _text_before(text, marker):
    """Return the part of `text` that precedes the first `marker`, stripped."""
    return text.split(marker, 1)[0].strip()


def _parse_params(params_element, car_data):
    """Fill `car_data` in place from the <span> children of a listing's params block."""
    for span in params_element.find_all("span"):
        text = span.text.strip()
        lowered = text.lower()
        # The order of the checks matters: the first matching rule wins.
        if "г." in text:
            car_data["manufacturing_year"] = text
        elif "км" in text:
            # Keep everything before the unit, so values of any width survive.
            car_data["mileage"] = _text_before(text, "км")
        elif any(color in lowered for color in _COLOR_KEYWORDS):
            car_data["color"] = text
        elif any(fuel in lowered for fuel in _FUEL_KEYWORDS):
            car_data["fuel"] = text
        elif "к.с." in text:
            car_data["hp"] = _text_before(text, "к.с.")
        elif "евро" in lowered:
            # Assumes the text starts with the word "Евро" (e.g. "Евро 6") - TODO confirm.
            car_data["eurostandard"] = text[len("Евро"):].strip()
        elif "куб.см" in text:
            car_data["cubic_capacity"] = _text_before(text, "куб.см")
        elif any(car_type in lowered for car_type in _BODY_TYPE_KEYWORDS):
            car_data["type"] = text


def _parse_listing(current_car):
    """Turn one listing <div> into a dict of fields, or None if it has no title link."""
    title_element = current_car.find("a", class_="title saveSlink")
    if title_element is None:
        # Divs matched by the loose "item" class filter that are not real listings.
        return None

    car_data = dict.fromkeys(_LISTING_COLUMNS, np.nan)
    car_data["gearbox"] = "автоматична"
    car_data["title"] = title_element.text.strip()

    # The listing link is taken from the first anchor inside the listing.
    href = current_car.find("a").get("href")
    if href:
        car_data["url"] = "https:" + href if href.startswith("//") else href

    price_element = current_car.find("div", class_="price")
    if price_element is not None:
        price_value = price_element.find("div")
        if price_value is not None:
            car_data["price"] = price_value.text.strip()

    params_element = current_car.find("div", class_="params")
    if params_element is not None:
        _parse_params(params_element, car_data)

    location_element = current_car.find("div", class_="location")
    if location_element is not None:
        location = location_element.text.strip()
        # Keep the text before the first comma (or all of it when there is no
        # comma) and skip a fixed 5-character prefix - presumably a label such
        # as "обл. "; TODO confirm against the page markup.
        car_data["region"] = location.split(",", 1)[0][5:]

    return car_data


def scrape_cars(pages, base_url, timeout=30, verbose=True):
    """Scrape car listings from paginated search results.

    Parameters
    ----------
    pages : int
        Number of result pages to request, starting from page 1.
    base_url : str
        URL of the first result page; page N > 1 is requested at
        ``f"{base_url}/p-{N}"``.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on that page.
    verbose : bool, optional
        When True (the default), print every scraped listing.

    Returns
    -------
    pandas.DataFrame
        One row per listing with the columns in ``_LISTING_COLUMNS``. Fields
        that could not be extracted are NaN. The columns are present even when
        no listing was scraped.

    Notes
    -----
    A page that cannot be fetched is reported and skipped; the remaining pages
    are still scraped.
    """
    all_listings = []

    for page in range(1, pages + 1):
        # Random delay between consecutive requests to avoid being blocked.
        # It is placed before the request so it also applies after a failed page.
        if page > 1:
            time.sleep(random.uniform(2, 4))

        page_url = base_url if page == 1 else f"{base_url}/p-{page}"
        print(f"Scraping page {page}/{pages}")

        try:
            response = requests.get(page_url, timeout=timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching page {page}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        listings = soup.find_all("div", class_=lambda x: x and "item" in x)

        for current_car in listings:
            car_data = _parse_listing(current_car)
            if car_data is None:
                continue

            all_listings.append(car_data)
            if verbose:
                print(car_data["title"])
                print(f"{car_data}\n")

    return pd.DataFrame(all_listings, columns=_LISTING_COLUMNS)

In [157]:
# First page of the search results; the path segments select Mercedes-Benz
# GLE-class listings ("avtomatichna", "ot-2015", "namira-se-v-balgariya").
base_url = (
    "https://www.mobile.bg/obiavi/avtomobili-dzhipove/mercedes-benz/"
    "gle-klasa/avtomatichna/ot-2015/namira-se-v-balgariya"
)
# Request 42 result pages and collect every listing into one DataFrame.
cars_data_df = scrape_cars(pages=42, base_url=base_url)

Scraping page 1/42
Mercedes-Benz GLE 450AMG COUPE Дистроник Камери360 Обдухване Keyless
{'title': 'Mercedes-Benz GLE 450AMG COUPE Дистроник Камери360 Обдухване Keyless', 'price': '69 990 лв.', 'manufacturing_year': 'април 2016 г.', 'mileage': '153 000', 'color': 'Черен', 'fuel': 'Бензинов', 'hp': '367', 'eurostandard': '6', 'cubic_capacity': '3000', 'gearbox': 'автоматична', 'type': 'Джип', 'region': 'София', 'url': 'https://www.mobile.bg/obiava-21739610929678661-mercedes-benz-gle-450amg-coupe-distronik-kameri360-obduhvane-keyless'}

Mercedes-Benz GLE 350d Coupe-AMG Line-9G-tronic-Harmon Kardon
{'title': 'Mercedes-Benz GLE 350d Coupe-AMG Line-9G-tronic-Harmon Kardon', 'price': '69 999 лв.', 'manufacturing_year': 'януари 2017 г.', 'mileage': '189 000', 'color': 'Бял', 'fuel': 'Дизелов', 'hp': '258', 'eurostandard': nan, 'cubic_capacity': nan, 'gearbox': 'автоматична', 'type': 'Джип', 'region': 'Пазарджик', 'url': 'https://www.mobile.bg/obiava-21732691514860984-mercedes-benz-gle-350d-cou

In [159]:
# Preview the first rows of the scraped listings.
cars_data_df.head()

Unnamed: 0,title,price,manufacturing_year,mileage,color,fuel,hp,eurostandard,cubic_capacity,gearbox,type,region,url
0,Mercedes-Benz GLE 450AMG COUPE Дистроник Камер...,69 990 лв.,април 2016 г.,153 000,Черен,Бензинов,367,6.0,3000.0,автоматична,Джип,София,https://www.mobile.bg/obiava-21739610929678661...
1,Mercedes-Benz GLE 350d Coupe-AMG Line-9G-troni...,69 999 лв.,януари 2017 г.,189 000,Бял,Дизелов,258,,,автоматична,Джип,Пазарджик,https://www.mobile.bg/obiava-21732691514860984...
2,Mercedes-Benz GLE 43 AMG COUPE Панорама Дистр....,78 990 лв.,октомври 2017 г.,127 000,Син,Бензинов,367,6.0,3000.0,автоматична,Джип,София,https://www.mobile.bg/obiava-11737682127414901...
3,Mercedes-Benz GLE 250 d,45 500 лв.,януари 2015 г.,178 000,Черен,Дизелов,204,,2143.0,автоматична,Джип,Пловдив,https://www.mobile.bg/obiava-21705057744409616...
4,Mercedes-Benz GLE 350 4Matic AMG * ТОП СЪСТОЯНИЕ*,46 000 лв.,юли 2016 г.,251 700,Сив,Бензинов,307,5.0,3500.0,автоматична,Джип,София,https://www.mobile.bg/obiava-21725807501288193...


In [161]:
# (number of listings, number of columns) in the scraped data.
cars_data_df.shape

(836, 13)

### Save the data to a .csv file:

In [162]:
# Name the raw dump after today's date (day-month-year).
current_date = datetime.today().strftime('%d-%m-%Y')
filename = f"data/raw_data_{current_date}.csv"
# to_csv does not create missing directories, so make sure the target folder
# exists; otherwise the cell fails on a fresh checkout.
Path(filename).parent.mkdir(parents=True, exist_ok=True)
cars_data_df.to_csv(filename, index=False)