### 0. Import libraries

In [None]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import time, random
from datetime import datetime
from urllib.parse import urljoin

# Request headers passed to every requests call in this notebook.  The
# User-Agent is a desktop Chrome string (presumably so the site serves its
# regular HTML rather than rejecting the default python-requests agent).
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36"

headers = {"User-Agent": _USER_AGENT}

### 1. Stanovi

In [None]:
BASE = "https://www.oglasi.rs"
url = "https://www.oglasi.rs/nekretnine/prodaja-stanova?p=1"


def get_next_url(soup):
    """Return the absolute URL of the next results page, or None if there is none.

    The "next" link is recognised by its text containing "sledeća".  It is
    searched for first in ``ul.pagination.hidden-xs`` and then in
    ``ul.pager.visible-xs`` (presumably the desktop and mobile variants of
    the same pager -- confirm against the site markup).

    Args:
        soup: Parsed results page (a BeautifulSoup document).

    Returns:
        The link's href resolved against ``BASE``, or ``None`` when no usable
        "next" link exists.
    """
    def is_next_link(a):
        # A "#" href is a placeholder, not a real page.  It must be rejected in
        # BOTH lookups: urljoin(BASE, "#") resolves to the site root, which
        # would make the crawler fetch the homepage as if it were a results page.
        return (
            a is not None
            and "sledeća" in a.get_text(strip=True).lower()
            and a["href"] != "#"
        )

    for li in soup.select("ul.pagination.hidden-xs li"):
        a = li.find("a", href=True)
        if is_next_link(a):
            return urljoin(BASE, a["href"])

    a = soup.select_one("ul.pager.visible-xs li.next a[href]")
    if is_next_link(a):
        return urljoin(BASE, a["href"])
    return None

In [None]:
rows = []
visited = set()  # pages already fetched; stops the crawl if pagination loops back

# A single Session reuses the HTTP connection across pages instead of opening a
# new one per request (cookies set by the site are also kept between pages).
with requests.Session() as session:
    session.headers.update(headers)

    while url:
        visited.add(url)
        r = session.get(url, timeout=30)
        r.raise_for_status()  # any HTTP error status aborts the crawl
        doc = BeautifulSoup(r.text, "html.parser")

        # Each listing on a results page is an <article itemprop="itemListElement">.
        ads = doc.find_all("article", {"itemprop": "itemListElement"})

        for ad in ads:
            row = {}

            # find() returns None when an element is missing, so every field is
            # checked before use: a malformed ad produces None values instead of
            # an AttributeError that would abort the whole crawl.
            title_tag = ad.find('h2', {"itemprop": "name"})
            row['naslov'] = title_tag.text if title_tag is not None else None

            link_tag = ad.find('a')
            row['link'] = link_tag.get('href') if link_tag is not None else None

            lok = ad.find_all("a", {"itemprop": "category"})
            row['lokacija'] = " / ".join(a.get_text(strip=True) for a in lok) if lok else None

            # Attribute boxes: the label is matched in the box text and the
            # value is read from its <strong> child.  A key is only set when
            # its label is present in the ad.
            for div in ad.select("div.col-sm-6"):
                text = div.get_text(strip=True)
                strong = div.find("strong")
                value = strong.get_text(strip=True) if strong is not None else None

                if "Sobnost" in text:
                    row["sobnost"] = value
                elif "Kvadratura" in text:
                    row["kvadratura"] = value
                elif "Nivo u zgradi" in text:
                    row["sprat"] = value

            price_tag = ad.find(class_="text-price")
            if price_tag is not None:
                cena_raw = price_tag.get_text(" ", strip=True)
                # Replace non-breaking spaces with ordinary ones.
                row['cena'] = cena_raw.replace("\xa0", " ").strip()
            else:
                # No price element: keep the site's explicit "price not stated"
                # marker when present, otherwise record a missing value.
                muted = ad.select_one("span.text-muted strong")
                if muted is not None and "cena nije navedena" in muted.get_text(strip=True).lower():
                    row['cena'] = "Cena nije navedena"
                else:
                    row['cena'] = None

            desc_tag = ad.find('p', {"itemprop": "description"})
            row['opis'] = desc_tag.text if desc_tag is not None else None

            time_tag = ad.find("time")
            row["obnovljen"] = time_tag.get_text(strip=True) if time_tag is not None else None

            # First match wins: .visible-sm, then <cite>, then <small>.
            ag_tag = ad.find(class_="visible-sm") or ad.find("cite") or ad.find("small")
            row["agencija"] = ag_tag.get_text(strip=True) if ag_tag is not None else None

            img = ad.find("img")
            row['slika'] = img.get("src") if img is not None else None

            rows.append(row)

        next_url = get_next_url(doc)
        if not next_url or next_url in visited:
            break
        url = next_url
        # Random pause between page requests.
        time.sleep(random.uniform(0.5, 3.0))

In [None]:
# Number of ad rows currently held in `rows`.
print(len(rows))

In [None]:
# Build the apartments ("stanovi") table from the scraped row dicts; the bare
# name on the last line is what a notebook cell displays.
stanovi = pd.DataFrame(rows)
stanovi

### 2. Kuće

In [None]:
BASE = "https://www.oglasi.rs"
url = "https://www.oglasi.rs/nekretnine/prodaja-kuca?p=1"


def get_next_url(soup):
    """Return the absolute URL of the next results page, or None if there is none.

    The "next" link is recognised by its text containing "sledeća".  It is
    searched for first in ``ul.pagination.hidden-xs`` and then in
    ``ul.pager.visible-xs`` (presumably the desktop and mobile variants of
    the same pager -- confirm against the site markup).

    Args:
        soup: Parsed results page (a BeautifulSoup document).

    Returns:
        The link's href resolved against ``BASE``, or ``None`` when no usable
        "next" link exists.
    """
    def is_next_link(a):
        # A "#" href is a placeholder, not a real page.  It must be rejected in
        # BOTH lookups: urljoin(BASE, "#") resolves to the site root, which
        # would make the crawler fetch the homepage as if it were a results page.
        return (
            a is not None
            and "sledeća" in a.get_text(strip=True).lower()
            and a["href"] != "#"
        )

    for li in soup.select("ul.pagination.hidden-xs li"):
        a = li.find("a", href=True)
        if is_next_link(a):
            return urljoin(BASE, a["href"])

    a = soup.select_one("ul.pager.visible-xs li.next a[href]")
    if is_next_link(a):
        return urljoin(BASE, a["href"])
    return None

In [None]:
rows = []
visited = set()  # pages already fetched; stops the crawl if pagination loops back

# A single Session reuses the HTTP connection across pages instead of opening a
# new one per request (cookies set by the site are also kept between pages).
with requests.Session() as session:
    session.headers.update(headers)

    while url:
        visited.add(url)
        r = session.get(url, timeout=30)
        r.raise_for_status()  # any HTTP error status aborts the crawl
        doc = BeautifulSoup(r.text, "html.parser")

        # Each listing on a results page is an <article itemprop="itemListElement">.
        ads = doc.find_all("article", {"itemprop": "itemListElement"})

        for ad in ads:
            row = {}

            # find() returns None when an element is missing, so every field is
            # checked before use: a malformed ad produces None values instead of
            # an AttributeError that would abort the whole crawl.
            title_tag = ad.find('h2', {"itemprop": "name"})
            row['naslov'] = title_tag.text if title_tag is not None else None

            link_tag = ad.find('a')
            row['link'] = link_tag.get('href') if link_tag is not None else None

            lok = ad.find_all("a", {"itemprop": "category"})
            row['lokacija'] = " / ".join(a.get_text(strip=True) for a in lok) if lok else None

            # Attribute boxes: the label is matched in the box text and the
            # value is read from its <strong> child.  A key is only set when
            # its label is present in the ad.
            for div in ad.select("div.col-sm-6"):
                text = div.get_text(strip=True)
                strong = div.find("strong")
                value = strong.get_text(strip=True) if strong is not None else None

                if "Sobnost" in text:
                    row["sobnost"] = value
                elif "Kvadratura" in text:
                    row["kvadratura"] = value
                elif "Nivo u zgradi" in text:
                    row["sprat"] = value

            price_tag = ad.find(class_="text-price")
            if price_tag is not None:
                cena_raw = price_tag.get_text(" ", strip=True)
                # Replace non-breaking spaces with ordinary ones.
                row['cena'] = cena_raw.replace("\xa0", " ").strip()
            else:
                # No price element: keep the site's explicit "price not stated"
                # marker when present, otherwise record a missing value.
                muted = ad.select_one("span.text-muted strong")
                if muted is not None and "cena nije navedena" in muted.get_text(strip=True).lower():
                    row['cena'] = "Cena nije navedena"
                else:
                    row['cena'] = None

            desc_tag = ad.find('p', {"itemprop": "description"})
            row['opis'] = desc_tag.text if desc_tag is not None else None

            time_tag = ad.find("time")
            row["obnovljen"] = time_tag.get_text(strip=True) if time_tag is not None else None

            # First match wins: .visible-sm, then <cite>, then <small>.
            ag_tag = ad.find(class_="visible-sm") or ad.find("cite") or ad.find("small")
            row["agencija"] = ag_tag.get_text(strip=True) if ag_tag is not None else None

            img = ad.find("img")
            row['slika'] = img.get("src") if img is not None else None

            rows.append(row)

        next_url = get_next_url(doc)
        if not next_url or next_url in visited:
            break
        url = next_url
        # Random pause between page requests.
        time.sleep(random.uniform(0.5, 3.0))

In [None]:
# Number of ad rows currently held in `rows`.
print(len(rows))

In [None]:
# Build the houses ("kuće") table from the scraped row dicts; the bare name on
# the last line is what a notebook cell displays.
kuce = pd.DataFrame(rows)
kuce

### 3. Lokal, poslovni prostor

In [None]:
BASE = "https://www.oglasi.rs"
url = "https://www.oglasi.rs/nekretnine/prodaja-poslovnog-prostora?p=1"


def get_next_url(soup):
    """Return the absolute URL of the next results page, or None if there is none.

    The "next" link is recognised by its text containing "sledeća".  It is
    searched for first in ``ul.pagination.hidden-xs`` and then in
    ``ul.pager.visible-xs`` (presumably the desktop and mobile variants of
    the same pager -- confirm against the site markup).

    Args:
        soup: Parsed results page (a BeautifulSoup document).

    Returns:
        The link's href resolved against ``BASE``, or ``None`` when no usable
        "next" link exists.
    """
    def is_next_link(a):
        # A "#" href is a placeholder, not a real page.  It must be rejected in
        # BOTH lookups: urljoin(BASE, "#") resolves to the site root, which
        # would make the crawler fetch the homepage as if it were a results page.
        return (
            a is not None
            and "sledeća" in a.get_text(strip=True).lower()
            and a["href"] != "#"
        )

    for li in soup.select("ul.pagination.hidden-xs li"):
        a = li.find("a", href=True)
        if is_next_link(a):
            return urljoin(BASE, a["href"])

    a = soup.select_one("ul.pager.visible-xs li.next a[href]")
    if is_next_link(a):
        return urljoin(BASE, a["href"])
    return None

In [None]:
rows = []
visited = set()  # pages already fetched; stops the crawl if pagination loops back

# A single Session reuses the HTTP connection across pages instead of opening a
# new one per request (cookies set by the site are also kept between pages).
with requests.Session() as session:
    session.headers.update(headers)

    while url:
        visited.add(url)
        r = session.get(url, timeout=30)
        r.raise_for_status()  # any HTTP error status aborts the crawl
        doc = BeautifulSoup(r.text, "html.parser")

        # Each listing on a results page is an <article itemprop="itemListElement">.
        ads = doc.find_all("article", {"itemprop": "itemListElement"})

        for ad in ads:
            row = {}

            # find() returns None when an element is missing, so every field is
            # checked before use: a malformed ad produces None values instead of
            # an AttributeError that would abort the whole crawl.
            title_tag = ad.find('h2', {"itemprop": "name"})
            row['naslov'] = title_tag.text if title_tag is not None else None

            link_tag = ad.find('a')
            row['link'] = link_tag.get('href') if link_tag is not None else None

            lok = ad.find_all("a", {"itemprop": "category"})
            row['lokacija'] = " / ".join(a.get_text(strip=True) for a in lok) if lok else None

            # Attribute boxes: the label is matched in the box text and the
            # value is read from its <strong> child.  A key is only set when
            # its label is present in the ad.
            for div in ad.select("div.col-sm-6"):
                text = div.get_text(strip=True)
                strong = div.find("strong")
                value = strong.get_text(strip=True) if strong is not None else None

                if "Sobnost" in text:
                    row["sobnost"] = value
                elif "Kvadratura" in text:
                    row["kvadratura"] = value
                elif "Nivo u zgradi" in text:
                    row["sprat"] = value

            price_tag = ad.find(class_="text-price")
            if price_tag is not None:
                cena_raw = price_tag.get_text(" ", strip=True)
                # Replace non-breaking spaces with ordinary ones.
                row['cena'] = cena_raw.replace("\xa0", " ").strip()
            else:
                # No price element: keep the site's explicit "price not stated"
                # marker when present, otherwise record a missing value.
                muted = ad.select_one("span.text-muted strong")
                if muted is not None and "cena nije navedena" in muted.get_text(strip=True).lower():
                    row['cena'] = "Cena nije navedena"
                else:
                    row['cena'] = None

            desc_tag = ad.find('p', {"itemprop": "description"})
            row['opis'] = desc_tag.text if desc_tag is not None else None

            time_tag = ad.find("time")
            row["obnovljen"] = time_tag.get_text(strip=True) if time_tag is not None else None

            # First match wins: .visible-sm, then <cite>, then <small>.
            ag_tag = ad.find(class_="visible-sm") or ad.find("cite") or ad.find("small")
            row["agencija"] = ag_tag.get_text(strip=True) if ag_tag is not None else None

            img = ad.find("img")
            row['slika'] = img.get("src") if img is not None else None

            rows.append(row)

        next_url = get_next_url(doc)
        if not next_url or next_url in visited:
            break
        url = next_url
        # Random pause between page requests.
        time.sleep(random.uniform(0.5, 3.0))

In [None]:
# Number of ad rows currently held in `rows`.
print(len(rows))

In [None]:
# Build the business-premises ("lokali") table from the scraped row dicts.
lokali = pd.DataFrame(rows)

### 4. Vikendice

In [None]:
BASE = "https://www.oglasi.rs"
url = "https://www.oglasi.rs/nekretnine/prodaja-vikendica?p=1"


def get_next_url(soup):
    """Return the absolute URL of the next results page, or None if there is none.

    The "next" link is recognised by its text containing "sledeća".  It is
    searched for first in ``ul.pagination.hidden-xs`` and then in
    ``ul.pager.visible-xs`` (presumably the desktop and mobile variants of
    the same pager -- confirm against the site markup).

    Args:
        soup: Parsed results page (a BeautifulSoup document).

    Returns:
        The link's href resolved against ``BASE``, or ``None`` when no usable
        "next" link exists.
    """
    def is_next_link(a):
        # A "#" href is a placeholder, not a real page.  It must be rejected in
        # BOTH lookups: urljoin(BASE, "#") resolves to the site root, which
        # would make the crawler fetch the homepage as if it were a results page.
        return (
            a is not None
            and "sledeća" in a.get_text(strip=True).lower()
            and a["href"] != "#"
        )

    for li in soup.select("ul.pagination.hidden-xs li"):
        a = li.find("a", href=True)
        if is_next_link(a):
            return urljoin(BASE, a["href"])

    a = soup.select_one("ul.pager.visible-xs li.next a[href]")
    if is_next_link(a):
        return urljoin(BASE, a["href"])
    return None

In [None]:
rows = []
visited = set()  # pages already fetched; stops the crawl if pagination loops back

# A single Session reuses the HTTP connection across pages instead of opening a
# new one per request (cookies set by the site are also kept between pages).
with requests.Session() as session:
    session.headers.update(headers)

    while url:
        visited.add(url)
        r = session.get(url, timeout=30)
        r.raise_for_status()  # any HTTP error status aborts the crawl
        doc = BeautifulSoup(r.text, "html.parser")

        # Each listing on a results page is an <article itemprop="itemListElement">.
        ads = doc.find_all("article", {"itemprop": "itemListElement"})

        for ad in ads:
            row = {}

            # find() returns None when an element is missing, so every field is
            # checked before use: a malformed ad produces None values instead of
            # an AttributeError that would abort the whole crawl.
            title_tag = ad.find('h2', {"itemprop": "name"})
            row['naslov'] = title_tag.text if title_tag is not None else None

            link_tag = ad.find('a')
            row['link'] = link_tag.get('href') if link_tag is not None else None

            lok = ad.find_all("a", {"itemprop": "category"})
            row['lokacija'] = " / ".join(a.get_text(strip=True) for a in lok) if lok else None

            # Attribute boxes: the label is matched in the box text and the
            # value is read from its <strong> child.  A key is only set when
            # its label is present in the ad.
            for div in ad.select("div.col-sm-6"):
                text = div.get_text(strip=True)
                strong = div.find("strong")
                value = strong.get_text(strip=True) if strong is not None else None

                if "Sobnost" in text:
                    row["sobnost"] = value
                elif "Kvadratura" in text:
                    row["kvadratura"] = value
                elif "Nivo u zgradi" in text:
                    row["sprat"] = value

            price_tag = ad.find(class_="text-price")
            if price_tag is not None:
                cena_raw = price_tag.get_text(" ", strip=True)
                # Replace non-breaking spaces with ordinary ones.
                row['cena'] = cena_raw.replace("\xa0", " ").strip()
            else:
                # No price element: keep the site's explicit "price not stated"
                # marker when present, otherwise record a missing value.
                muted = ad.select_one("span.text-muted strong")
                if muted is not None and "cena nije navedena" in muted.get_text(strip=True).lower():
                    row['cena'] = "Cena nije navedena"
                else:
                    row['cena'] = None

            desc_tag = ad.find('p', {"itemprop": "description"})
            row['opis'] = desc_tag.text if desc_tag is not None else None

            time_tag = ad.find("time")
            row["obnovljen"] = time_tag.get_text(strip=True) if time_tag is not None else None

            # First match wins: .visible-sm, then <cite>, then <small>.
            ag_tag = ad.find(class_="visible-sm") or ad.find("cite") or ad.find("small")
            row["agencija"] = ag_tag.get_text(strip=True) if ag_tag is not None else None

            img = ad.find("img")
            row['slika'] = img.get("src") if img is not None else None

            rows.append(row)

        next_url = get_next_url(doc)
        if not next_url or next_url in visited:
            break
        url = next_url
        # Random pause between page requests.
        time.sleep(random.uniform(0.5, 3.0))

In [None]:
# Number of ad rows currently held in `rows`.
print(len(rows))

In [None]:
# Build the weekend-houses ("vikendice") table from the scraped row dicts; the
# bare name on the last line is what a notebook cell displays.
vikendice = pd.DataFrame(rows)
vikendice

### 5. Merge

In [None]:
# Stack the four per-category tables into one frame; ignore_index=True
# discards the per-table indexes and renumbers the combined rows.
frames = [vikendice, lokali, kuce, stanovi]
oglasi = pd.concat(frames, ignore_index=True)
oglasi.head()

In [None]:
# Save the combined table as "prodaja <YYYY-MM-DD>.csv" (today's local date),
# without writing the DataFrame index as a column.
timestamp = f"{datetime.now():%Y-%m-%d}"
filename = "prodaja " + timestamp + ".csv"
oglasi.to_csv(filename, index=False)