In [1]:
from bs4 import BeautifulSoup
import requests
import re
import csv

In [2]:
# Basic Website URL
root_url = 'https://cheb.ws/prodam/kvartira/'

# Get HTML contents of the first listing page via HTTP request.
# Each handler reports the failure class and then re-raises: carrying on
# would either hit a NameError on `r` (fresh kernel) or silently parse a
# stale response left over from an earlier run of this cell.
try:
    r = requests.get(root_url, timeout=3)
    r.raise_for_status()
except requests.exceptions.HTTPError as err:
    print('HTTP Error:', err)
    raise
except requests.exceptions.ConnectionError as err:
    print('Error Connecting:', err)
    raise
except requests.exceptions.Timeout as err:
    print('Timeout Error:', err)
    raise
except requests.exceptions.RequestException as err:
    print('Oops, Unknown Error', err)
    raise

soup = BeautifulSoup(r.content, 'html.parser')

# Select the first div carrying the page selector class
pages_divs = soup.find('div', class_='link_bar')
if pages_divs is None:
    raise RuntimeError(f"No <div class='link_bar'> pagination block found on {root_url}")

# Select href attributes of the pagination links (anchors without href are ignored)
hrefs = [a['href'] for a in pages_divs.find_all('a', href=True)]
if len(hrefs) < 2:
    raise RuntimeError(f'Expected at least two pagination links on {root_url}, got {len(hrefs)}')

# Drop the trailing link so that the last remaining href is the highest
# numbered page (presumably the dropped one is a "next page" arrow -- verify
# against the site markup).
hrefs.pop()

# Find the highest page index to create the array of URLs
last_page_match = re.search(r'\d+', hrefs[-1])
if last_page_match is None:
    raise RuntimeError(f'No page number found in pagination link {hrefs[-1]!r}')
num_of_pages = int(last_page_match.group())
print(f'There are {num_of_pages + 1} pages of apartments on the website overall')

# Fill pages array with URL values to process data later.
# The count printed above is num_of_pages + 1, i.e. the un-paginated root page
# plus ?page=1..num_of_pages, so the root page is included here as well.
# NOTE(review): this assumes pagination is zero-based; if ?page=1 turns out to
# be the same listing as the root page, the duplicate links are removed by the
# later de-duplication step.
pages = [root_url] + [root_url + f'?page={page}' for page in range(1, num_of_pages + 1)]

There are 262 pages of apartments on the website overall


In [3]:
%%time

apartments = []

# Process and retreive URL data in a loop

for page in pages:

    r = requests.get(page)
    
    soup = BeautifulSoup(r.content, 'html.parser')

    table = soup.find('table', {'class': 'map'})
    rows = table.find_all('tr')[1:]
    
    for row in rows:
        
        link = row.find('td', {'class': 'col-type'}).find('a', {'class': 'titleline'})['href']
        apartments.append(link)

CPU times: user 35 s, sys: 470 ms, total: 35.5 s
Wall time: 4min 31s


In [4]:
# Select unique URLs.
# dict.fromkeys drops duplicates while keeping first-seen order, so the result
# is the same on every run; list(set(...)) yields an order that changes between
# kernel sessions because string hashing is randomized.

apartments = list(dict.fromkeys(apartments))

In [6]:
# Write links to a CSV file: one quoted link per line.
# Each link is written as its own single-column row rather than one giant row
# joined with a newline "delimiter". newline='' lets the csv module control
# line endings itself (as the csv documentation requires), and the encoding is
# pinned so the file does not depend on the platform default.

with open('apartments_links.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f, quoting=csv.QUOTE_ALL, lineterminator='\n')
    writer.writerows([link] for link in apartments)