In [5]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import numpy as np
import pandas as pd
import re

In [9]:
class remax():
    '''
    scrapes the information on house offers from RE/MAX website and puts them in a dataframe
    '''
    def __init__(self,link):
        '''
        the class takes one link as an argument, of two possible, one gives results for selling appartments and the other for renting
        '''
        self.link = link
        self.request = requests.get(link)
        self.request.encoding = 'UTF-8'
        self.soup = BeautifulSoup(self.request.text,'html')
        self.ulrs = self.getUrls()
        self.prices = self.getAllPrice()
        self.addresses = self.getAllAddress()
        self.areas  = self.getAllArea()
        self.pricesm2 = self.pricem2()
        self.df = self.makeDf() 
        
    def getUrls(self):
        '''
        returns list of all pages we will scrape from
        '''
        last_page = int(self.soup.findAll('li',{'class':"page-item d-none d-sm-block"})[-1].find('a')['href'][-2:])  
        nums = list(range(1,last_page+1))
        url_list = ["{}&stranka={}".format(self.link, str(page)) for page in nums]
        self.url_list = url_list
        return url_list
    
    def replacetn(self,char):
        '''
        removes unwanted strings from values
        '''
        char = [a.replace('\n', '') for a in char]
        char = [a.replace('\t', '') for a in char]
        char = [a.replace('\xa0', '') for a in char]
        char = [a.replace('(za nemovitost) ', '') for a in char]
        char = [a.replace('(za měsíc)', '') for a in char]
        return char
    
    def getAllAddress(self):
        '''
        returns a list of addresses of all houses
        '''
        addr_list = []
        for url in self.ulrs:
            houses = self.soup.findAll('div', {'class':"pl-items__item-info"})
            addr = self.replacetn([house.find('p').text for house in houses])
            for addr1 in addr:
                addr_list.append(addr1.split(','))
            addr_list = self.getNiceAddress(addr_list)
        flat_list = [ item for elem in addr_list for item in elem]
        self.addr_list = flat_list
        return flat_list
    
    def getNiceAddress(self, list):
        '''
        returns only a part of the address we care about
        '''
        alist = []
        for addr in list:
            addr = [s for s in addr if "Praha" in s]
            alist.append(addr)
        return alist
        
    
    def getAllPrice(self):
        '''
        returns a list of prices of all houses
        '''
        price_list = []
        for url in self.ulrs:
            houses = self.soup.findAll('div', {'class':"pl-items__item-price"})
            prices = self.replacetn([house.find('strong').text for house in houses])
            for price1 in prices:
                price_list.append(price1)
        self.price_list = price_list
        return price_list

    def getAllArea(self):
        '''
        returns a list of the size of all houses
        '''
        area_list = []
        for url in self.ulrs:
            houses = self.soup.findAll('h2')
            areas = self.replacetn([house.find('strong').text for house in houses])
            areas = self.getNum(areas)
            for area1 in areas:
                area_list.append(area1)
        self.area_list = area_list
        return area_list

    def getNum(self,char):
        '''
        from a really long string returns only integer equal to the squared meters the house has
        '''
        char = [a.split('m²') for a in char]
        newchar = [item[0] for item in char]
        newchar_final = [a.split() for a in newchar]
        newchar_ultra_final = [item[-1] for item in newchar_final]
        return newchar_ultra_final
    
    def pricem2(self):
        '''
        returns price per squared meter of the house
        set to NaN if either price or area has values that are not an cenvertible to integer
        '''
        price_num = []
        for i in range(0,len(self.prices)):
            price_num.append(self.prices[i].split("Kč")[0])
        price_m2_list = []
        for j in range(0,len(self.prices)):
            try:
                price_m2_list.append(round(int(price_num[j]) / int(self.areas[j])))
            except:
                price_m2_list.append(np.nan)
        self.price_m2_list = price_m2_list
        return price_m2_list
    
    def makeDf(self):
        '''
        makes dataset of the data and
        cleans the data to show only what if important
        '''
        data = {'Price':self.price_list,'Area':self.area_list,'Location':self.addr_list,'Price/m_sq': self.price_m2_list}
        df = pd.DataFrame(data)
        df = df[~df.Price.str.contains('Cena na vyžádání v kanceláři')]
        df = df[~df.Price.str.contains('Prodáno')]
        df = df[~df.Price.str.contains('(za m)')]
        df = df[~df.Area.str.contains('a')]
        df = df[~df.Area.str.contains('e')]
        df = df[~df.Area.str.contains('i')]
        df = df[~df.Area.str.contains('o')]
        df = df[~df.Area.str.contains('u')] 
        df.reset_index(drop=True,inplace=True)
        return df

In [3]:
#first link is for selling and second is for renting building
links = ['https://www.remax-czech.cz/reality/vyhledavani/?regions%5B19%5D=on&sale=1','https://www.remax-czech.cz/reality/vyhledavani/?regions%5B19%5D=on&sale=2']

In [10]:
#dataset for houses sold
remax_buy = remax(links[0]).df
remax_buy

Unnamed: 0,Price,Area,Location,Price/m_sq
0,8990000Kč,77,Praha 9– Klánovice,116753
1,3500000Kč,45,Praha 4– Chodov,77778
2,7000000Kč,63,Praha 10– Vršovice,111111
3,5880000Kč,74,Praha 9– Letňany,79459
4,5000000Kč,634,Praha 7– Troja,7886
...,...,...,...,...
730,4950000Kč,73,Praha 10– Dolní Měcholupy,67808
731,4790000Kč,79,Praha 10– Petrovice,60633
732,8450000Kč,84,Praha 3– Žižkov,100595
733,12835805Kč,101,Praha 5– Smíchov,127087


In [186]:
#dataset for houses rented
remax_rent = remax(links[1]).df
remax_rent

Unnamed: 0,Price,Area,Location,Price/m_sq
0,25000Kč,89,Praha 2– Nové Město,281.0
1,25000Kč,87,Praha 8– Libeň,287.0
2,25000Kč,65,Praha 5– Smíchov,385.0
3,15000Kč,52,Praha 6– Břevnov,288.0
4,17500Kč,59,Praha 4– Kunratice,297.0
...,...,...,...,...
495,12000Kč,33,Praha 9– Vysočany,364.0
496,15000Kč,45,Praha 7– Holešovice,333.0
497,16000Kč,30,Praha 4– Chodov,533.0
498,12500Kč,36,Praha 9– Čakovice,347.0
