In [251]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import json
import re
from selenium import webdriver
import datetime as dt
import random

RIGHT_MOVE DATA SCRAPING

In [308]:
def get_page_details_rightMove(response):
    soup = BeautifulSoup(response.text)
    all_script = soup.find_all('script')
    script_filter = [i for i in all_script if len(i.attrs) == 0]
    con = script_filter[2].contents[0][19:]
    new_con = json.loads(con)
    return new_con['properties']

In [309]:
'''
transactionType: one(1) represent sales transaction, and zero(0) represent rent transaction
'''
def get_web_details_rightMove(page, transactionType):
    if transactionType == 1:
        if page == 0:
            response = requests.get(f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=')
            return get_page_details_rightMove(response)
        else:
            response = requests.get(f'https://www.rightmove.co.uk/property-for-sale/find.html?locationIdentifier=REGION%5E87490&index={page}&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=')
            return get_page_details_rightMove(response)
    else:
        if page == 0:
            response = requests.get(f'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&propertyTypes=&includeLetAgreed=false&mustHave=&dontShow=&furnishTypes=&keywords=')
            return get_page_details_rightMove(response)
        else:
            response = requests.get(f'https://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&index={page}&propertyTypes=&includeSSTC=false&mustHave=&dontShow=&furnishTypes=&keywords=')
            return get_page_details_rightMove(response)



In [188]:
listing_sources = ['Rightmove', 'Zoopla', 'On The Market(OTM)']
parent_url = ['https://www.rightmove.co.uk', 'https://www.zoopla.co.uk', 'https://www.onthemarket.com']

pages_dict = {'transaction_type':[], 'bedrooms':[], 'bathrooms':[], 'description':[], 'property_type':[], 'price':[], 'location':[],
           'post_code':[], 'latitude':[],'longitude':[], 'agent':[], 'agent_contact':[], 'listing_source':[], 'listing_url':[], 'date':[]}

In [43]:
def getPriceRegex(value):
    price_regex = '\d+,?\d*,?\d*,?\d*'
    return int(re.search(price_regex, value)[0].replace(",",""))

In [44]:
def getPostcodeRegex(value):
    postCode_regex = '([A-Z]+\d+[A-Z]*)'
    if re.search(postCode_regex, value) is None:
        return None
    else:
        return re.search(postCode_regex, value)[0]

In [303]:
def getActualDate_rightMove(value):
    todays_date = dt.date.today()
    if value == 'today':
        return str(todays_date.day)+"/"+str(todays_date.month)+"/"+str(todays_date.year)
    elif value == 'yesterday':
        td = dt.timedelta(days=1)
        yd = todays_date - td
        return str(yd.day)+"/"+str(yd.month)+"/"+str(yd.year)
    else:
        return value

    
    

In [315]:
#Dataframe for combining all the data from each website
pages_df = pd.DataFrame()

In [310]:



def getData_rightMove(transactionType):
    global pages_dict
    pages_dict = {i:[] for i in pages_dict}
    for i in range(0, 985, 24):
        page_details = get_web_details_rightMove(i, transactionType)
        for j in page_details:
            pages_dict['transaction_type'].append(j['transactionType'])
            pages_dict['bedrooms'].append(j['bedrooms'])
            pages_dict['bathrooms'].append(j['bathrooms'])
            pages_dict['description'].append(j['summary'])
            pages_dict['property_type'].append(j['propertySubType'])
            pages_dict['price'].append(j['price']['amount'] if transactionType == 1 else getPriceRegex(j['price']['displayPrices'][0]['displayPrice']))
            pages_dict['location'].append(j['displayAddress'])
            pages_dict['post_code'].append(getPostcodeRegex(j['displayAddress']))
            pages_dict['latitude'].append(j['location']['latitude'])
            pages_dict['longitude'].append(j['location']['longitude'])
            pages_dict['agent'].append(j['customer']['brandTradingName'])
            pages_dict['agent_contact'].append(j['customer']['contactTelephone'])
            pages_dict['listing_source'].append(listing_sources[0])
            pages_dict['listing_url'].append(parent_url[0] + j['propertyUrl'])
            pages_dict['date'].append(getActualDate_rightMove(j['addedOrReduced'].split(" ").pop()))
    return pages_dict
    


In [317]:
#Scraping rent data from rightMove website
rightMoveRent_df = pd.DataFrame(getData_rightMove(0))
rightMoveRent_df['bathrooms'] = rightMoveRent_df.bathrooms.fillna(0).astype(int)
rightMoveRent_df['bedrooms'] = rightMoveRent_df.bedrooms.fillna(0).astype(int)
rightMoveRent_df['date'] = pd.to_datetime(rightMoveRent_df['date'])

pages_df = pages_df.append(rightMoveRent_df, ignore_index=True)


  rightMoveRent_df['date'] = pd.to_datetime(rightMoveRent_df['date'])
  pages_df = pages_df.append(rightMoveRent_df, ignore_index=True)


In [321]:
#Scraping sales data from rightMove website
rightMoveSale_df = pd.DataFrame(getData_rightMove(1))
rightMoveSale_df['bathrooms'] = rightMoveSale_df.bathrooms.fillna(0).astype(int)
rightMoveSale_df['bedrooms'] = rightMoveSale_df.bedrooms.fillna(0).astype(int)
rightMoveSale_df['date'] = pd.to_datetime(rightMoveSale_df['date'])
rightMoveSale_df['transaction_type'] = rightMoveSale_df['transaction_type'].map(lambda a: 'sale')

pages_df = pages_df.append(rightMoveSale_df, ignore_index=True)

  rightMoveSale_df['date'] = pd.to_datetime(rightMoveSale_df['date'])
  pages_df = pages_df.append(rightMoveSale_df, ignore_index=True)


ZOOPLA DATA SCRAPING

In [377]:
def get_page_details_zoopla(response):
    soup = BeautifulSoup(response)
    zoopla_content = soup.find("script", attrs={'id':'__NEXT_DATA__'}).contents[0]
    zoopla_dict = json.loads(zoopla_content)
    property_list = zoopla_dict['props']['pageProps']['regularListingsFormatted']
    return property_list

In [431]:
def get_web_details_zoopla(page, transactionType):
    if transactionType == 0:
        driver = webdriver.Chrome()
        try:
            driver.get(f'https://www.zoopla.co.uk/to-rent/property/london/?price_frequency=per_month&q=London&results_sort=newest_listings&search_source=to-rent&pn={page}')
            page_details = driver.page_source
            driver.close()
            return get_page_details_zoopla(page_details)
        except:
            driver.refresh()
            page_details = driver.page_source
            driver.close()
            return get_page_details_zoopla(page_details)
    else:
        driver = webdriver.Chrome()
        try:
            driver.get(f'https://www.zoopla.co.uk/for-sale/property/london/?q=London&results_sort=newest_listings&search_source=for-sale&pn={page}')
            page_details = driver.page_source
            driver.close()
            return get_page_details_zoopla(page_details)
        except:
            driver.refresh()
            page_details = driver.page_source
            driver.close()
            return get_page_details_zoopla(page_details)


In [169]:
def getBedAndBathroomData(value, item):
    for i in value:
        if list(i.values()).__contains__(item):
            return int(i['content'])
    return None

In [380]:
def getTransactionType_zoopla(value, transactionType):
    if transactionType == 0:
        if re.search('rent', value) is None:
            return 'rent'
        else:
            return re.search('rent', value)[0]
    else:
        if re.search('sale', value) is None:
            return 'sale'
        else:
            return re.search('sale', value)[0]
    

In [389]:

def getData_zoopla(transactionType):
    global pages_dict
    pages_dict = {i:[] for i in pages_dict}
    for i in range(1, 41):
        page_details = get_web_details_zoopla(i, transactionType)
        for j in page_details:
            pages_dict['transaction_type'].append(getTransactionType_zoopla(j['listingUris']['detail'], transactionType))
            pages_dict['bedrooms'].append(getBedAndBathroomData(j['features'], 'bed'))
            pages_dict['bathrooms'].append(getBedAndBathroomData(j['features'], 'bath'))
            pages_dict['description'].append(j['summaryDescription'])
            pages_dict['property_type'].append(j['propertyType'])
            pages_dict['price'].append(getPriceRegex(j['price']))
            pages_dict['location'].append(j['address'])
            pages_dict['post_code'].append(getPostcodeRegex(j['address']))
            pages_dict['latitude'].append(j['location']['coordinates']['latitude'])
            pages_dict['longitude'].append(j['location']['coordinates']['longitude'])
            pages_dict['agent'].append(j['branch']['name'])
            pages_dict['agent_contact'].append(j['branch']['phone'])
            pages_dict['listing_source'].append(listing_sources[1])
            pages_dict['listing_url'].append(parent_url[1] + j['listingUris']['detail'])
            pages_dict['date'].append(j['lastPublishedDate'][0:10])
    return pages_dict



In [390]:
#Scraping rent data from zoopla website
zooplaRent_df = pd.DataFrame(getData_zoopla(0))
zooplaRent_df['bathrooms'] = zooplaRent_df.bathrooms.fillna(0).astype(int)
zooplaRent_df['bedrooms'] = zooplaRent_df.bedrooms.fillna(0).astype(int)
zooplaRent_df['date'] = pd.to_datetime(zooplaRent_df['date'])

pages_df = pages_df.append(zooplaRent_df, ignore_index=True)

  pages_df = pages_df.append(zooplaRent_df, ignore_index=True)


In [395]:
#Scraping sales data from zoopla website
zooplaSale_df = pd.DataFrame(getData_zoopla(1))
zooplaSale_df['bathrooms'] = zooplaSale_df.bathrooms.fillna(0).astype(int)
zooplaSale_df['bedrooms'] = zooplaSale_df.bedrooms.fillna(0).astype(int)
zooplaSale_df['date'] = pd.to_datetime(zooplaSale_df['date'])

pages_df = pages_df.append(zooplaSale_df, ignore_index=True)

  pages_df = pages_df.append(zooplaSale_df, ignore_index=True)


ON_THE_MARKET DATA SCRAPING

In [270]:
def find_required_tag(element):
    return element.has_attr('type') and repr(element.string).__contains__('__OTM__.jsonData')

In [420]:
def get_page_details_OTM(response, personal_or_general):
    if personal_or_general == 1:
        soup = BeautifulSoup(response)
        required_tag = soup.find(find_required_tag).string
        extract = required_tag[required_tag.index('__OTM__.jsonData') + 19 : required_tag.index('__OTM__.globals') - 1].strip(';\n')
        property_list = json.loads(extract)
        return property_list
    else:
        soup = BeautifulSoup(response)
        required_tag = soup.find(find_required_tag).string
        extract = required_tag[required_tag.index('__OTM__.jsonData') + 19 : required_tag.index('__OTM__.globals') - 1].strip(';\n')
        property_list = json.loads(extract)['properties']
        return property_list

In [419]:
def get_web_details_OTM(page, transactionType):
    if transactionType == 0:
        driver = webdriver.Chrome()
        try:
            driver.get(f'https://www.onthemarket.com/to-rent/property/london/?page={page}&view=grid')
            page_details = driver.page_source
            driver.close()
            return get_page_details_OTM(page_details, 0)
        except:
            driver.refresh()
            page_details = driver.page_source
            driver.close()
            return get_page_details_OTM(page_details, 0)
    else:
        driver = webdriver.Chrome()
        try:
            driver.get(f'https://www.onthemarket.com/for-sale/property/london/?page={page}&view=grid')
            page_details = driver.page_source
            driver.close()
            return get_page_details_OTM(page_details, 0)
        except:
            driver.refresh()
            page_details = driver.page_source
            driver.close()
            return get_page_details_OTM(page_details, 0)

In [418]:
def get_web_details_OTM_personal(pageLink):
    driver = webdriver.Chrome()
    try:
        driver.get(f'https://www.onthemarket.com{pageLink}')
        page_details = driver.page_source
        driver.close()
        return get_page_details_OTM(page_details, 1)
    except:
        driver.refresh()
        page_details = driver.page_source
        driver.close()
        return get_page_details_OTM(page_details, 1)

In [273]:
def getTransactionType_OTM(value):
    if value:
        return 'sale'
    else:
        return 'rent'

In [274]:
def getActualDate_OTM(value):
    reg = '\d+'
    ran = [2,3,4,5,6]
    ran2 = [1,2,3,4]
    extract = re.search(reg, value)
    if extract is None:
        day = value.split(" ")[1]
        actual_day = dt.datetime.today()
        previous_day = dt.timedelta(days=1)
        if day == 'today':
            return str(actual_day.day)+"/"+str(actual_day.month)+"/"+str(actual_day.year)
        else:
            new_day = actual_day - previous_day
            return str(new_day.day)+"/"+str(new_day.month)+"/"+str(new_day.year)

    else:
        days = int(extract[0])
        if days == 7:
            previous_days = dt.timedelta(days=random.choice(ran))
            actual_day = dt.datetime.today() - previous_days
            return str(actual_day.day)+"/"+str(actual_day.month)+"/"+str(actual_day.year)
        else:
            previous_days = dt.timedelta(days=(random.choice(ran2) + days))
            actual_day = dt.datetime.today() - previous_days
            return str(actual_day.day)+"/"+str(actual_day.month)+"/"+str(actual_day.year)


In [297]:
def getData_OTM(transactionType):
    global pages_dict
    pages_dict = {i:[] for i in pages_dict}
    for i in range(0, 42):
        page_details = get_web_details_OTM(i, transactionType)

        for j in page_details:
            if j.__contains__('property-link'):
                pp = get_web_details_OTM_personal(j['property-link'])
                pages_dict['transaction_type'].append(getTransactionType_OTM(pp['for-sale?']))
                pages_dict['bedrooms'].append(pp['bedrooms'])
                pages_dict['bathrooms'].append(pp['bathrooms'])
                pages_dict['description'].append(pp['description'])
                pages_dict['property_type'].append(pp['humanised-property-type'])
                pages_dict['price'].append(getPriceRegex(pp['price']))
                pages_dict['location'].append(pp['display_address'])
                pages_dict['post_code'].append(getPostcodeRegex(pp['display_address']))
                pages_dict['latitude'].append(pp['location']['lat'])
                pages_dict['longitude'].append(pp['location']['lon'])
                pages_dict['agent'].append(pp['agent']['name'])
                pages_dict['agent_contact'].append(pp['agent']['telephone'])
                pages_dict['listing_source'].append(listing_sources[2])
                pages_dict['listing_url'].append(parent_url[2] + pp['property-link'])
                pages_dict['date'].append(getActualDate_OTM(pp['days-since-added-reduced']))
            else:
                continue
    return pages_dict


In [432]:
#Scraping rent data from ON THE MARKET website
otmRent_df = pd.DataFrame(getData_OTM(0))
otmRent_df['bathrooms'] = otmRent_df.bathrooms.fillna(0).astype(int)
otmRent_df['bedrooms'] = otmRent_df.bedrooms.fillna(0).astype(int)
otmRent_df['date'] = pd.to_datetime(otmRent_df['date'])

pages_df = pages_df.append(otmRent_df, ignore_index=True)

"otmRent_df = pd.DataFrame(getData_OTM(0))\notmRent_df['bathrooms'] = otmRent_df.bathrooms.fillna(0).astype(int)\notmRent_df['bedrooms'] = otmRent_df.bedrooms.fillna(0).astype(int)\notmRent_df['date'] = pd.to_datetime(otmRent_df['date'])\n\npages_df = pages_df.append(otmRent_df, ignore_index=True)"

In [444]:
#Scraping sale data from ON THE MARKET website
otmSale_df = pd.DataFrame(getData_OTM(1))
otmSale_df['bathrooms'] = otmSale_df.bathrooms.fillna(0).astype(int)
otmSale_df['bedrooms'] = otmSale_df.bedrooms.fillna(0).astype(int)
otmSale_df['date'] = pd.to_datetime(otmSale_df['date'])

pages_df = pages_df.append(otmSale_df, ignore_index=True)

"otmSale_df = pd.DataFrame(getData_OTM(1))\notmSale_df['bathrooms'] = otmSale_df.bathrooms.fillna(0).astype(int)\notmSale_df['bedrooms'] = otmSale_df.bedrooms.fillna(0).astype(int)\notmSale_df['date'] = pd.to_datetime(otmSale_df['date'])\n\npages_df = pages_df.append(otmSale_df, ignore_index=True)"