## Required imports

In [204]:
# imports
from lxml import html, etree
import requests
import pandas as pd
import datetime as dt

## Function

In [205]:
def rightmove_webscrape(rightmove_url):
    
# Get the start & end of the web url around the index value
    start,end = rightmove_url.split('&index=')
    url_start = start+'&index='
    url_end = end[1:]
    
# Initialise variables
    price_pcm=[]
    titles=[]
    addresses=[]
    weblinks=[]
    page_counts=[]
    
# Initialise pandas DataFrame for results.
    df=pd.DataFrame(columns=['price','type','address','url'])

# Get the total number of results from the search
    page = requests.get(rightmove_url)
    tree = html.fromstring(page.content)
    xp_result_count = '//span[@class="searchHeader-resultCount"]/text()'
    result_count = int(tree.xpath(xp_result_count)[0].replace(",", ""))
    
# Turn the total number of search results into number of iterations for the loop
    loop_count = result_count/24
    if result_count%24>0:
        loop_count = loop_count+1
        
# Set the Xpath variables for the loop
    xp_prices = '//span[@class="propertyCard-priceValue"]/text()'
    xp_titles = '//div[@class="propertyCard-details"]//a[@class="propertyCard-link"]//h2[@class="propertyCard-title"]/text()'
    xp_addresses = '//address[@class="propertyCard-address"]/text()'
    xp_weblinks = '//div[@class="propertyCard-details"]//a[@class="propertyCard-link"]/@href'

# Start the loop through the search result web pages
    for pages in range(0,loop_count,1):
        rightmove_url = url_start+str(pages*24)+url_end
        page = requests.get(rightmove_url)
        tree = html.fromstring(page.content)
        
# Reset variables
        price_pcm=[]
        titles=[]
        addresses=[]
        weblinks=[]

# Create data lists from Xpaths
        for val in tree.xpath(xp_prices):
            price_pcm.append(val)
        for val in tree.xpath(xp_titles):
            titles.append(val)
        for val in tree.xpath(xp_addresses):
            addresses.append(val)
        for val in tree.xpath(xp_weblinks):
            weblinks.append(val)

# Convert data to temporary DataFrame
        data = [price_pcm, titles, addresses, weblinks]
        temp_df= pd.DataFrame(data)
        temp_df = temp_df.transpose()
        temp_df.columns=['price','type','address','url']
        
# Drop empty rows from DataFrame which come from placeoholders in html file.
        temp_df = temp_df[temp_df.url != '/property-for-sale/property-0.html']
    
# Join temporary DataFrame to main results DataFrame.
        frames = [df,temp_df]
        df = pd.concat(frames)

# Renumber results DataFrame index to remove duplicate index values.
    df = df.reset_index(drop=True)

# Convert price column to numeric values for analysis.
    df.price.replace(regex=True,inplace=True,to_replace=r'\D',value=r'')
    df.price=pd.to_numeric(df.price)

# Extract postcode areas to separate column.
    df['postcode'] = df['address'].str.extract(r'\b([A-Za-z][A-Za-z]?[0-9][0-9]?[A-Za-z]?)\b',expand=True)
    
# Extract number of bedrooms from 'type' column.
    df['number_bedrooms'] = df.type.str.extract(r'\b([\d][\d]?)\b',expand=True)
    df.loc[df['type'].str.contains('studio',case=False),'number_bedrooms']=0

# Add in date column with date website was queried (i.e. today's date).
    now = dt.datetime.today().strftime("%d/%m/%Y")
    df['date'] = now

# Optional line to export the results to CSV if you wish to inspect them in an alternative program.
#     df.to_csv('rightmove_df.csv',encoding='utf-8')
 
    return df

## Using the function

In [206]:
# Example query: London fields, to rent, added last 7 days:
'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E70417&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=7&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E70417&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=7&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

In [207]:
# Example query: All London, to rent, added today:
'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=1&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=1&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

In [208]:
# Create a search on the rightmove.co.uk website, then copy and paste the url from the first results page to this variable:
rightmove_url = 'http://www.rightmove.co.uk/property-to-rent/find.html?locationIdentifier=REGION%5E87490&numberOfPropertiesPerPage=24&radius=0.0&sortType=6&index=0&propertyTypes=detached%2Csemi-detached%2Cterraced%2Cflat%2Cbungalow&maxDaysSinceAdded=1&includeLetAgreed=false&viewType=LIST&currencyCode=GBP'

In [209]:
# Run the function 
df = rightmove_webscrape(rightmove_url)

# View the first few results
df.head()

Unnamed: 0,price,type,address,url,postcode,number_bedrooms,date
0,1050.0,1 bedroom flat,"Grove Hill,London,E18",/property-to-rent/property-45094061.html,E18,1,17/08/2016
1,563.0,Studio flat,"Ravenscroft Road, London, W4",/property-to-rent/property-60806813.html,W4,0,17/08/2016
2,3163.0,2 bedroom apartment,"Hamlet Gardens, London, W6",/property-to-rent/property-43838709.html,W6,2,17/08/2016
3,2492.0,4 bedroom apartment,"Chatsworth Road, E5 0LA",/property-to-rent/property-41781018.html,E5,4,17/08/2016
4,1899.0,3 bedroom flat,"Blackshaw Road, London, SW17",/property-to-rent/property-61223090.html,SW17,3,17/08/2016


## Analysis

In [210]:
df.number_bedrooms.unique()

array(['1', 0, '2', '4', '3', '6', nan, '5', '8', '15', '7'], dtype=object)

In [219]:
# See which 'types' do not have bedroom number extracted
df[df.number_bedrooms.isnull()].type.unique()

array(['Property', 'Maisonette', 'Detached house', 'Flat share',
       'Semi-detached bungalow', 'House', 'Terraced house',
       'End of terrace house'], dtype=object)

In [230]:
print 'most expensive: '
print df[df.price==df.price.max()].price

most expensive: 
820    108333.0
Name: price, dtype: float64


In [245]:
# Average prices by number of bedrooms
pd.pivot_table(df,values='price',index='number_bedrooms',aggfunc=('mean','count'),dropna=True)

Unnamed: 0_level_0,mean,count
number_bedrooms,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1238.970684,307
1,1635.940684,1315
15,108333.0,1
2,2281.295254,1812
3,3243.215686,612
4,3540.961702,235
5,5789.080645,62
6,4962.066667,15
7,17992.0,1
8,10299.0,2


In [244]:
df.count()

price              4400
type               4402
address            4402
url                4402
postcode           3278
number_bedrooms    4364
date               4402
dtype: int64

In [221]:
print 'Average rent pcm - all results:'
print
print '£',round(df.price.mean(),2)

Average rent pcm - all results:

£ 2297.13


In [220]:
df.postcode.unique()

array(['E18', 'W4', 'W6', 'E5', 'SW17', 'W2', 'NW4', 'SE16', 'N21', 'N13',
       nan, 'E2', 'E15', 'NW3', 'EC1M', 'E1W', 'W9', 'N1', 'E14', 'SW7',
       'SW1W', 'SE1', 'W10', 'SW1Y', 'SE18', 'E3', 'WC1R', 'W1U', 'N12',
       'SW6', 'SW13', 'EC1V', 'SW9', 'W8', 'SE13', 'W12', 'W1H', 'SW10',
       'SW2', 'W5', 'N10', 'SE11', 'W1K', 'SE17', 'SW8', 'NW8', 'SW3',
       'E1', 'SE26', 'SW15', 'SW1E', 'W1W', 'NW6', 'SW4', 'N7', 'E16',
       'E12', 'SE25', 'N8', 'SE10', 'SW16', 'SE9', 'NW1', 'SW1P', 'W3',
       'SM4', 'WC2E', 'SW19', 'NW2', 'SW11', 'SW1V', 'W1S', 'E9', 'SW18',
       'WC2H', 'E8', 'W1', 'NW11', 'SE15', 'SW1X', 'SE28', 'W14', 'WC1H',
       'E10', 'N19', 'E11', 'NW10', 'N20', 'N4', 'E7', 'E17', 'N9', 'NW9',
       'EN2', 'SE6', 'N5', 'WC1', 'WC1E', 'WC2R', 'SW12', 'SE3', 'E4',
       'Nw4', 'EC2Y', 'SW14', 'WC2B', 'SE5', 'SW20', 'SE21', 'SE27', 'N6',
       'SW1', 'N17', 'N2', 'W1G', 'N16', 'N11', 'N18', 'EN1', 'EC2A',
       'SE23', 'NW7', 'SE24', 'N3', 'W7', 'SW5', 'SE4

## Optional error checking

In [131]:
# Below is a method for exporting the full html text from the url if you wish to inspect it in detail.
page = requests.get(rightmove_url)
tree = html.fromstring(page.content)
html_text=etree.tostring(tree)
file = open("html.txt", "w")
file.write(html_text)
file.close()