<a href="https://colab.research.google.com/github/yfpang7/dataScience_projects/blob/main/right_move_relative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [37]:
import requests
from bs4 import BeautifulSoup
import polars as pl
import numpy as np
import plotly.express as px
import plotly.io as pio

pio.renderers.default ='vscode'

pl.Config(tbl_cols=-1)

<polars.config.Config at 0x7a90dc1ef740>

In [38]:
# every page of the search
# CB1 within 3 miles
# up to GBP700
urls = [
    f"https://www.rightmove.co.uk/property-for-sale/find.html?useLocationIdentifier=true&locationIdentifier=OUTCODE%5E409&radius=3.0&_includeSSTC=on&index={i}&sortType=2&channel=BUY&transactionType=BUY&displayLocationIdentifier=CB1.html&maxPrice=700000#prop165096422"
    for i in range(0, 1200, 24)
]

In [39]:
relative_urls = []
for url in urls:
  response = requests.get(url)
  soup = BeautifulSoup(response.content, 'html.parser')

  # get the wrapper for all the properties in each page
  all_containers = soup.find_all(class_='PropertyCard_propertyCardDescriptionInner__3Vkmk')

  # loop through each propery and get the href
  # append the href relative url scrape later
  for container in all_containers:
    relative_urls.append(container.find(href=True).get('href'))



In [40]:
# check the scraped
len(relative_urls)

1050

In [41]:
# check for the base url
# https://www.rightmove.co.uk/properties/164828633#/?channel=RES_BUY

# e.g. the base in this case is - https://www.rightmove.co.uk
# e.g. the relative is /properties/166143110#/?channel=RES_BUY

# create the full list
full_urls = [f"{'https://www.rightmove.co.uk'}{relative}" for relative in relative_urls]

In [42]:
# click one of the full urls to check for working link
full_urls[0:5]

['https://www.rightmove.co.uk/properties/166127114#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/166046009#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/164815784#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/159137219#/?channel=RES_BUY',
 'https://www.rightmove.co.uk/properties/164837978#/?channel=RES_BUY']

## Class Testing
* ensure it is finding the correct information and values

In [43]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/127.0.0.0 Safari/537.36"
}


test_request = requests.get(
    'https://www.rightmove.co.uk/properties/161836745#/?channel=RES_BUY',
    # full_urls[5],
    headers=headers
    )


In [44]:
test_request.status_code

410

In [45]:
test_soup = BeautifulSoup(test_request.content, 'html.parser')

In [46]:
container = test_soup.find_all(class_='_9u6R9n55iQlZi-JF6H59W')

more_info_array = [cont.find(class_='_2zXKe70Gdypr_v9MUDoVCm').text for cont in container]

more_info_template = ['council_tax', 'parking', 'garden', 'accessibility']

dict(zip(more_info_template, more_info_array))


{'council_tax': 'Band: D',
 'parking': 'Off street',
 'garden': 'Yes',
 'accessibility': 'Ask agent'}

In [47]:
# price
test_soup.find(class_='_1gfnqJ3Vtd1z40MlC0MzXu').find('span').text

'£700,000'

In [48]:
# street address
test_soup.find(class_='_2uQQ3SV0eMHL1P6t5ZDo2q').text

'Kelvin Close'

In [49]:
property_container = test_soup.find_all(class_='_3gIoc-NFXILAOZEaEjJi1n')

property_info = [cont.find(class_='_1hV1kqpVceE9m-QrX_hWDN').text for cont in property_container]

property_template = ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure']

dict(zip(property_template, property_info))

{'property_type': 'Semi-Detached',
 'bedrooms': '3',
 'bathrooms': '2',
 'size': 'Ask agent',
 'tenure': 'Freehold'}

In [50]:
# added date
test_soup.find(class_='_2nk2x6QhNB1UrxdI5KpvaF').text

'Added on 12/05/2025'

## Scraping

In [29]:
# create the full list
full_urls = [f"{'https://www.rightmove.co.uk'}{relative}" for relative in relative_urls]

rows = []
for i, url in enumerate(full_urls):
    # print(f"Processing URL {i+1}/{len(full_urls)}: {url}")

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/127.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()  # Raises an HTTPError for bad responses

        soup = BeautifulSoup(response.content, 'html.parser')

        # Helper function to safely extract text
        def safe_extract(soup, class_name, default="Not found"):
            element = soup.find(class_=class_name)
            return element.text.strip() if element else default

        def safe_extract_nested(soup, class_name, nested_tag, default="Not found"):
            element = soup.find(class_=class_name)
            if element:
                nested = element.find(nested_tag)
                return nested.text.strip() if nested else default
            return default

        # Extract basic information with error handling
        address = safe_extract(soup, '_2uQQ3SV0eMHL1P6t5ZDo2q')
        added_date = safe_extract(soup, '_2nk2x6QhNB1UrxdI5KpvaF')
        price = safe_extract_nested(soup, '_1gfnqJ3Vtd1z40MlC0MzXu', 'span')

        # council tax, parking, garden, accessibility
        misc_container = soup.find_all(class_='_9u6R9n55iQlZi-JF6H59W')
        misc_values = []
        for cont in misc_container:
            value_elem = cont.find(class_='_2zXKe70Gdypr_v9MUDoVCm')
            misc_values.append(value_elem.text.strip() if value_elem else "Not found")

        misc_keys = ['council_tax', 'parking', 'garden', 'accessibility']
        # Pad with "Not found" if we have fewer values than keys
        # this error handling was suggested by Claude AI
        while len(misc_values) < len(misc_keys):
            misc_values.append("Not found")
        misc_info_dict = dict(zip(misc_keys, misc_values))

        # property type, bedrooms, bathrooms, size, tenure
        property_container = soup.find_all(class_='_3gIoc-NFXILAOZEaEjJi1n')
        property_info = []
        for cont in property_container:
            value_elem = cont.find(class_='_1hV1kqpVceE9m-QrX_hWDN')
            property_info.append(value_elem.text.strip() if value_elem else "Not found")

        property_keys = ['property_type', 'bedrooms', 'bathrooms', 'size', 'tenure']
        # Pad with "Not found" if we have fewer values than keys
        while len(property_info) < len(property_keys):
            property_info.append("Not found")
        property_info_dict = dict(zip(property_keys, property_info))

        # create the row dictionary with all information
        row = {
            'url': url,
            'address': address,
            'added_date': added_date,
            'price': price
        }

        # add the property and misc info to the row
        row.update(property_info_dict)
        row.update(misc_info_dict)

        # append the complete row to rows list
        rows.append(row)
        # print(f"Successfully processed: {address}")

    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        continue
    except Exception as e:
        print(f"Error processing {url}: {e}")
        continue

# create DataFrame
df = pl.DataFrame(rows)

In [30]:
# inspect the dataframe
df.head()

url,address,added_date,price,property_type,bedrooms,bathrooms,size,tenure,council_tax,parking,garden,accessibility
str,str,str,str,str,str,str,str,str,str,str,str,str
"""https://www.rightmove.co.uk/pr…","""Wimpole Road, CAMBRIDGE, Cambr…","""Reduced on 03/09/2025""","""£550,000""","""Detached""","""3""","""1""","""Ask agent""","""Freehold""","""Band: TBC""","""Yes""","""Yes""","""Ask agent"""
"""https://www.rightmove.co.uk/pr…","""Auckland Road, Cambridge""","""Added on 21/08/2025""","""£700,000""","""End of Terrace""","""3""","""1""","""915 sq ft""","""Freehold""","""Band: D""","""Permit""","""Yes""","""Ask agent"""
"""https://www.rightmove.co.uk/pr…","""Hertford Street, Cambridge""","""Added on 22/07/2025""","""£700,000""","""Terraced""","""3""","""1""","""Ask agent""","""Freehold""","""Band: E""","""Ask agent""","""Yes""","""Ask agent"""
"""https://www.rightmove.co.uk/pr…","""City Road, Cambridge""","""Added on 25/07/2025""","""£700,000""","""Terraced""","""3""","""2""","""1,163 sq ft""","""Freehold""","""Band: E""","""No parking""","""Yes""","""Ask agent"""
"""https://www.rightmove.co.uk/pr…","""The Oak Building, Kingfisher W…","""Added on 23/07/2025""","""£700,000""","""Apartment""","""2""","""2""","""Ask agent""","""Share of Freehold""","""Band: E""","""Garage,Allocated""","""Ask agent""","""Ask agent"""


## Data Cleaning

In [31]:
clean_df = (
    df
    .with_columns(
        pl.col('price').str.replace_all(r'£|,', '').cast(pl.Float64),
        pl.col('bedrooms').str.replace_all(r"1,835 sq ft|2,475 sq ft|Ask agent", '').cast(pl.Categorical, strict=False),
        pl.col('bathrooms').cast(pl.Categorical, strict=False),
        pl.col('size').str.replace_all(r' sq ft|,', '').cast(pl.Float64, strict=False),
        pl.col('council_tax').str.replace_all(r'Band: ', ''),
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]+?)').alias('zip_code'),
        pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2})').alias('zip_code'), # postcode followed by the first 2 digits
        # pl.col('address').str.extract(r'([A-Za-z]{2}[0-9]{1,2}.{4})').alias('zip_code'),
        pl.col('added_date').str.replace('Added on ', '').str.to_datetime(format='%d/%m/%Y', strict=False).dt.date(),
    )
    .with_columns(
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedroom'),
        ((pl.col('bedrooms').cast(pl.Int64, strict=False) + pl.col('bathrooms').cast(pl.Int64, strict=False)) / pl.col('size')).alias('bed_bath_density')
    )
    .rename({'zip_code': 'postcode'})
)

clean_df.head()

url,address,added_date,price,property_type,bedrooms,bathrooms,size,tenure,council_tax,parking,garden,accessibility,postcode,sqft_per_bedroom,bed_bath_density
str,str,date,f64,str,cat,cat,f64,str,str,str,str,str,str,f64,f64
"""https://www.rightmove.co.uk/pr…","""Wimpole Road, CAMBRIDGE, Cambr…",,550000.0,"""Detached""","""3""","""1""",,"""Freehold""","""TBC""","""Yes""","""Yes""","""Ask agent""","""CB23""",,
"""https://www.rightmove.co.uk/pr…","""Auckland Road, Cambridge""",2025-08-21,700000.0,"""End of Terrace""","""3""","""1""",915.0,"""Freehold""","""D""","""Permit""","""Yes""","""Ask agent""",,305.0,0.004372
"""https://www.rightmove.co.uk/pr…","""Hertford Street, Cambridge""",2025-07-22,700000.0,"""Terraced""","""3""","""1""",,"""Freehold""","""E""","""Ask agent""","""Yes""","""Ask agent""",,,
"""https://www.rightmove.co.uk/pr…","""City Road, Cambridge""",2025-07-25,700000.0,"""Terraced""","""3""","""2""",1163.0,"""Freehold""","""E""","""No parking""","""Yes""","""Ask agent""",,387.666667,0.004299
"""https://www.rightmove.co.uk/pr…","""The Oak Building, Kingfisher W…",2025-07-23,700000.0,"""Apartment""","""2""","""2""",,"""Share of Freehold""","""E""","""Garage,Allocated""","""Ask agent""","""Ask agent""",,,


In [32]:
# check all the garden
pl.Config(tbl_rows=-1)
df['garden'].value_counts()

garden,count
str,u32
"""Patio""",9
"""Rear garden""",11
"""Back garden""",9
"""Front garden""",1
"""Yes""",577
"""Communal garden,Terrace""",2
"""Terrace""",2
"""Private garden""",76
"""Patio,Private garden,Enclosed …",16
"""Ask developer""",29


In [33]:
# this is to export the data
# !pip install xlsxwriter
# clean_df.write_excel('right_move_CB1_3miles_700kGBP.xlsx')

In [34]:
# for google colab renderer
# !pip install -U kaleido

In [52]:
# explore the data and check the relationship between square fit and size
# also annotate the private gardens
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    color='bedrooms',
    labels={'size' : 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

fig.show()

In [51]:
# majority of the 2 and 1 bedrooms do not have garden - check if they are apartments
fig = px.scatter(
    data_frame=clean_df,
    x='size',
    y='price',
    color=np.where(clean_df['property_type'].str.to_lowercase().str.contains('apartment').to_numpy(), 'apartment', 'rest'),
    labels={'size' : 'sqft'},
    # trendline='ols',
    color_discrete_sequence=px.colors.qualitative.Dark24,
    text=np.where(clean_df['garden'].str.to_lowercase().str.contains('private').to_numpy(), 'pg', '')

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.7),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

fig.show()

In [None]:
# feature engineer one new feature
clean_df_2 = (
    clean_df
    .with_columns(
        (pl.col('size') / pl.col('bedrooms').cast(pl.Int64, strict=False)).alias('sqft_per_bedrooms')
    )
)

Interpretation:
* Average room space allocation:
  * higher values = more spacious properties relative to bedroom count
  * lower values = more compact/efficient user of space
* Propperty layout insights:
  * for exmplem, a 2000 sqft house with 2 bedrooms = 1000 sqft/bedroom (more specious)
* Value comparison:
  * luxury properties will have higher sqft/bedroom rations
  * start homes and apartments will have lower ratios

How this metric work?
* filter our properties with least sqft per bedroom
* compare properties with different bedroom counts
* analyze pricing patterns based on space efficiency or outliers (cramped vs spacious)

In [None]:
# interested in 2-3 bedroom
# want to have a relatively spacious space (doesn't look cramp from the inside)
# we want large total size (sqft), spacious (higher sqft_per_bedroom), and higher bedrooms (3)
# we want light pink to blue
fig = px.scatter(
    data_frame=(
        clean_df_2
        .with_columns(
            pl.col('bedrooms').cast(pl.Int64, strict=False)
        )
        .filter(
            pl.col('bedrooms').is_between(2, 3)
        )
    ),
    x='sqft_per_bedrooms',
    y='price',
    color='size',
    # trendline='ols',
    color_continuous_scale='RdBu',
    text='bedrooms'

)

fig.update_layout(template='ggplot2', width=900)
fig.update_traces(
    marker=dict(size=9, line=dict(width=0.5, color='grey'), opacity=0.9),
    textposition='top center',
    textfont=dict(size=10, color='black')
)

fig.show(renderer='jpg')

In [None]:
# seems like the ideal more balanced properties are:
# >1000sqft
# > 500sqft_per_bedrooms
# 2-3 bedrooms
# between 350k to 550k
shortlisted_properties = (
    clean_df_2
    .with_columns(
        pl.col('bedrooms').cast(pl.Int64, strict=False)
    )
    .filter(
        pl.col('size').gt(1000) &
        pl.col('bedrooms').is_between(2,3) &
        pl.col('price').is_between(350000, 550000)
    )

)

In [None]:
fig = px.bar(
    shortlisted_properties,
    y='address',
    x='price',
)

# Make the y-axis labels (addresses) clickable
fig.update_layout(
    template='ggplot2',
    width=1000,
)

# Show as interactive HTML (remove renderer='png')
fig.show(renderer='png')

In [None]:
# shortlisted_properties.write_excel('shortlisted_properties.xlsx')

In [None]:
# out of curiousity check the property price by postcode
# for exactly 2 or 3 bedrooms
fig = px.box(
    data_frame=clean_df_2,
    x='postcode',
    y='price',
    points='all',
    facet_col=np.where(
        clean_df_2['bedrooms'].cast(pl.Int64, strict=False).is_between(2,3).to_numpy(),
        '2 or 3 bedrooms', 'rest'
        ),
    color='bedrooms',
    title='Property Prices by Postcode: 2-3 Bedrooms vs Others'
)

fig.update_layout(
    template='ggplot2',
    width=1200,
    yaxis_tickformat='£,.0f',
    xaxis_title='Postcode',
    yaxis_title='Price (£)'
)

# Rotate postcode labels if they're crowded
fig.update_xaxes(tickangle=45)
fig.show(renderer='jpg')

In [None]:
# !jupyter nbconvert --to html right_move_relative.ipynb --output right_move_relative.html