This block contains helper functions and regex patterns used in the data-extraction process.

In [1]:
from bs4 import Tag
from typing import Any
import re

# Helper function to flatten a 2D list into a 1D list
def flatten(matrix: list[list[Any]]) -> list[Any]:
    """Concatenate the inner lists of ``matrix`` into one flat list.

    Exactly one level of nesting is removed; the items keep the order in
    which they appear in ``matrix``.
    """
    flat: list[Any] = []
    for row in matrix:
        flat.extend(row)
    return flat

# Regex patterns for sold house information

# "Såld <day> <Swedish month name> <year>"; the groups are (day, month, year).
date_of_sale_pattern = re.compile(r"Såld (\d{1,2}) (januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december) (\d{4})")
# One or more words followed by a 1-3 digit number, e.g. "Högalidsgatan 3".
address_pattern = re.compile(r"\b\w+(?:\s+\w+)*\s+\d{1,3}\b")
# "<word>, <word> <word>", e.g. "Kareby, Kungälvs kommun"; the gaps may be any whitespace.
location_pattern = re.compile(r"\b\w+,\s+\w+\s+\w+\b")
# A 2-3 digit number, optionally followed by "+ <number>", e.g. "143 + 25".
# The quantifier is deliberately the plain greedy "{2,3}": the possessive form
# "{2,3}+" only compiles on Python 3.11+ and matches exactly the same text here
# (backing off to two digits leaves a digit next, where \b cannot succeed).
bo_and_bi_area_pattern = re.compile(r"\b\d{2,3}\s*(?:\+\s*\d+\s*)?\b")
# "<number> rum", e.g. "7 rum".
nr_of_rooms_pattern = re.compile(r"\b\d+\s*rum\b")
# NOTE(review): this matches the literal six characters "&nbsp;", not the
# non-breaking space character "\xa0" itself — confirm this is what
# find_area_of_plot is meant to look for.
area_of_plot_pattern = re.compile(r'&nbsp;')
# Space-grouped amount followed by " kr", e.g. "3 005 000 kr".
closing_price_pattern = re.compile(r"\b\d{1,3}(?: \d{3})* kr\b")

# What value to return if the information is not found
no_information = None

def find_sale_date(house_info: Tag) -> str | None:
    """Return the sale date in ``house_info`` as "<day> <month> <year>".

    The text of ``house_info`` is searched with ``date_of_sale_pattern`` and
    the three captured groups of the first match are joined with single
    spaces. ``no_information`` is returned when nothing matches.
    """
    matches = re.findall(date_of_sale_pattern, house_info.text)
    if not matches:
        return no_information
    # Each match is the tuple of the pattern's three groups.
    day, month, year = matches[0]
    return f"{day} {month} {year}"

def find_address(house_info: Tag) -> str | None:
    """Return the first address-like string in the listing's heading.

    Looks up the first ``<h2>`` with the sold-property heading classes and
    returns the first ``address_pattern`` match in its text. Returns
    ``no_information`` when the heading is missing or nothing matches.
    """
    headings = house_info.find_all("h2", class_="sold-property-listing__heading qa-selling-price-title hcl-card__title")
    if not headings:
        return no_information
    candidates = address_pattern.findall(headings[0].text)
    return candidates[0] if candidates else no_information

def find_location(house_info: Tag) -> str | None:
    """Return the first ``location_pattern`` match in the listing's text.

    Newlines and runs of nine spaces are removed from the match before it is
    returned (the pattern's ``\\s+`` gaps can span such layout whitespace).
    Returns ``no_information`` when the text contains no match.
    """
    candidates = location_pattern.findall(house_info.text)
    if not candidates:
        return no_information
    location = candidates[0]
    for filler in ("\n", "         "):
        location = location.replace(filler, "")
    return location

def find_bo_and_bi_area(house_info: Tag) -> tuple[int] | None:
    """Return the area figures of a listing as a tuple of ints.

    The text of the first area ``<div>`` is stripped of newlines and regular
    spaces, non-breaking spaces are turned into regular spaces, and the first
    ``bo_and_bi_area_pattern`` match is split on "+". The result therefore
    holds one number, or two when the listing shows "<a> + <b>".

    Returns ``no_information`` when the ``<div>`` is missing or nothing
    matches.
    """
    area_divs = house_info.find_all("div", class_="sold-property-listing__subheading sold-property-listing__area")
    if not area_divs:
        return no_information

    # Apply the clean-up substitutions in their original order.
    cleaned = area_divs[0].text
    for old, new in (("\n", ""), ("         ", ""), ("  ", " "), (" ", ""), (u"\xa0", u" ")):
        cleaned = cleaned.replace(old, new)

    matches = bo_and_bi_area_pattern.findall(cleaned)
    if not matches:
        return no_information
    # int() tolerates the whitespace left around each number.
    return tuple([int(part) for part in matches[0].split("+")])

def find_nr_of_rooms(house_info: Tag) -> str | None:
    """Return the "<n> rum" fragment of a listing, e.g. "7 rum".

    The text of the first area ``<div>`` is cleaned of newlines and layout
    spaces, non-breaking spaces are replaced by regular spaces, and the first
    ``nr_of_rooms_pattern`` match is returned. Returns ``no_information``
    when the ``<div>`` is missing or nothing matches.
    """
    area_divs = house_info.find_all("div", class_="sold-property-listing__subheading sold-property-listing__area")
    if not area_divs:
        return no_information

    # Apply the clean-up substitutions in their original order.
    cleaned = area_divs[0].text
    for old, new in (("\n", ""), ("         ", ""), ("  ", " "), (u"\xa0", u" ")):
        cleaned = cleaned.replace(old, new)

    matches = nr_of_rooms_pattern.findall(cleaned)
    return matches[0] if matches else no_information

def find_closing_price(house_info: Tag) -> int | None:
    """Return the closing price of a listing as an integer.

    The text of the first ``<span class="hcl-text hcl-text--medium">`` is
    searched with ``closing_price_pattern`` (non-breaking spaces are first
    replaced by regular spaces); the " kr" suffix and the digit-group spaces
    are then removed and the remainder is converted to ``int``.

    Returns ``no_information`` when the ``<span>`` is missing or its text
    contains no price.
    """
    # Bind the result up front so the IndexError path returns the "not
    # found" value instead of hitting an unbound local at the return.
    closing_price_int: int | None = no_information
    try:
        closing_price_tag: Tag = house_info.find_all("span", class_="hcl-text hcl-text--medium")[0]
        closing_price_str: str = closing_price_tag.text.replace(u"\xa0", u" ")
        closing_price_str = re.findall(closing_price_pattern, closing_price_str)[0]
        closing_price_str = closing_price_str.replace(" kr", "").replace(" ", "")
        closing_price_int = int(closing_price_str)
    except IndexError:
        # Tag or price not present: fall through with no_information.
        pass
    return closing_price_int

def find_area_of_plot(house_info: Tag) -> str | None:
    """Return the plot-area text of a listing.

    The text of the first ``<div class="sold-property-listing__land-area">``
    is cleaned of newlines and layout spaces. If ``area_of_plot_pattern``
    matches the cleaned text, the first match is returned; otherwise the
    cleaned text itself is returned. Returns ``no_information`` when the
    ``<div>`` is missing.
    """
    land_area_divs = house_info.find_all("div", class_="sold-property-listing__land-area")
    if not land_area_divs:
        return no_information

    # Apply the clean-up substitutions in their original order.
    cleaned = land_area_divs[0].text
    for old, new in (("\n", ""), ("         ", ""), ("  ", " ")):
        cleaned = cleaned.replace(old, new)

    matches = area_of_plot_pattern.findall(cleaned)
    # NOTE(review): the pattern is the literal "&nbsp;", so presumably it
    # rarely (if ever) matches decoded text and the cleaned text is what
    # callers normally get back — confirm that this fallback is intended.
    return matches[0] if matches else cleaned

# Helper function to extract sold house information from a Tag object (which is a list item)
def get_sold_house_info(house_info: Tag) -> tuple[str, str, str, tuple[int], str, str, int]:
    """Collect every extracted field of one sold-house listing.

    Returns a 7-tuple in this order: sale date, address, location, area
    figures, number of rooms, plot area, closing price. Each element is
    whatever the corresponding ``find_*`` helper returns for ``house_info``.
    """
    return (
        find_sale_date(house_info),
        find_address(house_info),
        find_location(house_info),
        find_bo_and_bi_area(house_info),
        find_nr_of_rooms(house_info),
        find_area_of_plot(house_info),
        find_closing_price(house_info),
    )

### Problem 1: Scraping house prices

In [2]:
from bs4 import BeautifulSoup, ResultSet, Tag
import pandas as pd, matplotlib as pl, glob, os, re

# Get all html files in the directory
dir_path = os.getcwd() + "/kungalv_slutpriser/*.html"
files = glob.glob(dir_path)

# Parse each file inside a `with` block so the file handle is closed as soon
# as BeautifulSoup has read it (the parse happens in the constructor).
soups = []
for file in files:
    with open(file, encoding="utf-8") as html_file:
        soups.append(BeautifulSoup(html_file, "html.parser"))

# Find the list item html elements that contain the sold house information and extract the information using the helper functions
sold_houses_divs: list[Tag] = flatten(
    [soup.find_all("li", class_="sold-results__normal-hit") for soup in soups]
)
houses_info: list[tuple] = [get_sold_house_info(house_info) for house_info in sold_houses_divs]
print(houses_info)



[('9 oktober 2023', 'Skårby station 350', 'Kareby, Kungälvs kommun', (143, 25), '7 rum', '    2\xa0303\xa0m² tomt   ', 3005000), ('5 oktober 2023', 'Högalidsgatan 3', 'Centrum, Kungälvs kommun', (103, 103), '5 rum', '    862\xa0m² tomt   ', 3800000), ('3 oktober 2023', 'Kungälvsvägen 22', 'Centralt, Kungälvs kommun', (77, 46), '5 rum', '    1\xa0548\xa0m² tomt   ', 4500000), ('2 oktober 2023', 'Ädelstensvägen 58', 'Kode, Kungälvs kommun', (123,), '6 rum', '    379\xa0m² tomt   ', 4075000), ('27 september 2023', 'Kantorvägen 4', 'Bohuslän, Kungälvs kommun', (166,), '6 rum', '    558\xa0m² tomt   ', 3625000), ('26 september 2023', 'Diamantvägen 34', 'Kode, Kungälvs kommun', (123,), '5 rum', '    559\xa0m² tomt   ', 2900000), ('25 september 2023', 'Tjäderstigen 8', 'Centralt, Kungälvs kommun', (126,), '4 rum', '    362\xa0m² tomt   ', 4760000), ('22 september 2023', 'Heavägen 31', 'Lycke, Kungälvs kommun', (165, 30), '6 rum', '    1\xa0553\xa0m² tomt   ', 5450000), ('17 september 2023', '