Date of sale: Såld 28 oktober 2023,
Address (or sometimes the name of the plot of land): Strömgatan 4,
Location of the estate: Ytterby, Kungälvs kommun,
Area in the form of boarea+biarea: 105+10 m2,
The number of rooms: 5 rum,
Area of the plot: 972 m2 tomt,
Closing price: 5 750 000 kr.

Todo: regex for the address, which must be on the same line.

In [93]:
from bs4 import Tag
import re

# Helper function to flatten a 2D list into a 1D list
def flatten(matrix: list[list[type]]) -> list[type]:
    """Concatenate the sub-lists of *matrix* into one flat list.

    Only one level of nesting is removed and the original order is kept.
    The ``type`` in the annotation appears to be a stand-in for "any
    element type"; nothing here depends on the elements themselves.
    """
    flat = []
    for row in matrix:
        flat.extend(row)
    return flat

# Regex patterns for sold house information.
# They are compiled once here and read by the __find_* helpers below.

# "Såld <day> <Swedish month name> <year>"; day, month and year are captured.
date_of_sale_pattern = re.compile(r"Såld (\d{1,2}) (januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december) (\d{4})")
# One or more whitespace-separated words followed by a 1-3 digit number.
address_pattern = re.compile(r"\b\w+(?:\s+\w+)*\s+\d{1,3}\b")
# "<word>, <word> <word>"; the \s+ gaps may span line breaks.
location_pattern = re.compile(r"\b\w+,\s+\w+\s+\w+\b")
# A 2-3 digit number, optionally followed by "+ <number>".
# A plain \d{2,3} is used instead of the possessive \d{2,3}+ because possessive
# quantifiers are only accepted by `re` from Python 3.11 on (older versions
# raise "multiple repeat"); backtracking into the digits cannot produce a
# different match here because a digit followed by a digit is never a \b.
bo_and_bi_area_pattern = re.compile(r"\b\d{2,3}\s*(?:\+\s*\d+\s*)?\b")
# "<number> rum"
nr_of_rooms_pattern = re.compile(r"\b\d+\s*rum\b")
# "<number> m² tomt"
area_of_plot_pattern = re.compile(r"\b\d+\s*m²\s+tomt\b")
# Digit groups separated by single spaces, followed by " kr".
closing_price_pattern = re.compile(r"\b\d{1,3}(?: \d{3})* kr\b")

# What value to return if the information is not found
no_information = None

def __find_sale_date(house_info: Tag) -> str | None:
    """Return the sale date of a listing as "<day> <month> <year>".

    The text of *house_info* is searched with ``date_of_sale_pattern``; the
    three captured groups of the first hit are joined with single spaces.
    ``no_information`` is returned when the pattern does not occur.
    """
    hits = re.findall(date_of_sale_pattern, house_info.text)
    if not hits:
        return no_information
    day, month, year = hits[0]
    return f"{day} {month} {year}"

def __find_address(house_info: Tag) -> str | None:
    """Return the address found in the listing's heading.

    Only the first ``<h2>`` carrying the heading classes is inspected, and
    only the first ``address_pattern`` hit in its text is returned.
    ``no_information`` is returned when either the heading or a matching
    address is missing.
    """
    headings = house_info.find_all("h2", class_="sold-property-listing__heading qa-selling-price-title hcl-card__title")
    if not headings:
        return no_information
    hits = re.findall(address_pattern, headings[0].text)
    return hits[0] if hits else no_information

def __find_location(house_info: Tag) -> str | None:
    """Return the location ("<area>, <municipality>") of a listing.

    The first ``location_pattern`` hit in the text of *house_info* is used;
    ``no_information`` is returned when there is none.
    """
    hits = re.findall(location_pattern, house_info.text)
    if not hits:
        return no_information
    # The pattern's \s+ gaps can swallow a line break plus indentation, so
    # newlines and nine-space runs are stripped from the hit.
    first_hit: str = hits[0]
    without_newlines = first_hit.replace("\n", "")
    return without_newlines.replace("         ", "")

def __find_bo_and_bi_area(house_info: Tag) -> str | None:
    """Return the living area (optionally "+ secondary area") of a listing.

    The text of the first area ``<div>`` is whitespace-normalised, searched
    with ``bo_and_bi_area_pattern`` and the first hit is returned with "m²"
    appended.

    Returns ``no_information`` when the div is missing *or* when its text
    contains no area. Earlier the second case leaked the whole cleaned-up
    div text to the caller, because the result variable had already been
    overwritten before the lookup failed.
    """
    area_divs = house_info.find_all("div", class_="sold-property-listing__subheading sold-property-listing__area")
    if not area_divs:
        return no_information

    # Drop line breaks and indentation, collapse double spaces and turn
    # non-breaking spaces into ordinary ones (same order as before).
    area_text: str = area_divs[0].text
    for old, new in (("\n", ""), ("         ", ""), ("  ", " "), ("\xa0", " ")):
        area_text = area_text.replace(old, new)

    hits = re.findall(bo_and_bi_area_pattern, area_text)
    if not hits:
        return no_information
    # The pattern stops right before the unit, so it is added back here.
    return hits[0] + "m²"

def __find_nr_of_rooms(house_info: Tag) -> str | None:
    """Return the room count of a listing, e.g. "5 rum".

    The text of the first area ``<div>`` is whitespace-normalised and
    searched with ``nr_of_rooms_pattern``; the first hit is returned.

    Returns ``no_information`` when the div is missing *or* when its text
    contains no room count. Earlier the second case returned the whole
    cleaned-up div text (for example just the area) as if it were the room
    count.
    """
    area_divs = house_info.find_all("div", class_="sold-property-listing__subheading sold-property-listing__area")
    if not area_divs:
        return no_information

    # Drop line breaks and indentation, collapse double spaces and turn
    # non-breaking spaces into ordinary ones (same order as before).
    rooms_text: str = area_divs[0].text
    for old, new in (("\n", ""), ("         ", ""), ("  ", " "), ("\xa0", " ")):
        rooms_text = rooms_text.replace(old, new)

    hits = re.findall(nr_of_rooms_pattern, rooms_text)
    if not hits:
        return no_information
    return hits[0]

def __find_closing_price(house_info: Tag) -> str | None:
    """Return the closing price of a listing, e.g. "5 750 000 kr".

    The text of the first ``<span class="hcl-text hcl-text--medium">`` is
    searched with ``closing_price_pattern`` after non-breaking spaces have
    been replaced by ordinary ones.

    Returns ``no_information`` when the span is missing *or* when its text
    contains no price. Earlier the second case returned the raw span text
    instead.
    """
    price_spans = house_info.find_all("span", class_="hcl-text hcl-text--medium")
    if not price_spans:
        return no_information

    # The pattern expects plain spaces between the digit groups.
    price_text = price_spans[0].text.replace("\xa0", " ")
    hits = re.findall(closing_price_pattern, price_text)
    if not hits:
        return no_information
    return hits[0]

# Helper function to extract sold house information from a Tag object (which is a list item)
def sold_house_info(house_info: Tag) -> tuple:
    """Collect the scraped fields of one listing into a tuple.

    The tuple holds, in this order: sale date, address, location,
    living/secondary area, number of rooms and closing price. Each entry is
    whatever the corresponding ``__find_*`` helper returns for *house_info*.
    """
    finders = (
        __find_sale_date,
        __find_address,
        __find_location,
        __find_bo_and_bi_area,
        __find_nr_of_rooms,
        __find_closing_price,
    )
    return tuple(finder(house_info) for finder in finders)

In [94]:
from bs4 import BeautifulSoup, ResultSet, Tag
import pandas as pd, matplotlib as pl, glob, os, re

# get all html files in the directory
dir_path = os.getcwd() + "/kungalv_slutpriser/*.html"
files = glob.glob(dir_path)

# Parse every page inside a `with` block so each file handle is closed as
# soon as BeautifulSoup has read it (a bare open() in a comprehension leaves
# the handles open until they are garbage-collected).
soups = []
for file in files:
    with open(file, encoding="utf-8") as html_file:
        soups.append(BeautifulSoup(html_file, "html.parser"))

# One <li class="sold-results__normal-hit"> per listing; gather them from all
# pages into a single flat list and extract the fields of each.
sold_houses_divs: list[ResultSet[Tag]] = [soup.find_all("li", class_ = "sold-results__normal-hit") for soup in soups]
sold_houses_divs: list[Tag] = flatten(sold_houses_divs)
houses_info: list[tuple] = [sold_house_info(house_info) for house_info in sold_houses_divs]
print(houses_info)



[('9 oktober 2023', 'Skårby station 350', 'Kareby, Kungälvs kommun', '143 + 25 m²', '7 rum', '3 005 000 kr'), ('5 oktober 2023', 'Högalidsgatan 3', 'Centrum, Kungälvs kommun', '103 + 103 m²', '5 rum', '3 800 000 kr'), ('3 oktober 2023', 'Kungälvsvägen 22', 'Centralt, Kungälvs kommun', '77 + 46 m²', '5 rum', '4 500 000 kr'), ('2 oktober 2023', 'Ädelstensvägen 58', 'Kode, Kungälvs kommun', '123 m²', '6 rum', '4 075 000 kr'), ('27 september 2023', 'Kantorvägen 4', 'Bohuslän, Kungälvs kommun', '166 m²', '6 rum', '3 625 000 kr'), ('26 september 2023', 'Diamantvägen 34', 'Kode, Kungälvs kommun', '123 m²', '5 rum', '2 900 000 kr'), ('25 september 2023', 'Tjäderstigen 8', 'Centralt, Kungälvs kommun', '126 m²', '4 rum', '4 760 000 kr'), ('22 september 2023', 'Heavägen 31', 'Lycke, Kungälvs kommun', '165 + 30 m²', '6 rum', '5 450 000 kr'), ('17 september 2023', 'Beryllvägen 14', 'Kode, Kungälvs kommun', '145 m²', '5 rum', '3 900 000 kr'), ('17 september 2023', 'Kornhall 290', 'Kornhall, Kungälvs

In [95]:
from bs4 import BeautifulSoup, ResultSet, Tag
import pandas as pd, matplotlib as pl, glob, os, re

# get all html files in the directory
dir_path = os.getcwd() + "/kungalv_slutpriser/*.html"
files = glob.glob(dir_path)

# Parse every page inside a `with` block so each file handle is closed as
# soon as BeautifulSoup has read it.
soups = []
for file in files:
    with open(file, encoding="utf-8") as html_file:
        soups.append(BeautifulSoup(html_file, "html.parser"))

# regex patterns
# NOTE(review): date_of_sale_pattern, location_pattern, bo_and_bi_area_pattern,
# nr_of_rooms_pattern, area_of_plot_pattern and closing_price_pattern are also
# assigned (as compiled patterns) in the cell that defines the __find_* helpers,
# and those helpers read them as globals. Running this cell rebinds them to the
# strings below, some of which differ from the compiled versions.
date_of_sale_pattern = r"Såld (\d{1,2}) (januari|februari|mars|april|maj|juni|juli|augusti|september|oktober|november|december) (\d{4})"
adress_pattern = r"\b\w+(?:\s+\w+)*\s+\d{1,3}\b"
location_pattern = r"\b\w+(?:,\s+\w+\s+\w+)\b"
bo_and_bi_area_pattern = r"\bboarea\+\bbiarea\s+(\d+\+\d+)\s+m2\b"
nr_of_rooms_pattern = r"\b\d+\s*rum\b"
area_of_plot_pattern = r"\b\d+\s*m2\s+tomt\b"
closing_price_pattern = r"\b\d{1,3}(?: \d{3})* kr\b"

# find all addresses in the html files
adress_h2s: list[ResultSet[Tag]] = [soup.find_all("h2", class_="sold-property-listing__heading qa-selling-price-title hcl-card__title") for soup in soups]
adress_strs = [h2.text for resultSet in adress_h2s for h2 in resultSet]

# find all bo and bi areas in the html files
bo_and_bi_area_divs: list[ResultSet[Tag]] = [soup.find_all("div", class_="sold-property-listing__subheading sold-property-listing__area") for soup in soups]
bo_and_bi_area_strs = [div.text for resultSet2 in bo_and_bi_area_divs for div in resultSet2]

# Sale dates come back as (day, month, year) tuples and are joined into one string.
dates: list[tuple] = flatten([re.findall(date_of_sale_pattern, soup.text) for soup in soups])
dates: list[str]  = [f"{date[0]} {date[1]} {date[2]}" for date in dates]
# Loop variables are named so they no longer shadow the builtin `str` or the
# `soups` list they iterate over.
adresses: list[str] = flatten([re.findall(adress_pattern, adress_str) for adress_str in adress_strs])
locations: list[str] = flatten([re.findall(location_pattern, soup.text) for soup in soups])
# A match can span a line break plus indentation; strip both.
locations = [item.replace("\n","").replace("         ","") for item in locations]
#bo_and_bi_areas = bo_and_bi_area_divs.get_text(strip = True)
#nr_of_rooms = [re.findall(nr_of_rooms_pattern, soups.text) for soups in soups]
#area_of_plots = [re.findall(area_of_plot_pattern, soups.text) for soups in soups]
#closing_prices = [re.findall(closing_price_pattern, soups.text) for soups in soups]

#print(len(dates))
#print(len(adresses))
#print(len(locations))
#print(bo_and_bi_areas)
#print(nr_of_rooms)
#print(area_of_plots)
#print(closing_prices)
