# Assignment 2

In [1]:
import re
import tarfile
import numpy as np
import pandas as pd
from datetime import datetime
from bs4 import BeautifulSoup

# Problem 1

The HTML for each listing includes:
- Date of sale, e.g. `Såld 28 oktober 2023`,
- Address (or sometimes the name of the plot of land), e.g. `Strömgatan 4`,
- Location of the estate, e.g. `Ytterby, Kungälvs kommun`,
- Area in the form boarea+biarea, e.g. `105+10 m²`,
- The number of rooms, e.g. `5 rum`,
- Area of the plot, e.g. `972 m² tomt`, and
- Closing price, e.g. `5 750 000 kr`.

The extracted data should have the following features:
- solddate
- address
- location
- boarea
- biarea
- totalarea
- nroom
- plotarea
- price

In [40]:
# Swedish month names as they appear in the "Såld <day> <month> <year>" label,
# mapped to the two-digit month number used to build an ISO date string.
_SWEDISH_MONTHS = {
    'januari': '01', 'februari': '02', 'mars': '03', 'april': '04',
    'maj': '05', 'juni': '06', 'juli': '07', 'augusti': '08',
    'september': '09', 'oktober': '10', 'november': '11', 'december': '12'
}

# A number with an optional decimal part written with ',' or '.'.
_NUMBER_RE = re.compile(r'\d+(?:[.,]\d+)?')


def _text_of(tag):
    """Return the stripped text of ``tag``, or '' when ``tag`` is None."""
    return tag.get_text(strip=True) if tag is not None else ''


def _to_number(token):
    """Convert '105' to 105 and '105,5' (decimal comma) to 105.5."""
    if ',' in token or '.' in token:
        return float(token.replace(',', '.'))
    return int(token)


def _parse_sold_date(text):
    """Parse a label such as 'Såld 28 oktober 2023'; return np.nan if it cannot be parsed."""
    # Drop the leading word and split on any whitespace (including non-breaking spaces).
    parts = text.split()[1:]
    if len(parts) < 3:
        return np.nan
    day, month_swedish, year = parts[:3]
    month = _SWEDISH_MONTHS.get(month_swedish.lower())
    if month is None:
        return np.nan
    try:
        return datetime.strptime(f'{year}-{month}-{day}', '%Y-%m-%d')
    except ValueError:
        return np.nan


def _first_int(text):
    """Return the first run of digits in ``text`` as an int, ignoring all whitespace; np.nan if none."""
    # Removing every whitespace character joins thousands groups such as "5 750 000".
    match = re.search(r'\d+', re.sub(r'\s+', '', text))
    return int(match.group()) if match else np.nan


def extract_house_info(soup):
    """
    Extracts house information from one sold-property listing.

    Every field is looked up independently; a field whose element is missing
    or whose text cannot be parsed is reported as missing instead of raising.

    Args:
        soup: a parsed HTML element for a single listing. It must provide
            ``find(name, class_=...)``, and the elements it returns must
            provide ``get_text(strip=True)`` and ``find_all(name)``.

    Returns:
        dict: the extracted house information with the keys
            'sold_date' (datetime or np.nan),
            'address' and 'location' (str, the literal 'NaN' when missing),
            'boarea', 'biarea', 'totalarea', 'number_of_rooms'
            (int, float for decimal values, or np.nan),
            'plot_area' and 'price' (int or np.nan).
    """
    # Date of sale
    sold_date = _parse_sold_date(_text_of(soup.find('span', class_='hcl-label--state')))

    # Address
    address = soup.find('h2', class_='sold-property-listing__heading')
    address_str = address.get_text(strip=True) if address is not None else 'NaN'

    # Location: the first <div> nested inside the location container.
    location_box = soup.find('div', class_='sold-property-listing__location')
    inner_divs = location_box.find_all('div') if location_box is not None else []
    if inner_divs:
        location_str = inner_divs[0].get_text(strip=True).replace('VillaVilla', '')
    else:
        location_str = 'NaN'
    cleaned_location_str = re.sub(r'\n\s+', ' ', location_str)

    # Boarea, Biarea, Totalarea, Number of rooms share one sub-heading of the
    # form "<boarea> + <biarea> m² <n> rum".
    numinfo = soup.find('div', class_='sold-property-listing__subheading')
    numinfo_str = _text_of(numinfo).replace('\xa0', ' ')
    area_part, _, rooms_part = numinfo_str.partition('m²')

    areas = [_to_number(token) for token in _NUMBER_RE.findall(area_part)]
    boarea = areas[0] if areas else np.nan
    if len(areas) > 1:
        biarea = areas[1]
        totalarea = boarea + biarea
    else:
        # Without a biarea the total is left undefined.
        biarea, totalarea = np.nan, np.nan

    room_tokens = _NUMBER_RE.findall(rooms_part)
    nroom = _to_number(room_tokens[0]) if room_tokens else np.nan

    # Plot area
    plot_area_num = _first_int(_text_of(soup.find('div', class_='sold-property-listing__land-area')))

    # Price
    sold_price_num = _first_int(_text_of(soup.find('span', class_='hcl-text--medium')))

    return {
        'sold_date': sold_date,
        'address': address_str,
        'location': cleaned_location_str,
        'boarea': boarea,
        'biarea': biarea,
        'totalarea': totalarea,
        'number_of_rooms': nroom,
        'plot_area': plot_area_num,
        'price': sold_price_num,
    }


In [41]:
# Walk through every member of the archive, parse it as an HTML result page
# and collect one record per listing found on that page.
property_list = []
with tarfile.open('kungalv_slutpriser.tar.gz', 'r:gz') as tar:
    for member in tar.getmembers():
        file = tar.extractfile(member)
        if file is None:
            # Members without readable content (e.g. directories) are skipped.
            continue
        html_content = file.read().decode('utf-8')
        soup = BeautifulSoup(html_content, 'html.parser')
        # Each listing is an <li> carrying a data-tracking-index attribute.
        property_entries = soup.find_all('li', {'data-tracking-index': True})
        for entry in property_entries:
            property_list.append(extract_house_info(entry))

In [42]:
# Show how many listings were parsed, followed by the raw records.
print(len(property_list), property_list, sep='\n')


1973
[{'sold_date': datetime.datetime(2023, 10, 9, 0, 0), 'address': 'Skårby station 350', 'location': 'Kareby, Kungälvs kommun', 'boarea': 143, 'biarea': 25, 'totalarea': 168, 'number_of_rooms': 7, 'plot_area': 2303, 'price': 3005000}, {'sold_date': datetime.datetime(2023, 10, 5, 0, 0), 'address': 'Högalidsgatan 3', 'location': 'Centrum, Kungälvs kommun', 'boarea': 103, 'biarea': 103, 'totalarea': 206, 'number_of_rooms': 5, 'plot_area': 862, 'price': 3800000}, {'sold_date': datetime.datetime(2023, 10, 3, 0, 0), 'address': 'Kungälvsvägen 22', 'location': 'Centralt, Kungälvs kommun', 'boarea': 77, 'biarea': 46, 'totalarea': 123, 'number_of_rooms': 5, 'plot_area': 1548, 'price': 4500000}, {'sold_date': datetime.datetime(2023, 10, 2, 0, 0), 'address': 'Ädelstensvägen 58', 'location': 'Kode, Kungälvs kommun', 'boarea': 123, 'biarea': nan, 'totalarea': nan, 'number_of_rooms': 6, 'plot_area': 379, 'price': 4075000}, {'sold_date': datetime.datetime(2023, 9, 27, 0, 0), 'address': 'Kantorvägen 

In [43]:
# Turn the list of per-listing dicts into a DataFrame (one row per listing).
properties = pd.DataFrame(data=property_list)
properties

Unnamed: 0,sold_date,address,location,boarea,biarea,totalarea,number_of_rooms,plot_area,price
0,2023-10-09,Skårby station 350,"Kareby, Kungälvs kommun",143.0,25.0,168.0,7.0,2303.0,3005000
1,2023-10-05,Högalidsgatan 3,"Centrum, Kungälvs kommun",103.0,103.0,206.0,5.0,862.0,3800000
2,2023-10-03,Kungälvsvägen 22,"Centralt, Kungälvs kommun",77.0,46.0,123.0,5.0,1548.0,4500000
3,2023-10-02,Ädelstensvägen 58,"Kode, Kungälvs kommun",123.0,,,6.0,379.0,4075000
4,2023-09-27,Kantorvägen 4,"Bohuslän, Kungälvs kommun",166.0,,,6.0,558.0,3625000
...,...,...,...,...,...,...,...,...,...
1968,2013-04-08,Jorsalagatan 5,"Iskällan, Kungälvs kommun",185.0,36.0,221.0,6.0,860.0,4100000
1969,2013-03-21,Egnahemsgatan 10,"Centralt, Kungälvs kommun",65.0,65.0,130.0,3.0,446.0,2825000
1970,2013-03-20,Källeröd 220,"Kareby, Kungälvs kommun",61.0,,,2.0,1696.0,1480000
1971,2013-03-08,Bokstigen 4,"Centralt, Kungälvs kommun",140.0,50.0,190.0,5.0,1661.0,3330000


In [44]:
# The assignment states "assume that we always have information about boarea",
# but 11 of the 1973 listings have no boarea value. Select them for inspection.
nan_boarea_rows = properties.loc[properties['boarea'].isna()]
nan_boarea_rows

Unnamed: 0,sold_date,address,location,boarea,biarea,totalarea,number_of_rooms,plot_area,price
336,2021-11-11,Prästvägen,"Bohuslän, Kungälvs kommun",,,,,24755.0,1725000
522,2021-03-25,Kroken 142,"Harestad, Kungälvs kommun",,,,,1175.0,3900000
639,2020-09-16,Skårby station 455,"Bohuslän, Kungälvs kommun",,,,,800.0,3995000
891,2019-11-01,,"Kungälv, Kungälvs kommun",,,,,708.0,1251360
920,2019-09-25,Solbräcke 1:47,"Kungälv, Kungälvs kommun",,,,,708.0,1000000
921,2019-09-25,Solbräcke 1:46,"Kungälv, Kungälvs kommun",,,,,784.0,1000000
922,2019-09-25,Solbräcke 1:45,"Kungälv, Kungälvs kommun",,,,,976.0,1000000
923,2019-09-25,Solbräcke 1:44,"Bohuslän, Kungälvs kommun",,,,,882.0,1000000
1373,2017-07-08,Solberg 455,"Romelanda, Kungälvs kommun",,,,,,2000000
1645,2015-12-02,Näckrosvägen 7,"Kungälv, Kungälvs kommun",,,,,1300.0,1200000


In [45]:
# Keep only the listings that have a boarea value and renumber the rows.
properties_cleaned = properties[properties['boarea'].notna()].reset_index(drop=True)
properties_cleaned

Unnamed: 0,sold_date,address,location,boarea,biarea,totalarea,number_of_rooms,plot_area,price
0,2023-10-09,Skårby station 350,"Kareby, Kungälvs kommun",143.0,25.0,168.0,7.0,2303.0,3005000
1,2023-10-05,Högalidsgatan 3,"Centrum, Kungälvs kommun",103.0,103.0,206.0,5.0,862.0,3800000
2,2023-10-03,Kungälvsvägen 22,"Centralt, Kungälvs kommun",77.0,46.0,123.0,5.0,1548.0,4500000
3,2023-10-02,Ädelstensvägen 58,"Kode, Kungälvs kommun",123.0,,,6.0,379.0,4075000
4,2023-09-27,Kantorvägen 4,"Bohuslän, Kungälvs kommun",166.0,,,6.0,558.0,3625000
...,...,...,...,...,...,...,...,...,...
1957,2013-04-08,Jorsalagatan 5,"Iskällan, Kungälvs kommun",185.0,36.0,221.0,6.0,860.0,4100000
1958,2013-03-21,Egnahemsgatan 10,"Centralt, Kungälvs kommun",65.0,65.0,130.0,3.0,446.0,2825000
1959,2013-03-20,Källeröd 220,"Kareby, Kungälvs kommun",61.0,,,2.0,1696.0,1480000
1960,2013-03-08,Bokstigen 4,"Centralt, Kungälvs kommun",140.0,50.0,190.0,5.0,1661.0,3330000


In [46]:
# Write the cleaned listings to CSV without the row index.
# `index` is documented as a boolean, so pass False explicitly rather than None.
properties_cleaned.to_csv('properties_cleaned.csv', index=False)

# Problem 2