# Scraping first-layer info for all 2500 sold properties and saving it as a CSV file.

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import glob
import re

In [26]:
# Generate one CSV file per saved Hemnet result page (hemnet_page1.html ...
# hemnet_page50.html -> hemnet1.csv ... hemnet50.csv).

N_PAGES = 50

# Patterns are applied to element text after newlines and non-breaking spaces
# have been removed. The literal runs of spaces are what remains of the
# original HTML indentation, so they are kept exactly as they were.
AREA_RE = re.compile(r'(.*?)m²')
ROOMS_RE = re.compile(r'm²                          (.*?)rum')
PRICE_RE = re.compile(r'Slutpris(.*?)kr')
FEE_RE = re.compile(r'rum        (.*?)kr/mån')
AREA_PADDING = '                                 '


def parse_area(listing_info):
    """Return the text in front of the first 'm²' in *listing_info*, or None.

    The value is returned as a string with the indentation padding removed.
    Some listings have no area at all (the original author noted that these
    all have the type "Gård/Skog"); for those None is returned.
    """
    match = AREA_RE.search(listing_info)
    if match is None:
        return None
    return match.group(1).replace(AREA_PADDING, '')


def parse_rooms(listing_info):
    """Return the number of rooms in *listing_info* as a float, or None.

    A decimal comma ("2,5") is converted to a decimal point before parsing.
    """
    match = ROOMS_RE.search(listing_info)
    if match is None:
        return None
    return float(match.group(1).replace(',', '.').strip())


def parse_price_tkr(price_info):
    """Return the final price found in *price_info*, in thousands of kronor.

    Raises:
        ValueError: if *price_info* contains no 'Slutpris ... kr' fragment or
            the fragment is not an integer.
    """
    match = PRICE_RE.search(price_info)
    if match is None:
        # Fail with the offending text instead of an AttributeError on None.
        raise ValueError('no final price found in %r' % (price_info,))
    return int(match.group(1)) // 1000


def parse_fee(size_info):
    """Return the monthly fee found in *size_info* as an int, or None."""
    match = FEE_RE.search(size_info)
    if match is None:
        return None
    return int(match.group(1).strip())


def pad_to(values, length, fill=None):
    """Return *values* extended with *fill* up to *length* items.

    A list that is already long enough is returned unchanged, so a genuine
    length mismatch is still reported when the DataFrame is built.
    """
    return values + [fill] * (length - len(values))


# The guard keeps the helpers above importable without touching any files;
# in a notebook or script run __name__ is "__main__", so the loop still runs
# at top level and its variables stay global.
if __name__ == "__main__":
    for p in range(1, N_PAGES + 1):

        page = 'hemnet_page' + str(p) + '.html'
        # Explicit encoding: the patterns above contain non-ASCII characters,
        # so the text must not be decoded with a platform-dependent default.
        with open(page, 'r', encoding='utf-8') as html_file:
            soup = BeautifulSoup(html_file, 'html.parser')
        body = soup.find('body')

        links = body.select("li.sold-results__normal-hit a")
        actual_links = [link['href'] for link in links]

        addresses = body.select("li.sold-results__normal-hit h2")
        str_addresses = [address.get_text().replace('\n', '').strip() for address in addresses]

        types = body.select("li.sold-results__normal-hit title")
        actual_type = [kind.get_text() for kind in types]

        info3 = body.select("div.sold-property-listing__subheading")
        actual_info = [info.get_text().replace('\n', '').replace('\xa0', '').strip() for info in info3]

        # The subheadings come in pairs: the even entries hold area and rooms,
        # the odd entries hold the final price. zip() drops an unpaired
        # trailing entry, exactly like iterating over len(actual_info) // 2.
        area = []
        number_of_rooms = []
        prices = []
        for listing_info, price_info in zip(actual_info[0::2], actual_info[1::2]):
            area.append(parse_area(listing_info))
            number_of_rooms.append(parse_rooms(listing_info))
            prices.append(parse_price_tkr(price_info))

        dates = body.select("div.sold-property-listing__sold-date")
        actual_date = [date.get_text().replace('\n', '').replace('Såld', '').strip() for date in dates]

        sizes = body.select("div.sold-property-listing__size")
        actual_size = [size.get_text().replace('\n', '').replace('\xa0', '').strip() for size in sizes]

        fees = [parse_fee(size) for size in actual_size]

        # Size the parsed columns by the number of listings actually found on
        # the page rather than assuming exactly 50; short columns are padded
        # with the same defaults as before (None, and 0 for prices).
        n_rows = len(str_addresses)
        d = {'Addresses': str_addresses, 'Types': actual_type,
             'area (m²)': pad_to(area, n_rows),
             '# of rooms': pad_to(number_of_rooms, n_rows),
             'Monthly Fees (Kr)': pad_to(fees, n_rows),
             'Sold Dates': actual_date,
             'Links': actual_links,
             'Prices (tKr)': pad_to(prices, n_rows, 0)}
        df = pd.DataFrame(data=d)

        filename = 'hemnet' + str(p) + '.csv'

        df.to_csv(filename, index=False)

In [None]:
# Combine the per-page CSV files (hemnet1.csv, hemnet2.csv, ...) into one file.

# Matches only the numbered per-page files; the combined 'hemnet.csv' has no
# page number and is therefore never read back in when this is re-run.
PAGE_CSV_RE = re.compile(r'(?:^|[\\/])hemnet(\d+)\.csv$')


def combine_page_csvs(pattern='hemnet*.csv', output='hemnet.csv'):
    """Concatenate the numbered per-page CSV files and write the result.

    Args:
        pattern: glob pattern used to find candidate files. Only matches
            whose file name is 'hemnet<number>.csv' are used.
        output: path of the combined CSV file to write.

    Returns:
        The combined DataFrame, with rows in ascending page-number order and
        a fresh 0..n-1 index.

    Raises:
        FileNotFoundError: if no per-page CSV file matches *pattern*.
    """
    numbered = []
    for path in glob.glob(pattern):
        match = PAGE_CSV_RE.search(path)
        if match:
            numbered.append((int(match.group(1)), path))
    if not numbered:
        raise FileNotFoundError('no per-page CSV files match %r' % (pattern,))

    # glob returns files in arbitrary order; sort numerically so that page 10
    # follows page 9 rather than page 1.
    frames = [pd.read_csv(path, index_col=None, header=0) for _, path in sorted(numbered)]

    combined = pd.concat(frames, axis=0, ignore_index=True)
    combined.to_csv(output, index=False)
    return combined


# In a notebook or script run __name__ is "__main__", so this still executes
# at top level and 'frame' stays a global.
if __name__ == "__main__":
    frame = combine_page_csvs()