# Web scraping from a Hemnet sample page

In [1]:
# Import necessary libraries

from bs4 import BeautifulSoup
import pandas as pd
import re

In [2]:
# Parse the saved Hemnet results page and keep its <body> element, which the
# following cells query with CSS selectors.

# The page contains Swedish characters (å, ä, ö). The encoding is passed
# explicitly so decoding does not depend on the platform's default locale.
# NOTE(review): assumes the saved page is UTF-8 — confirm for this file.
with open('hemnet_page50.html', 'r', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
body = soup.find('body')
if body is None:
    # Fail here with a clear message instead of an AttributeError on the
    # first body.select(...) call further down.
    raise ValueError("No <body> element found in 'hemnet_page50.html'")

In [21]:
# Collect the detail-page link of each sold property. These links lead to
# additional information, referred to here as the "second layer".

links = body.select("li.sold-results__normal-hit a")
actual_links = []
for anchor in links:
    actual_links.append(anchor['href'])
actual_links[:6]

['https://www.hemnet.se/salda/lagenhet-3rum-centrum-lunds-kommun-sodra-esplanaden-5a-1268410',
 'https://www.hemnet.se/salda/lagenhet-4rum-ostra-torn-lunds-kommun-stralsundsvagen-92-1268390',
 'https://www.hemnet.se/salda/lagenhet-1,5rum-veberod-lunds-kommun-vildgasvagen-45-1268417',
 'https://www.hemnet.se/salda/villa-7rum-stangby-lunds-kommun-vallkarratorn-502-1263056',
 'https://www.hemnet.se/salda/lagenhet-1rum-centrum-lunds-kommun-gronegatan-19b-1268104',
 'https://www.hemnet.se/salda/lagenhet-3rum-norra-faladen-lunds-kommun-skarpskyttevagen-22-f-1268135']

In [14]:
# Street addresses, which will later be converted to latitude-longitude
# coordinates.

addresses = body.select("li.sold-results__normal-hit h2")
str_addresses = []
for heading in addresses:
    heading_text = heading.get_text()
    # Drop embedded line breaks, then trim the surrounding whitespace.
    str_addresses.append(heading_text.replace('\n', '').strip())
str_addresses[:6]

['Södra Esplanaden 5A',
 'Stralsundsvägen 92',
 'Vildgåsvägen 45',
 'Vallkärratorn 502',
 'Grönegatan 19B',
 'Skarpskyttevägen 22 F']

In [15]:
# Housing type, taken from the text of the <title> element inside each listing.

types = body.select("li.sold-results__normal-hit title")
actual_type = []
for title_tag in types:
    actual_type.append(title_tag.get_text())
actual_type[:6]

['Lägenhet', 'Lägenhet', 'Lägenhet', 'Villa', 'Lägenhet', 'Lägenhet']

In [6]:
# Three pieces of information (living area, number of rooms, sold price) are
# embedded in the div.sold-property-listing__subheading elements. Collect
# their cleaned text here; later cells pull the individual values out of it.

info3 = body.select("div.sold-property-listing__subheading")
actual_info = []
for subheading in info3:
    raw_text = subheading.get_text()
    # Remove line breaks and non-breaking spaces, then trim the ends.
    actual_info.append(raw_text.replace('\n', '').replace('\xa0', '').strip())

In [16]:
# Living area
#
# actual_info is read in pairs: entry 2*i carries the living area (and room
# count) of listing i, entry 2*i + 1 carries its sold price.
# Values are kept as strings exactly as shown on the page: they may use a
# decimal comma ('80,5') or be composite ('37 + 15').

# Size the result from the data rather than assuming 50 listings per page.
n_listings = len(actual_info) // 2
area = [None] * n_listings
for i in range(n_listings):
    area_match = re.search(r'(.*?)m²', actual_info[2 * i])
    if area_match:
        # Collapse any run of whitespace to a single space instead of relying
        # on one exact run of indentation spaces from the page source.
        area[i] = ' '.join(area_match.group(1).split())
    # Entries without an 'm²' value stay None.
area[0:6]

['68', '80,5', '37 + 15', '251', '34,6', '95']

In [17]:
# Sold prices (in tKr) & Number of rooms
#
# actual_info is read in pairs: entry 2*i carries the area and room count of
# listing i, entry 2*i + 1 carries its sold price ('Slutpris ... kr').

# Size the results from the data rather than assuming 50 listings per page.
n_listings = len(actual_info) // 2
prices = [None] * n_listings
number_of_rooms = [None] * n_listings
for i in range(n_listings):
    price_match = re.search(r'Slutpris(.*?)kr', actual_info[2 * i + 1])
    if price_match:
        # Integer division converts kr to whole tKr without a float round-trip.
        prices[i] = int(price_match.group(1)) // 1000
    # The room count follows the area; accept any amount of whitespace between
    # them rather than one exact run of spaces. Decimal comma -> decimal point.
    rooms_match = re.search(r'm²\s*([\d.,]+)\s*rum', actual_info[2 * i])
    if rooms_match:
        number_of_rooms[i] = float(rooms_match.group(1).replace(',', '.'))
    # Listings with no price or no room count keep None.
prices[0:6]

[3200, 2225, 1000, 4450, 2300, 2065]

In [18]:
# Preview the parsed room counts of the first six listings.
number_of_rooms[0:6]

[3.0, 4.0, 1.5, 7.0, 1.0, 3.0]

In [19]:
# Sold dates: the text of each sold-date div with the 'Såld' label removed.

dates = body.select("div.sold-property-listing__sold-date")
actual_date = []
for date_tag in dates:
    date_text = date_tag.get_text().replace('\n', '')
    actual_date.append(date_text.replace('Såld', '').strip())
actual_date[:6]

['11 oktober 2020',
 '11 oktober 2020',
 '11 oktober 2020',
 '11 oktober 2020',
 '10 oktober 2020',
 '10 oktober 2020']

In [10]:
# Three pieces of information are also embedded in the
# div.sold-property-listing__size elements; only the monthly fee (avgift)
# is needed from them.

sizes = body.select("div.sold-property-listing__size")
actual_size = []
for size_tag in sizes:
    size_text = size_tag.get_text()
    # Remove line breaks and non-breaking spaces, then trim the ends.
    actual_size.append(size_text.replace('\n', '').replace('\xa0', '').strip())

In [20]:
# Monthly fees (kr/mån)
#
# Fees default to None because most houses (villa) do not have a monthly fee.

# One slot per size entry, rather than assuming 50 listings per page.
fees = [None] * len(actual_size)
for i, size_text in enumerate(actual_size):
    # The fee follows the room count; accept any amount of whitespace between
    # them rather than one exact run of spaces.
    fee_match = re.search(r'rum\s*(\d+)\s*kr/mån', size_text)
    if fee_match:
        fees[i] = int(fee_match.group(1))
fees[0:6]

[3509, 5809, 2138, None, 2538, 4956]

In [12]:
# Organize everything collected above into one pandas table, save it as a CSV
# file, then read the file back and preview its first rows.

d = {
    'Addresses': str_addresses,
    'Types': actual_type,
    'area (m²)': area,
    '# of rooms': number_of_rooms,
    'Monthly Fees (Kr)': fees,
    'Sold Dates': actual_date,
    'Links': actual_links,
    'Prices (tKr)': prices,
}
df = pd.DataFrame(d)
df.to_csv('hemnet50.csv', index=False)

all_data = pd.read_csv("hemnet50.csv")
all_data.head()

Unnamed: 0,Addresses,Types,area (m²),# of rooms,Monthly Fees (Kr),Sold Dates,Links,Prices (tKr)
0,Södra Esplanaden 5A,Lägenhet,68,3.0,3509.0,11 oktober 2020,https://www.hemnet.se/salda/lagenhet-3rum-cent...,3200
1,Stralsundsvägen 92,Lägenhet,805,4.0,5809.0,11 oktober 2020,https://www.hemnet.se/salda/lagenhet-4rum-ostr...,2225
2,Vildgåsvägen 45,Lägenhet,37 + 15,1.5,2138.0,11 oktober 2020,"https://www.hemnet.se/salda/lagenhet-1,5rum-ve...",1000
3,Vallkärratorn 502,Villa,251,7.0,,11 oktober 2020,https://www.hemnet.se/salda/villa-7rum-stangby...,4450
4,Grönegatan 19B,Lägenhet,346,1.0,2538.0,10 oktober 2020,https://www.hemnet.se/salda/lagenhet-1rum-cent...,2300
