# Demonstration of web scraping from a Hemnet sample page

In [1]:
# Import necessary libraries

from bs4 import BeautifulSoup
import pandas as pd
import re

## 1. Scrape info from the HTML \<body\> tag

In [2]:
# Parse the locally saved Hemnet results page and keep its <body> element.
# The encoding is passed explicitly so the Swedish characters (å, ä, ö) decode
# the same way on every platform instead of depending on the OS default.
# NOTE(review): assumes the page was saved as UTF-8 — confirm for other dumps.
with open('hemnet_page50.html', 'r', encoding='utf-8') as html_file:
    soup = BeautifulSoup(html_file, 'html.parser')
body = soup.find('body')

### 1.1. Individual link for each sold property

In [7]:
# Every sold listing has an anchor pointing to its own detail page. Those pages
# hold additional information (e.g. year of build, agent, etc.), referred to
# here as second-layer info.

link_info = body.select("li.sold-results__normal-hit a")
links = []
for anchor in link_info:
    links.append(anchor['href'])

print(links[:3])

['https://www.hemnet.se/salda/lagenhet-3rum-centrum-lunds-kommun-sodra-esplanaden-5a-1268410',
 'https://www.hemnet.se/salda/lagenhet-4rum-ostra-torn-lunds-kommun-stralsundsvagen-92-1268390',
 'https://www.hemnet.se/salda/lagenhet-1,5rum-veberod-lunds-kommun-vildgasvagen-45-1268417']

### 1.2. Addresses

In [8]:
# The address of each property is the text of the <h2> inside its listing item,
# with newlines and surrounding whitespace removed.

str_addresses = body.select("li.sold-results__normal-hit h2")
addresses = []
for heading in str_addresses:
    raw_text = heading.get_text()
    addresses.append(raw_text.replace('\n', '').strip())

print(addresses[:3])

['Södra Esplanaden 5A', 'Stralsundsvägen 92', 'Vildgåsvägen 45']

### 1.3. Property type

In [10]:
# The building type (e.g. 'Lägenhet', 'Villa') is the text of the <title>
# element found inside each listing item.

str_types = body.select("li.sold-results__normal-hit title")
types = [title_tag.get_text() for title_tag in str_types]

print(types[:6])

['Lägenhet', 'Lägenhet', 'Lägenhet', 'Villa', 'Lägenhet', 'Lägenhet']

In [11]:
# The div.sold-property-listing__subheading elements bundle three pieces of
# info (living area, number of rooms, sold price) as plain text. Flatten each
# one here by dropping newlines and non-breaking spaces; the individual fields
# are separated in the following cells.

info_3piece = body.select("div.sold-property-listing__subheading")
info = []
for item in info_3piece:
    flattened = item.get_text().replace('\n', '').replace('\xa0', '')
    info.append(flattened.strip())

### 1.4. Living area

In [12]:
# `info` holds two consecutive entries per listing; the even-indexed entry is
# the one that carries the living area. The result list is sized from the data
# rather than assuming exactly 50 listings per page.
listing_count = len(info) // 2
area = [None] * listing_count

for i in range(listing_count):
    area_match = re.search(r'(.*?)m²', info[2 * i])
    if area_match:  # stays None when the entry has no 'm²' value
        # Remove the fixed-width run of spaces that can sit inside the value
        # (e.g. the '37 + 15' case).
        # NOTE(review): the exact-width literal is tied to this page's markup.
        area[i] = area_match.group(1).replace('                                 ', '')

print(area[0:6])

['68', '80,5', '37 + 15', '251', '34,6', '95']

### 1.5 Sold prices (in tKr) & Number of rooms

In [14]:
# `info` holds two consecutive entries per listing: the even-indexed entry
# carries the room count, the odd-indexed entry carries the sold price.
# The result lists are sized from the data rather than assuming 50 listings.
listing_count = len(info) // 2
prices = [0] * listing_count              # stays 0 when no price is found
number_of_rooms = [None] * listing_count  # stays None when no room count is found

for i in range(listing_count):
    price_match = re.search(r'Slutpris(.*?)kr', info[2*i + 1])
    if price_match:  # group() does not work on None, so guard like the rooms below
        # Convert kr to whole tKr with integer division (no float round-trip).
        prices[i] = int(price_match.group(1)) // 1000
    # NOTE(review): the exact-width run of spaces is tied to this page's markup.
    m = re.search(r'm²                          (.*?)rum', info[2 * i])
    if m:  # this check is necessary because group() does not work on None
        # Swedish decimal comma -> decimal point (e.g. '1,5' -> 1.5).
        number_of_rooms[i] = float(m.group(1).replace(',', '.').strip())

print(prices[0:6])
print(number_of_rooms[0:6])

[3200, 2225, 1000, 4450, 2300, 2065]
[3.0, 4.0, 1.5, 7.0, 1.0, 3.0]


### 1.6 Sold dates

In [16]:
# The sold date lives in its own div together with the label 'Såld'.
# Drop newlines and the label, then trim, to keep only the date text.
dates = body.select("div.sold-property-listing__sold-date")
sold_date = []
for date_div in dates:
    date_text = date_div.get_text().replace('\n', '').replace('Såld', '')
    sold_date.append(date_text.strip())

print(sold_date[:5])

['11 oktober 2020',
 '11 oktober 2020',
 '11 oktober 2020',
 '11 oktober 2020',
 '10 oktober 2020']

### 1.7 Monthly fees (avgift) in Kr

In [19]:
# The div.sold-property-listing__size elements also embed three pieces of
# info, of which only the monthly fee (avgift) is needed. Flatten the text
# here (drop newlines and non-breaking spaces); the fee is extracted next.

sizes = body.select("div.sold-property-listing__size")
size_and_avgift = []
for size in sizes:
    flattened = size.get_text().replace('\n', '').replace('\xa0', '')
    size_and_avgift.append(flattened.strip())

In [20]:
# Monthly fee (avgift) in kr for each listing. The value is None when no fee
# is found, which is the case for most houses (villa). The list is built from
# `size_and_avgift` itself, so its length follows the data instead of a
# hard-coded 50.

avgift = []
for entry in size_and_avgift:
    # NOTE(review): the exact-width run of spaces is tied to this page's markup.
    fee_match = re.search(r'rum        (.*?)kr/mån', entry)
    avgift.append(int(fee_match.group(1).strip()) if fee_match else None)

print(avgift[0:5])

[3509, 5809, 2138, None, 2538]


## 2. Create dataframe

In [21]:
# Assemble the scraped lists into one table, one column per list.
d = {
    'Addresses': addresses,
    'Types': types,
    'Area': area,
    'RoomCount': number_of_rooms,
    'Avgift': avgift,
    'SoldDate': sold_date,
    'Links': links,
    'Prices': prices,
}
df = pd.DataFrame(d)
df.head()

Unnamed: 0,Addresses,Types,Area,RoomCount,Avgift,SoldDate,Links,Prices
0,Södra Esplanaden 5A,Lägenhet,68,3.0,3509.0,11 oktober 2020,https://www.hemnet.se/salda/lagenhet-3rum-cent...,3200
1,Stralsundsvägen 92,Lägenhet,805,4.0,5809.0,11 oktober 2020,https://www.hemnet.se/salda/lagenhet-4rum-ostr...,2225
2,Vildgåsvägen 45,Lägenhet,37 + 15,1.5,2138.0,11 oktober 2020,"https://www.hemnet.se/salda/lagenhet-1,5rum-ve...",1000
3,Vallkärratorn 502,Villa,251,7.0,,11 oktober 2020,https://www.hemnet.se/salda/villa-7rum-stangby...,4450
4,Grönegatan 19B,Lägenhet,346,1.0,2538.0,10 oktober 2020,https://www.hemnet.se/salda/lagenhet-1rum-cent...,2300


In [None]:
# Write the table to a single CSV file, leaving out the row index

df.to_csv(path_or_buf='hemnet50.csv', index=False)