# Scraping first-layer info

## Import necessary libraries

In [3]:
from bs4 import BeautifulSoup
import pandas as pd
import glob
import re

## Generate CSV file for each downloaded Hemnet page (50 in total)

In [26]:
# Number of downloaded result pages: hemnet_page1.html ... hemnet_page<N_PAGES>.html.
N_PAGES = 50


def pad_to(values, length, fill=None):
    """Return ``values`` right-padded with ``fill`` up to ``length`` items.

    The list is never truncated: if it is already longer than ``length`` it is
    returned unchanged, so a column-length mismatch surfaces as an error when
    the DataFrame is built instead of being silently hidden.
    """
    return values + [fill] * max(0, length - len(values))


# Parse every downloaded page and write one CSV (hemnet<p>.csv) per page.
for p in range(1, N_PAGES + 1):

    page = 'hemnet_page' + str(p) + '.html'
    # Decode explicitly instead of relying on the platform default, which would
    # garble å/ä/ö and break the 'm²' / 'kr/mån' patterns below on some systems.
    # NOTE(review): assumes the pages were saved as UTF-8 — confirm.
    with open(page, 'r', encoding='utf-8') as html_file:
        soup = BeautifulSoup(html_file, 'html.parser')
    body = soup.find('body')

    links = body.select("li.sold-results__normal-hit a")
    actual_links = [link['href'] for link in links]

    addresses = body.select("li.sold-results__normal-hit h2")
    str_addresses = [address.get_text().replace('\n', '').strip() for address in addresses]
    # Size the derived columns by what the page actually contains rather than a
    # hard-coded 50, so a shorter (or longer) page still produces a valid frame.
    n_listings = len(str_addresses)

    types = body.select("li.sold-results__normal-hit title")
    actual_type = [kind.get_text() for kind in types]

    info3 = body.select("div.sold-property-listing__subheading")
    actual_info = [info.get_text().replace('\n', '').replace('\xa0', '').strip() for info in info3]

    # The subheading texts are read in pairs: entry 2*i carries the area and room
    # count, entry 2*i + 1 carries the final price ("Slutpris ... kr").
    # Some listings have no "m²" text, which leaves their area as None; the ones
    # observed all had the type "Gård/Skog".
    area = []
    number_of_rooms = []
    prices = []
    for i in range(len(actual_info) // 2):
        size_text = actual_info[2 * i]
        price_text = actual_info[2 * i + 1]

        area_match = re.search(r'(.*?)m²', size_text)
        area.append(
            area_match.group(1).replace('                                 ', '')
            if area_match else None
        )

        rooms_match = re.search(r'm²                          (.*?)rum', size_text)
        number_of_rooms.append(
            float(rooms_match.group(1).replace(',', '.').strip())
            if rooms_match else None
        )

        price_match = re.search(r'Slutpris(.*?)kr', price_text)
        if price_match is None:
            # Fail loudly with context instead of an AttributeError on None.
            raise ValueError(f"No 'Slutpris' found for listing {i} in {page}: {price_text!r}")
        # Stored in thousands of kronor; integer division avoids a float round-trip.
        prices.append(int(price_match.group(1)) // 1000)

    dates = body.select("div.sold-property-listing__sold-date")
    actual_date = [date.get_text().replace('\n', '').replace('Såld', '').strip() for date in dates]

    sizes = body.select("div.sold-property-listing__size")
    actual_size = [size.get_text().replace('\n', '').replace('\xa0', '').strip() for size in sizes]

    # The monthly fee is optional; entries without a "kr/mån" part stay None.
    fees = []
    for size_text in actual_size:
        fee_match = re.search(r'rum        (.*?)kr/mån', size_text)
        fees.append(int(fee_match.group(1).strip()) if fee_match else None)

    # Pad the derived columns to the listing count (None for missing values,
    # 0 for prices) so they line up with the directly scraped columns.
    d = {
        'Addresses': str_addresses,
        'Types': actual_type,
        'area (m²)': pad_to(area, n_listings),
        '# of rooms': pad_to(number_of_rooms, n_listings),
        'Monthly Fees (Kr)': pad_to(fees, n_listings),
        'Sold Dates': actual_date,
        'Links': actual_links,
        'Prices (tKr)': pad_to(prices, n_listings, fill=0),
    }
    df = pd.DataFrame(data=d)

    filename = 'hemnet' + str(p) + '.csv'

    df.to_csv(filename, index=False)

## Merge all 50 CSVs into one file

In [None]:
# Collect only the per-page files (hemnet1.csv, hemnet2.csv, ...). A bare "*.csv"
# would also pick up hemnet.csv (this cell's own output) and apart_df.csv on a
# re-run, duplicating rows or mixing in a frame with different columns.
# Sorting by page number makes the row order of the merged file reproducible.
all_files = sorted(
    (name for name in glob.glob("hemnet[0-9]*.csv") if re.fullmatch(r'hemnet\d+\.csv', name)),
    key=lambda name: int(re.search(r'\d+', name).group()),
)

if not all_files:
    # pd.concat would fail with a less helpful "No objects to concatenate".
    raise FileNotFoundError("No per-page files matching hemnet<N>.csv found in the working directory")

li = []

for filename in all_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    li.append(df)

# Stack the pages vertically with a fresh 0..n-1 index and save the result.
frame = pd.concat(li, axis=0, ignore_index=True)
frame.to_csv('hemnet.csv', index=False)

## Check the result

In [5]:
# Reload the merged file from disk and print its first five rows as plain text.
df = pd.read_csv(filepath_or_buffer='hemnet.csv')
print(df.head())

           Addresses     Types area (m²)  # of rooms  Monthly Fees (Kr)  \
0   Flormansgatan 2A  Lägenhet        43         1.5             2767.0   
1  Kastanjegatan 19F  Lägenhet        34         2.0             2415.0   
2   Karl XI gatan 47  Lägenhet      87,4         3.0             5787.0   
3          Äspet 163     Villa  158 + 22         8.0                NaN   
4  Margaretavägen 3K  Lägenhet        78         3.0             4584.0   

          Sold Dates                                              Links  \
0  30 september 2021  https://www.hemnet.se/salda/lagenhet-1,5rum-ce...   
1  30 september 2021  https://www.hemnet.se/salda/lagenhet-2rum-jarn...   
2  30 september 2021  https://www.hemnet.se/salda/lagenhet-3rum-lund...   
3  30 september 2021  https://www.hemnet.se/salda/villa-8rum-lunds-k...   
4  30 september 2021  https://www.hemnet.se/salda/lagenhet-3rum-moll...   

   Prices (tKr)  
0          2370  
1          1745  
2          4700  
3          5350  
4       

## Check the housing type

In [6]:
# Distinct values of the 'Types' column.
set(df['Types'].tolist())

{'Fritidshus', 'Gård/Skog', 'Lägenhet', 'Radhus', 'Tomt', 'Villa', 'Övrigt'}

## Generate a dataframe for only apartments

In [7]:
# Keep only the rows whose type is 'Lägenhet' (apartment) and preview them.
apart_df = df.loc[df['Types'].eq('Lägenhet')]
apart_df.head(5)

Unnamed: 0,Addresses,Types,area (m²),# of rooms,Monthly Fees (Kr),Sold Dates,Links,Prices (tKr)
0,Flormansgatan 2A,Lägenhet,43,1.5,2767.0,30 september 2021,"https://www.hemnet.se/salda/lagenhet-1,5rum-ce...",2370
1,Kastanjegatan 19F,Lägenhet,34,2.0,2415.0,30 september 2021,https://www.hemnet.se/salda/lagenhet-2rum-jarn...,1745
2,Karl XI gatan 47,Lägenhet,874,3.0,5787.0,30 september 2021,https://www.hemnet.se/salda/lagenhet-3rum-lund...,4700
4,Margaretavägen 3K,Lägenhet,78,3.0,4584.0,30 september 2021,https://www.hemnet.se/salda/lagenhet-3rum-moll...,2750
5,Qvantenborgsvägen 4B,Lägenhet,59,2.0,3125.0,29 september 2021,https://www.hemnet.se/salda/lagenhet-2rum-kobj...,2250


## Save as a CSV file

In [None]:
# Write the apartment subset to disk; the row index is written as the first column.
apart_df.to_csv(path_or_buf='apart_df.csv', index=True)