In [1]:
#IMPORTS 

%load_ext autoreload
%autoreload 2

from bs4 import BeautifulSoup
from bs4 import SoupStrainer

import numpy as np
import pandas as pd
import requests
import re

from helpers import *
from scraping import *

import timeit

In [2]:
#Constants

URL_FOND = "https://www.nordnet.no/mux/web/fonder/sok.html?nn_kategori=&kategori=&forvaltare=&sokord=&sok=1&ppm=0&nobuy=&flik=&nm=&typ=1"

URL_DETAIL_PAGE_PREFIX = 'https://secust.msse.se/se/nordnetny/funds/' 

In [3]:
soup = get_soup(URL_FOND)

In [4]:
regex_string = "/mux/web/fonder/fondfakta.html?"
prepend_prefix = "https://www.nordnet.no"

urls = get_urls(soup, regex=regex_string, prefix=prepend_prefix)

In [5]:
def create_cost_dataframe(soup):
    
    table_soup = soup.find("h2", text = "Maks kjøpsavgift").next_sibling.next_sibling

    row_names = ["Kjøp", "Selg", "Forvaltningsavgift", "Resultatbasert avgift(maks)", "Løpende kostnader"]
    td_tags = table_soup.findAll("td", class_=None)

    values = get_floats_as_strings_from_tags(td_tags)
    columns = ['Kjøpsavgift', "Salgsavgift", "Forvaltningsavgift", "Resultatbastert avgift(maks)", "Løpende kostnader"]

    return create_df(columns, values)

def create_volatility_dataframe(soup):
    
    table_soup = soup.find("a", text = "Gj.snitt avkastning").parent.parent.parent

    row_names = ["Gj.snitt avkastning", "Standardavvik", "Sharpe Ratio"]
    td_tags = table_soup.findAll("td", class_=None)

    values = get_floats_as_strings_from_tags(td_tags)
    columns = ["Gj.snitt avkastning", "Standardavvik", "Sharpe Ratio"]

    return create_df(columns, values)


def get_overview_dataframe(overview_soup):
    
    dataFrame_costs = create_cost_dataframe(overview_soup)
    dataFrame_volatility = create_volatility_dataframe(overview_soup)
   
    #CONCAT RESULTS
    dataframe_fund = pd.concat([dataFrame_costs, dataFrame_volatility], axis=1)
    fund_name = overview_soup.find("div", id="container").h1.text
    dataframe_fund['Name'] = fund_name

    return dataframe_fund


In [6]:

def create_dataframe(urls):
    main_dataframe = pd.DataFrame()
    for url in urls:
        detail_page_soup = get_soup(url)

        url_postfix = detail_page_soup.find(lambda tag: tag.name == "iframe").get('src').split("overview")[1]

        url_portfolio_page = URL_DETAIL_PAGE_PREFIX + 'portfolio' + url_postfix
        url_overview_page = URL_DETAIL_PAGE_PREFIX + 'overview' + url_postfix

        portefolio_soup = get_soup(url_portfolio_page)    
        overview_soup = get_soup(url_overview_page)

        fund_dataframe = get_overview_dataframe(overview_soup)

        main_dataframe = main_dataframe.append(fund_dataframe)

%timeit -r3 create_dataframe(urls[:10:])

13.6 s ± 385 ms per loop (mean ± std. dev. of 3 runs, 1 loop each)


Iiiish.. We have around 700 funds, and running through 7 of them takes us around 10 seconds. This gives us a rough estimate that it's going to take around 1000 seconds, or 16.6 minutes. Let's try to speed this up by using a lib called `SoupStrainer` that's going to help us parse less html, as i suspect it's this that takes the most effort. 

In [7]:
def get_soup_with_strainer(url, strainer):
    response = requests.get(url)
    return BeautifulSoup(response.text, 'lxml', parse_only=strainer) 

def get_overview_dataframe_with_soupstrainer(overview_soup):
    
    #COST TABLE
    cost_table_soup = overview_soup.find("td", text = "Kjøp").parent.parent
    
    cost_row_names = ["Kjøp", "Selg", "Forvaltningsavgift", "Resultatbasert avgift(maks)", "Løpende kostnader"]
    cost_td_tags = cost_table_soup.findAll("td", class_=None)

    cost_values = get_floats_as_strings_from_tags(cost_td_tags)
    cost_columns = ['Kjøpsavgift', "Salgsavgift", "Forvaltningsavgift", "Resultatbastert avgift(maks)", "Løpende kostnader"]

    dataFrame_costs = create_df(cost_columns, cost_values)

    #VOLATILITY TABLE
    volatility_table_soup = overview_soup.find("td", text = "Gj.snitt avkastning").parent.parent

    vol_row_names = ["Gj.snitt avkastning", "Standardavvik", "Sharpe Ratio"]
    vol_td_tags = volatility_table_soup.findAll("td", class_=None)

    vol_values = get_floats_as_strings_from_tags(vol_td_tags)
    vol_columns = ["Gj.snitt avkastning", "Standardavvik", "Sharpe Ratio"]

    dataFrame_volatility =  create_df(vol_columns, vol_values)
        
    #CONCAT RESULTS
    return pd.concat([dataFrame_costs, dataFrame_volatility], axis=1)

def get_fund_name_from_url(url):
    
    return url.split('nm=')[-1].replace("+", " ")

def append_new_fund(main_dataframe, url):
    only_iframe = SoupStrainer("iframe")
    detail_page_soup = get_soup_with_strainer(url, only_iframe)

    url_postfix = detail_page_soup.find(lambda tag: tag.name == "iframe").get('src').split("overview")[1]

    url_portfolio_page = URL_DETAIL_PAGE_PREFIX + 'portfolio' + url_postfix
    url_overview_page = URL_DETAIL_PAGE_PREFIX + 'overview' + url_postfix

    only_tables = SoupStrainer("table") 

    portefolio_soup = get_soup_with_strainer(url_portfolio_page, only_tables)    
    overview_soup = get_soup_with_strainer(url_overview_page, only_tables)

    fund_dataframe = get_overview_dataframe_with_soupstrainer(overview_soup)

    fund_dataframe['Name'] = get_fund_name_from_url(url)

    return main_dataframe.append(fund_dataframe)

def create_dataframe_using_soupstrainer(urls):
    main_dataframe = pd.DataFrame()
    for url in urls:
        
        main_dataframe = append_new_fund(main_dataframe, url)
        
    return main_dataframe

%timeit -r1 create_dataframe_using_soupstrainer(urls[:10:])

13.6 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


I shaved a whopping .0 seconds of the run with 10 funds, giving me a total profit of ish 35 seconds for the whole set! 

Not too impressive, but still we made an improvement, ish! 

Another thing we can try is to parallelize the task of getting the dataframe for each url ( corresponding to a fund ) 

Let's do that next!

In [8]:
def create_fund_dataframe(url):
    only_iframe = SoupStrainer("iframe")
    detail_page_soup = get_soup_with_strainer(url, only_iframe)

    url_postfix = detail_page_soup.find(lambda tag: tag.name == "iframe").get('src').split("overview")[1]

    url_portfolio_page = URL_DETAIL_PAGE_PREFIX + 'portfolio' + url_postfix
    url_overview_page = URL_DETAIL_PAGE_PREFIX + 'overview' + url_postfix

    only_tables = SoupStrainer("table") 

    portefolio_soup = get_soup_with_strainer(url_portfolio_page, only_tables)    
    overview_soup = get_soup_with_strainer(url_overview_page, only_tables)

    fund_dataframe = get_overview_dataframe_with_soupstrainer(overview_soup)
    fund_dataframe['Name'] = get_fund_name_from_url(url)

    return fund_dataframe

def create_dataframe_in_parallel(urls, pool_size):
    
    result = pd.DataFrame()
    
    from multiprocessing import Pool
    with Pool(pool_size) as p:
        dataframes = p.map(create_fund_dataframe, urls)

    for dataframe in dataframes:
        result = result.append(dataframe)
        
    return result

%timeit -r1 create_dataframe_in_parallel(urls, 30)

42.1 s ± 0 ns per loop (mean ± std. dev. of 1 run, 1 loop each)


A WHOPPING increase in efficiency! Look at that go! The whole set done in 42.1 seconds, that's acceptable!

<h2> Checkpoint Charlie </h2>

We're now at a point where we have a dataframe with values we would like to show off to the world. But we should start thinking about how we'll serve this information to whoever wants it! 

The following notebook uses only the methods provided in `helpers.py` and `scraping.py` which is the methods found in this notebook to be the most efficient! 

See you in the next Notebook: `Analysing and enhancing our data` 