In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from bs4 import BeautifulSoup
import pandas as pd
import re
import time
import requests

import matplotlib.pyplot as plt
import networkx as nx

In [35]:
# Site root; prepended to the relative hrefs scraped from course pages.
domain = 'https://courses.aalto.fi'
# Prefix of an individual course page; the course code is appended to it.
base_url = domain + '/course/'

In [36]:
def headless_driver():
    """Create and return a Chrome WebDriver started with the --headless flag."""
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    return webdriver.Chrome(options=chrome_options)

In [37]:
def get_soup(url, timeout=30):
    """Download `url` with requests and parse the response body with lxml.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (passed straight to
        ``requests.get``). Without a timeout a stalled server would block the
        call indefinitely.

    Returns
    -------
    bs4.BeautifulSoup
        Parsed document built from the raw response bytes.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or times out.
    """
    content = requests.get(url, timeout=timeout).content
    return BeautifulSoup(content, 'lxml')

In [38]:
def get_soup_from_selenium(url):
    """Render `url` in a headless browser and return the page as a soup.

    A fresh driver is created for every call and is always shut down before
    returning, so repeated calls do not leave browser processes behind.

    Parameters
    ----------
    url : str
        Address of the page to load.

    Returns
    -------
    bs4.BeautifulSoup
        The rendered page source parsed with lxml.

    Raises
    ------
    selenium.common.exceptions.NoSuchElementException
        If the cookie-consent button (or the <html> element) is not found.
    """
    driver = headless_driver()
    try:
        driver.get(url)
        # Accept GDPR cookie consent
        driver.find_element(By.XPATH, '/html/body/div[1]/div[2]/div/div/button').click()
        # Scroll to bottom so that everything is returned
        # https://stackoverflow.com/a/51345544
        html = driver.find_element(By.TAG_NAME, 'html')
        html.send_keys(Keys.END)
        # Wait a bit to get it all
        time.sleep(1.5)
        # Hand over to BS
        return BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # Close the browser even when a step above raised.
        driver.quit()

In [39]:
def extract_courses(url, c):
    """Return metadata about the related courses listed on a course page.

    Parameters
    ----------
    url : str
        Address of the course page to harvest.
    c : str
        Code of the course whose page is harvested; stored in the 'relfrom'
        column of every returned row.

    Returns
    -------
    pandas.DataFrame
        One row per related course with the columns 'relfrom', 'title',
        'url', 'code', 'credits' and 'language'. The frame is empty when the
        page lists no related courses.
    """
    soup = get_soup_from_selenium(url)

    related = soup.find_all("a", class_=re.compile("^CourseRecommendations__StyledCourseRow"))

    records = []
    for r in related:
        # Look the flag image up once. A row without one (or with an image of
        # another kind) gets 'NA' instead of failing on a missing element.
        flag = r.find("img", class_=re.compile("^CourseRow__StyledFlag"))
        records.append({
            'relfrom': c,
            'title': r.find("h3").text,
            # hrefs are site-relative, so prefix them with the domain
            'url': domain + r.get('href'),
            # the course code is the first <span> of the row
            'code': r.find_all("span")[0].text,
            'credits': r.find("span", class_=re.compile("^Credits")).text,
            'language': flag.get('alt') if flag is not None else 'NA',
        })

    # Explicit columns keep the layout stable even when there are no rows.
    related_courses_df = pd.DataFrame(
        records,
        columns=['relfrom', 'title', 'url', 'code', 'credits', 'language'],
    )

    return related_courses_df

In [12]:
# Previously harvested metadata about courses. School abbreviations are
# expanded to their "Aalto ..." form and the rows are numbered from zero.
courses = (
    pd.read_csv("aaltocourses.csv", sep="\t")
    .replace({
        'ARTS': 'Aalto ARTS',
        'BIZ': 'Aalto BIZ',
        'CHEM': 'Aalto CHEM',
        'ELEC': 'Aalto ELEC',
        'ENG': 'Aalto ENG',
        'SCI': 'Aalto SCI',
    })
    .reset_index(drop=True)
)

In [None]:
# Note that the total harvesting time is about 3 hours
# so consider splitting the data in half and running the script
# with them one by one

related_courses = []
# Codes of the courses whose page could not be harvested.
failed_courses = []

start = time.time()

for c in courses['code']:
    url = base_url + c
    print("Harvesting course", url, "\n")
    try:
        related_courses.append(extract_courses(url, c))
    except Exception as err:
        # The CSV is only written after the loop, so a single broken page
        # must not throw away everything harvested so far. Record the
        # failure and carry on with the next course.
        failed_courses.append(c)
        print("Failed to harvest", url, "-", repr(err), "\n")

all_related_courses_df = pd.concat(related_courses)

all_related_courses_df.to_csv("aalto_related_courses.csv", sep='\t', index=False)

end = time.time()-start

if failed_courses:
    print("Could not harvest", len(failed_courses), "course(s):", ", ".join(map(str, failed_courses)))

print("The script took this many minutes to run:", round(end/60,2))

In [9]:
# Related-course rows written by the harvesting step.
reldata = pd.read_csv("aalto_related_courses.csv", sep="\t")

# Left join on the course code: every row of `courses` is kept, and columns
# present in both frames get the default _x (courses) / _y (reldata) suffixes.
joined = courses.merge(
    reldata,
    how='left',
    left_on='code',
    right_on='relfrom',
)

joined.head()

Unnamed: 0,school,dept,title_x,url_x,code_x,credits_x,language_x,description,relfrom,title_y,url_y,code_y,credits_y,language_y
0,Aalto ARTS,Architecture,"Advanced Course in Landscape Construction, Studio",https://courses.aalto.fi/course/MAR-E1030,MAR-E1030,10-15 op,fi,Syventää maisemarakentamisen tekniikan tuntemu...,MAR-E1030,Field Course of Landscape Architecture 1,https://courses.aalto.fi/course/MAR-A1002,MAR-A1002,2 op,fi
1,Aalto ARTS,Architecture,"Advanced Course in Landscape Construction, Studio",https://courses.aalto.fi/course/MAR-E1030,MAR-E1030,10-15 op,fi,Syventää maisemarakentamisen tekniikan tuntemu...,MAR-E1030,Field Course of Landscape Architecture 2,https://courses.aalto.fi/course/MAR-C1000,MAR-C1000,2 op,fi
2,Aalto ARTS,Architecture,Architecture in Finland and Scandinavia 2,https://courses.aalto.fi/course/ARK-C1001,ARK-C1001,3 op,fi,The main stages in the history of Finnish arch...,ARK-C1001,History of Wood Architecture in Finland,https://courses.aalto.fi/course/ARK-E1007,ARK-E1007,3-12 op,en
3,Aalto ARTS,Architecture,Architecture in Finland and Scandinavia 2,https://courses.aalto.fi/course/ARK-C1001,ARK-C1001,3 op,fi,The main stages in the history of Finnish arch...,ARK-C1001,"History of Architecture, Special Project",https://courses.aalto.fi/course/ARK-E1008,ARK-E1008,1-10 op,fi
4,Aalto ARTS,Architecture,Architecture in Finland and Scandinavia 2,https://courses.aalto.fi/course/ARK-C1001,ARK-C1001,3 op,fi,The main stages in the history of Finnish arch...,ARK-C1001,"Theory of Architecture, Special Project",https://courses.aalto.fi/course/ARK-E1506,ARK-E1506,1-10 op,fi


In [8]:
# Directed graph with one edge from each course to each of its related courses.
G = nx.DiGraph()

# Courses that matched nothing in the left join carry NaN in 'relfrom' and
# 'code_y'; drop those rows so that no NaN nodes or edges enter the graph.
edges = joined.dropna(subset=['relfrom', 'code_y'])

# Columns are addressed by name rather than by position, so the edges stay
# correct even if the column order of `joined` changes.
for row in edges.itertuples(index=False):
    G.add_edge(row.relfrom,
               row.code_y,
               inboundtitle = row.title_x,
               outboundtitle = row.title_y)

In [9]:
# Course-side columns that become node attributes.
selected = joined.loc[:, ['code_x', 'title_x', 'credits_x', 'language_x',
                          'description', 'school', 'dept']]

# The join repeats a course once per related course; keep one row per code.
nodup = selected.drop_duplicates(subset='code_x')

print(nodup.shape[0])

1066


In [10]:
# Map each course code to a dict of its remaining columns.
node_attr = nodup.set_index('code_x').to_dict(orient='index')

In [11]:
# Attach the per-course attribute dicts to the matching graph nodes.
nx.set_node_attributes(G, values=node_attr)

In [12]:
# Export the attributed graph in GEXF format.
nx.write_gexf(G, path="courses_attr.gexf")

In [13]:
# Number of edges in the course graph.
nx.number_of_edges(G)

4197