# From Lab  to Lecture: 

## Analyzing the Connection Between Professors’ Research and Course Content​ 👩‍🏫

**Authors**:
- Erik Wold Riise, s194633​
- Lukas Rasocha, s233498​
- Zou Yong Nan Klaassen, s230351

<center><img src="assets/intro.png" width="1000"  /></center>

*Image Prompt: minimalistic network visualization with two nodes: one representing a professor and the other a course they teach, connected by a single edge*

### Project Overview ✍️

This project investigates the alignment between professors’ research areas and the courses they teach through the angle of network analysis and natural language processing (NLP).
We plan to construct a bipartite graph of professors and courses, and analyze the structural and thematic patterns in teaching and research connections.

The central research question steering the project is:
_"How well do professors’ research areas align with the content and objectives of the courses they teach, and how does this alignment vary across disciplines?"_

To complement this, we also examine:
_"Does the alignment between professors’ research and the courses they teach influence student satisfaction and performance (grades)?"_

Using NLP techniques, we analyze course descriptions and research topics to measure alignment, and we relate these findings to course evaluations and grades. Additionally, network analysis methods, such as community detection and centrality measures, will be applied to uncover interdisciplinary trends and the influence of professors within the academic network.

By this we hope to shed light on how expertise and teaching intersect, and how does that impact educational outcomes in a broader sense.

### Imports

In [24]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests

import os
import json

from tqdm import tqdm
from bs4 import BeautifulSoup
from scholia import query as scholia_query
from SPARQLWrapper import SPARQLWrapper, JSON

### Load Course Data

In [6]:
file_path = 'data/course_df.csv'
course_df = pd.read_csv(file_path)

### DTU Orbit Scraper class + Scholia API

In [29]:

class DTUOrbitScraper:
    def __init__(self):
        self.base_url = "https://orbit.dtu.dk/en/persons/"
        self.endpoint_url = "https://query.wikidata.org/sparql"

    def search_person(self, name):
        """Search for the person and get the URL to their profile."""
        search_url = f"{self.base_url}?search={name.replace(' ', '+')}&isCopyPasteSearch=false"
        response = requests.get(search_url)
        
        if response.status_code != 200:
            raise Exception("Failed to fetch search results")
        
        soup = BeautifulSoup(response.text, "html.parser")
        # Find the first profile link (assuming it's the first result)
        profile_link = soup.find("h3", class_="title").find("a", href=True)
        
        if profile_link:
            return profile_link['href']
        else:
            raise Exception("Profile link not found")

    def get_topic_info(self, topic_url):
        """Scrape the description for a topic from its Wikidata page."""
        response = requests.get(topic_url)
        if response.status_code != 200:
            return "Description not found"
        
        soup = BeautifulSoup(response.text, "html.parser")
        description = soup.find("div", class_="wikibase-entitytermsview-heading-description")
        return description.text.strip() if description else "Description not found"

    def get_scholia_topics(self, qs):
        """Get topics and scores from Scholia using SPARQL."""
        query = f"""PREFIX target: <http://www.wikidata.org/entity/{qs}>
        SELECT ?score ?topic ?topicLabel
        WITH {{
            SELECT (SUM(?score_) AS ?score) ?topic WHERE {{
                {{ target: wdt:P101 ?topic . BIND(20 AS ?score_) }}
                UNION {{ SELECT (3 AS ?score_) ?topic WHERE {{ ?work wdt:P50 target: ; wdt:P921 ?topic . }} }}
                UNION {{ SELECT (1 AS ?score_) ?topic WHERE {{ ?work wdt:P50 target: . ?citing_work wdt:P2860 ?work . ?citing_work wdt:P921 ?topic . }} }}
            }} GROUP BY ?topic
        }} AS %results 
        WHERE {{
            INCLUDE %results
            SERVICE wikibase:label {{ bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }}
        }}
        ORDER BY DESC(?score)
        LIMIT 200"""
        
        sparql = SPARQLWrapper(self.endpoint_url)
        sparql.setQuery(query)
        sparql.setReturnFormat(JSON)
        results = sparql.query().convert()
        
        topics = [] 

        for result in results["results"]["bindings"]:
            topic_url = result["topic"]["value"]
            topic_label = result["topicLabel"]["value"]
            score = int(result["score"]["value"])
            #info = self.get_topic_info(topic_url)
            #topics[topic_label] = {"score": score, "info": info}
            topics.append({"topic":topic_label, "score": score, "topic_url": topic_url})
        return topics

    def get_profile_info(self, name):
        """Retrieve profile information given a person's name."""
        full_profile_url = self.search_person(name)
        response = requests.get(full_profile_url)
        
        if response.status_code != 200:
            raise Exception("Failed to fetch profile page")
        
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract profile information
        profile_info = {}
        
        # Get Profile Description
        profile_header = soup.find("h3", string="Profile")
        profile_section = profile_header.find_next("p") if profile_header else None
        profile_info["Profile_desc"] = profile_section.get_text(strip=True) if profile_section else "None"
        
        # Get Keywords
        keywords_section = soup.find("div", class_="keyword-group")
        if keywords_section:
            keywords = [keyword.get_text(strip=True) for keyword in keywords_section.find_all("li", class_="userdefined-keyword")]
            profile_info["Keywords"] = keywords
        else:
            profile_info["Keywords"] = []

        # Get Fingerprint (Concepts, Thesauri, Values)
        fingerprints = []
        fingerprint_section = soup.find("div", class_="person-top-concepts")
        if fingerprint_section:
            fingerprint_items = fingerprint_section.find_all("li", class_="concept-badge-large-container")
            for item in fingerprint_items:
                concept = item.find("span", class_="concept").get_text(strip=True) if item.find("span", class_="concept") else "N/A"
                thesauri = item.find("span", class_="thesauri").get_text(strip=True) if item.find("span", class_="thesauri") else "N/A"
                value = item.find("span", class_="value sr-only").get_text(strip=True) if item.find("span", class_="value sr-only") else "N/A"
                fingerprints.append({
                    "Concept": concept,
                    "Thesauri": thesauri,
                    "Value": value
                })
        profile_info["Fingerprint"] = fingerprints

        # Extract ORCID
        orcid_section = soup.find("div", class_="rendering_person_personorcidrendererportal")
        if orcid_section:
            orcid_link = orcid_section.find("a", href=True)
            profile_info["ORCID"] = orcid_link["href"] if orcid_link else "Not found"
            if orcid_link:
                orcid_id = orcid_link["href"].split("/")[-1]
                profile_info["QS"] = scholia_query.orcid_to_qs(orcid_id)
                # Retrieve Scholia topics if QS exists
                if len(profile_info["QS"]) == 1:
                    profile_info["scholia_topics"] = self.get_scholia_topics(profile_info["QS"][0])
                else:
                    profile_info["scholia_topics"] = {}
        else:
            profile_info["ORCID"] = "Not found"
            profile_info["QS"] = "Not found"
            profile_info["scholia_topics"] = {}

        return profile_info

### Scrape Professors information

In [31]:
scraper = DTUOrbitScraper()

professor_columns = [
    "MAIN_RESPONSIBLE_NAME", "CO_RESPONSIBLE_1_NAME",
    "CO_RESPONSIBLE_2_NAME", "CO_RESPONSIBLE_3_NAME", "CO_RESPONSIBLE_4_NAME"
]

# Extract unique professors from the dataset
def extract_professors(dataframe, professor_columns):
    professors = set()
    for col in professor_columns:
        professors.update(dataframe[col].dropna().unique())
    return list(professors)


def scrape_professor_data(professors, output_file):
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping scraping.")
        return

    all_data = {}

    for professor in tqdm(professors, desc="Scraping Professors"):
        if professor in all_data:
            print(f"Skipping {professor} as it already exists in the JSON file.")
            continue

        try:
            profile_info = scraper.get_profile_info(professor)
            all_data[professor] = profile_info  # Add to dictionary
        except Exception as e:
            print(f"Failed to scrape data for {professor}: {str(e)}")
            continue

    with open(output_file, "w") as f:
        json.dump(all_data, f, indent=4)
    
    print(f"Saved all professor data to {output_file}")


output_file = "data/all_professors.json"
professors = extract_professors(course_df, professor_columns)
scrape_professor_data(professors, output_file)

Scraping Professors:   0%|          | 0/1063 [00:00<?, ?it/s]

Scraping Professors:  14%|█▎        | 144/1063 [03:45<26:08,  1.71s/it]

Failed to scrape data for René Sjøgren Hendriksen: HTTP Error 429: Too Many Requests


Scraping Professors:  37%|███▋      | 394/1063 [10:51<12:23,  1.11s/it]  

Failed to scrape data for Mette Lode Skovbo: 'NoneType' object has no attribute 'find'


Scraping Professors:  47%|████▋     | 501/1063 [13:46<12:37,  1.35s/it]

Failed to scrape data for Peter Bauer-Gottwein: 'NoneType' object has no attribute 'find'


Scraping Professors:  48%|████▊     | 506/1063 [13:55<14:54,  1.61s/it]

Failed to scrape data for Xiaodong Liang: HTTP Error 429: Too Many Requests


Scraping Professors:  49%|████▊     | 517/1063 [14:12<10:39,  1.17s/it]

Failed to scrape data for Jacqueline Eve Stenson: 'NoneType' object has no attribute 'find'


Scraping Professors:  51%|█████     | 540/1063 [14:57<13:26,  1.54s/it]

Failed to scrape data for Stig Christian Herluf S Andersen: 'NoneType' object has no attribute 'find'


Scraping Professors:  55%|█████▍    | 580/1063 [15:58<10:43,  1.33s/it]

Failed to scrape data for Markus Reinmöller: 'NoneType' object has no attribute 'find'


Scraping Professors:  59%|█████▉    | 625/1063 [17:20<11:35,  1.59s/it]

Failed to scrape data for Maria Ingeman: 'NoneType' object has no attribute 'find'


Scraping Professors:  59%|█████▉    | 632/1063 [17:30<07:55,  1.10s/it]

Failed to scrape data for Amelie Sina Wilde: 'NoneType' object has no attribute 'find'


Scraping Professors:  70%|██████▉   | 742/1063 [20:28<06:52,  1.28s/it]

Failed to scrape data for Babak Rezaei: 'NoneType' object has no attribute 'find'


Scraping Professors:  70%|███████   | 748/1063 [20:35<05:29,  1.05s/it]

Failed to scrape data for Jørgen Henrik Klinge Jacobsen: 'NoneType' object has no attribute 'find'


Scraping Professors:  73%|███████▎  | 773/1063 [21:17<06:36,  1.37s/it]

Failed to scrape data for Fengwen Wang: 'NoneType' object has no attribute 'find'


Scraping Professors:  73%|███████▎  | 781/1063 [21:28<04:51,  1.03s/it]

Failed to scrape data for Ida Stub Johansson: 'NoneType' object has no attribute 'find'


Scraping Professors:  76%|███████▋  | 813/1063 [22:24<06:38,  1.59s/it]

Failed to scrape data for Rasmus Eckholdt Andersen: 'NoneType' object has no attribute 'find'


Scraping Professors:  78%|███████▊  | 829/1063 [22:49<04:46,  1.22s/it]

Failed to scrape data for Brian Elmegaard: 'NoneType' object has no attribute 'find'


Scraping Professors:  84%|████████▍ | 896/1063 [24:45<03:54,  1.40s/it]

Failed to scrape data for Carolyn Rutherford: 'NoneType' object has no attribute 'find'


Scraping Professors:  88%|████████▊ | 931/1063 [25:57<04:35,  2.08s/it]

Failed to scrape data for Salla Marjukka Laasonen: 'NoneType' object has no attribute 'find'


Scraping Professors: 100%|██████████| 1063/1063 [29:51<00:00,  1.69s/it]

Saved all professor data to data/all_professors.json





### Load Professors data

In [50]:
with open("data/all_professors.json", "r") as f:
    professors_data = json.load(f)

### Create Bipartite Graph

In [52]:
B = nx.Graph()

professors = extract_professors(course_df, professor_columns)
courses = course_df["COURSE"].unique()

B.add_nodes_from(professors, bipartite=0, type="Professor")  # Professors
B.add_nodes_from(courses, bipartite=1, type="Course")  # Courses

# Add edges based on professor-course relationships
for _, row in course_df.iterrows():
    course = row["COURSE"]
    for col in professor_columns:
        professor = row[col]
        if pd.notna(professor):  
            B.add_edge(professor, course)

# Add metadata to nodes
for professor in professors:
    if professor in professors_data:
        # Add professor metadata (e.g., research areas)
        B.nodes[professor]["Profile_desc"] = professors_data[professor].get("Profile_desc", None)
        B.nodes[professor]["Keywords"] = professors_data[professor].get("Keywords", [])
        B.nodes[professor]["Fingerprint"] = professors_data[professor].get("Fingerprint", [])

for _, row in course_df.iterrows():
    course = row["COURSE"]
    # Add course metadata (e.g., description and objectives)
    B.nodes[course]["COURSE_DESCRIPTION"] = row.get("COURSE_DESCRIPTION", "")
    B.nodes[course]["LEARNING_OBJECTIVES"] = row.get("LEARNING_OBJECTIVES", "")

print(f"Bipartite graph created with {B.number_of_nodes()} nodes and {B.number_of_edges()} edges.")

Bipartite graph created with 2767 nodes and 4802 edges.


In [55]:
# pip install sparqlwrapper
# https://rdflib.github.io/sparqlwrapper/

import sys
from SPARQLWrapper import SPARQLWrapper, JSON

endpoint_url = "https://query.wikidata.org/sparql"

query = """# tool: scholia
#defaultView:BubbleChart
PREFIX target: <http://www.wikidata.org/entity/Q24290415>

SELECT ?score ?topic ?topicLabel
WITH {
  SELECT
    (SUM(?score_) AS ?score)
    ?topic
  WHERE {
    { 
      target: wdt:P101 ?topic .
      BIND(20 AS ?score_)
    }
    UNION
    {
      SELECT (3 AS ?score_) ?topic WHERE {
        ?work wdt:P50 target: ;
              wdt:P921 ?topic . 
      }
    }
    UNION
    {
      SELECT (1 AS ?score_) ?topic WHERE {
        ?work wdt:P50 target: .
        ?citing_work wdt:P2860 ?work .
        ?citing_work wdt:P921 ?topic . 
      }
    }
  }
  GROUP BY ?topic
} AS %results 
WHERE {
  INCLUDE %results
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],mul,en,da,de,es,jp,no,ru,sv,zh". }
}
ORDER BY DESC(?score)
LIMIT 200"""


def get_results(endpoint_url, query):
    user_agent = "WDQS-example Python/%s.%s" % (sys.version_info[0], sys.version_info[1])
    # TODO adjust user agent; see https://w.wiki/CX6
    sparql = SPARQLWrapper(endpoint_url, agent=user_agent)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)
    return sparql.query().convert()


results = get_results(endpoint_url, query)

for result in results["results"]["bindings"]:
    print(result)


{'topic': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q197536'}, 'score': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '80'}, 'topicLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'deep learning'}}
{'topic': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q310899'}, 'score': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '60'}, 'topicLabel': {'xml:lang': 'en', 'type': 'literal', 'value': 'microRNA'}}
{'topic': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q719725'}, 'score': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '53'}, 'topicLabel': {'xml:lang': 'mul', 'type': 'literal', 'value': 'Saccharomyces cerevisiae'}}
{'topic': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q252857'}, 'score': {'datatype': 'http://www.w3.org/2001/XMLSchema#integer', 'type': 'literal', 'value': '51'}, 'topicLabel': {'xml:lang': 'en', 'type':