# From Lab  to Lecture: 

## Analyzing the Connection Between Professors’ Research and Course Content​ 👩‍🏫

**Authors**:
- Erik Wold Riise, s194633​
- Lukas Rasocha, s233498​
- Zou Yong Nan Klaassen, s230351

<center><img src="assets/intro.png" width="1000"  /></center>

*Image Prompt: minimalistic network visualization with two nodes: one representing a professor and the other a course they teach, connected by a single edge*

### Project Overview
This project investigates the alignment between professors’ research areas and the courses they teach through the angle of network analysis and natural language processing (NLP). By constructing a bipartite graph of professors and courses, we aim to explore structural and thematic patterns in teaching and research connections.

The central research question steering the project is:
*"How well do professors’ research areas align with the content and objectives of the courses they teach, and how does this alignment vary across disciplines?"*

To complement this, we also examine:
*"Does the alignment between professors’ research and the courses they teach influence student satisfaction and performance (grades)?"*

Using NLP techniques, we analyze course descriptions and research topics to measure alignment, and we relate these findings to course evaluations and grades. Additionally, network analysis methods, such as community detection and centrality measures, are applied to uncover interdisciplinary trends and the influence of professors within the academic network. 

By this we hope to shed light on how expertise and teaching intersect, and how does that impact educational outcomes in a broader sense.

### Imports

In [34]:
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import os
import json
from tqdm import tqdm

import requests
from bs4 import BeautifulSoup
from scholia import query

### Load Data

In [35]:
file_path = 'data/course_df.csv'
course_df = pd.read_csv(file_path)

### DTU Orbit Scraper class

In [36]:
class DTUOrbitScraper:
    def __init__(self):
        self.base_url = "https://orbit.dtu.dk/en/persons/"
        
    def search_person(self, name):
        """Search for the person and get the URL to their profile."""
        search_url = f"{self.base_url}?search={name.replace(' ', '+')}&isCopyPasteSearch=false"
        response = requests.get(search_url)
        
        if response.status_code != 200:
            raise Exception("Failed to fetch search results")
        
        soup = BeautifulSoup(response.text, "html.parser")
        # Find the first profile link (assuming it's the first result)
        profile_link = soup.find("h3", class_="title").find("a", href=True)
        
        if profile_link:
            return profile_link['href']
        else:
            raise Exception("Profile link not found")

    def get_profile_info(self, name):
        """Retrieve profile information given a person's name."""
        full_profile_url = self.search_person(name)
        response = requests.get(full_profile_url)
        
        if response.status_code != 200:
            raise Exception("Failed to fetch profile page")
        
        soup = BeautifulSoup(response.text, "html.parser")
        
        # Extract profile information
        profile_info = {}
        
        # Get Profile Description
        profile_header = soup.find("h3", string="Profile")
        profile_section = profile_header.find_next("p") if profile_header else None
        profile_info["Profile_desc"] = profile_section.get_text(strip=True) if profile_section else "None"
        
        # Get Keywords
        keywords_section = soup.find("div", class_="keyword-group")
        if keywords_section:
            keywords = [keyword.get_text(strip=True) for keyword in keywords_section.find_all("li", class_="userdefined-keyword")]
            profile_info["Keywords"] = keywords
        else:
            profile_info["Keywords"] = []

        # Get Fingerprint (Concepts, Thesauri, Values)
        fingerprints = []
        fingerprint_section = soup.find("div", class_="person-top-concepts")
        if fingerprint_section:
            fingerprint_items = fingerprint_section.find_all("li", class_="concept-badge-large-container")
            for item in fingerprint_items:
                concept = item.find("span", class_="concept").get_text(strip=True) if item.find("span", class_="concept") else "N/A"
                thesauri = item.find("span", class_="thesauri").get_text(strip=True) if item.find("span", class_="thesauri") else "N/A"
                value = item.find("span", class_="value sr-only").get_text(strip=True) if item.find("span", class_="value sr-only") else "N/A"
                fingerprints.append({
                    "Concept": concept,
                    "Thesauri": thesauri,
                    "Value": value
                })
        profile_info["Fingerprint"] = fingerprints

        # Get ORCID
        orcid_section = soup.find("div", class_="rendering_person_personorcidrendererportal")
        if orcid_section:
            orcid_link = orcid_section.find("a", href=True)
            profile_info["ORCID"] = orcid_link["href"] if orcid_link else "Not found"

            profile_info["QS"] = query.orcid_to_qs(orcid_link["href"].split("/")[-1]) if orcid_link else "Not found"
        else:
            profile_info["ORCID"] = "Not found"
        return profile_info

### Scrape Professors information

In [37]:
scraper = DTUOrbitScraper()

professor_columns = [
    "MAIN_RESPONSIBLE_NAME", "CO_RESPONSIBLE_1_NAME",
    "CO_RESPONSIBLE_2_NAME", "CO_RESPONSIBLE_3_NAME", "CO_RESPONSIBLE_4_NAME"
]

# Extract unique professors from the dataset
def extract_professors(dataframe, professor_columns):
    professors = set()
    for col in professor_columns:
        professors.update(dataframe[col].dropna().unique())
    return list(professors)


# Function to scrape professor data
def scrape_professor_data(professors, output_file):
    # if file already exists just cancel and return with a print statement
    if os.path.exists(output_file):
        print(f"File {output_file} already exists. Skipping scraping.")
        return

    all_data = {}
    
    for professor in tqdm(professors, desc="Scraping Professors"):
        # Skip if the professor already exists in the JSON file
        if professor in all_data:
            print(f"Skipping {professor} as it already exists in the JSON file.")
            continue

        try:
            # Use the scraper to get the profile data
            profile_info = scraper.get_profile_info(professor)
            all_data[professor] = profile_info  # Add to dictionary
        except Exception as e:
            print(f"Failed to scrape data for {professor}: {str(e)}")
            continue

    with open(output_file, "w") as f:
        json.dump(all_data, f, indent=4)
    
    print(f"Saved all professor data to {output_file}")


output_file = "data/all_professors.json"
professors = extract_professors(course_df, professor_columns)
scrape_professor_data(professors, output_file)

Scraping Professors:   7%|▋         | 75/1063 [01:34<18:31,  1.13s/it]

Failed to scrape data for Jacqueline Eve Stenson: 'NoneType' object has no attribute 'find'


Scraping Professors:  11%|█         | 112/1063 [02:26<18:29,  1.17s/it]

Failed to scrape data for Carolyn Rutherford: 'NoneType' object has no attribute 'find'


Scraping Professors:  25%|██▍       | 264/1063 [11:25<12:26,  1.07it/s]    

Failed to scrape data for Brian Elmegaard: 'NoneType' object has no attribute 'find'


Scraping Professors:  26%|██▋       | 280/1063 [11:44<12:19,  1.06it/s]

Failed to scrape data for Salla Marjukka Laasonen: 'NoneType' object has no attribute 'find'


Scraping Professors:  32%|███▏      | 338/1063 [12:57<12:22,  1.02s/it]

Failed to scrape data for Peter Bauer-Gottwein: 'NoneType' object has no attribute 'find'


Scraping Professors:  41%|████      | 431/1063 [14:57<10:56,  1.04s/it]

Failed to scrape data for Ida Stub Johansson: 'NoneType' object has no attribute 'find'


Scraping Professors:  57%|█████▋    | 604/1063 [18:28<07:06,  1.08it/s]

Failed to scrape data for Fengwen Wang: 'NoneType' object has no attribute 'find'


Scraping Professors:  61%|██████    | 649/1063 [19:22<06:46,  1.02it/s]

Failed to scrape data for Babak Rezaei: 'NoneType' object has no attribute 'find'


Scraping Professors:  76%|███████▌  | 809/1063 [22:41<04:33,  1.08s/it]

Failed to scrape data for Maria Ingeman: 'NoneType' object has no attribute 'find'


Scraping Professors:  77%|███████▋  | 821/1063 [22:55<03:40,  1.10it/s]

Failed to scrape data for Mette Lode Skovbo: 'NoneType' object has no attribute 'find'


Scraping Professors:  79%|███████▉  | 843/1063 [23:21<03:35,  1.02it/s]

Failed to scrape data for Markus Reinmöller: 'NoneType' object has no attribute 'find'


Scraping Professors:  91%|█████████ | 969/1063 [25:56<01:24,  1.11it/s]

Failed to scrape data for Jørgen Henrik Klinge Jacobsen: 'NoneType' object has no attribute 'find'


Scraping Professors:  93%|█████████▎| 993/1063 [26:26<01:15,  1.08s/it]

Failed to scrape data for Amelie Sina Wilde: 'NoneType' object has no attribute 'find'


Scraping Professors:  97%|█████████▋| 1033/1063 [27:15<00:27,  1.09it/s]

Failed to scrape data for Stig Christian Herluf S Andersen: 'NoneType' object has no attribute 'find'


Scraping Professors:  98%|█████████▊| 1041/1063 [27:24<00:20,  1.10it/s]

Failed to scrape data for Rasmus Eckholdt Andersen: 'NoneType' object has no attribute 'find'


Scraping Professors: 100%|██████████| 1063/1063 [27:53<00:00,  1.57s/it]

Saved all professor data to data/all_professors.json





In [None]:

# Define columns of interest for constructing the network
professor_columns = [
    "MAIN_RESPONSIBLE_NAME", "CO_RESPONSIBLE_1_NAME",
    "CO_RESPONSIBLE_2_NAME", "CO_RESPONSIBLE_3_NAME", "CO_RESPONSIBLE_4_NAME"
]

course_columns = [
    "COURSE", "NAME", "ECTS_POINTS", "COURSE_TYPE",
    "AVERAGE_GRADE", "PERCENT_PASSED", "PERCENT_FAILED"
]

# Extract course attributes and professor-course relationships
course_nodes = course_df[course_columns].drop_duplicates(subset=["COURSE"]).set_index("COURSE").to_dict("index")

edges = []
for _, row in course_df.iterrows():
    course_id = row["COURSE"]
    for professor_column in professor_columns:
        professor_name = row[professor_column]
        if pd.notna(professor_name):  # Ensure the name is valid
            edges.append((course_id, professor_name))

# Extract unique professor names for creating professor nodes
professor_names = set([edge[1] for edge in edges])

# Initialize a NetworkX graph
B = nx.Graph()

# Add course nodes with attributes
for course_id, attributes in course_nodes.items():
    B.add_node(course_id, **attributes, bipartite=0)

# Add professor nodes without attributes for now (attributes will be added after scraping)
for professor_name in professor_names:
    B.add_node(professor_name, bipartite=1)

# Add edges between courses and professors
B.add_edges_from(edges)

In [None]:
node_colors = [
    "skyblue" if B.nodes[node].get("bipartite") == 0 else "lightgreen"
    for node in B.nodes
]


# Layout for bipartite graph (spring layout for aesthetics)
pos = nx.spring_layout(B, k=0.3, iterations=50, seed=42)  # Adjusted for better spacing

# Plot the network
plt.figure(figsize=(15, 15))
nx.draw(
    B,
    pos,
    with_labels=False,  # Disable labels for a cleaner look
    node_color=node_colors,
    edge_color="gray",
    alpha=0.7,
    linewidths=0.5
)

# Add legend
handles = [
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='skyblue', markersize=10, label='Courses'),
    plt.Line2D([0], [0], marker='o', color='w', markerfacecolor='lightgreen', markersize=10, label='Professors')
]
plt.legend(handles=handles, loc="upper right", fontsize=12, frameon=True)

plt.title("Bipartite Network of Courses and Professors", fontsize=16)
plt.show()