In [None]:
import pandas as pd
from keybert import KeyBERT

In [None]:
# Load the raw course table. Later cells read a `Year` column from it and treat
# the columns from index 1 onward as per-course description text.
df = pd.read_csv('data/courses.csv')
df.head()

In [None]:
import torch

# Pick the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print(f'Using device: {device}')
# CUDA version PyTorch was built against (None for CPU-only builds).
print(torch.version.cuda)

In [None]:
# Shared KeyBERT extractor built with its default arguments; `extract_keywords`
# below calls it for every course description in the table.
kw_model = KeyBERT()


def extract_keywords(course_description, n_gram, top_n=30):
    """Extract up to ``top_n`` key phrases of exactly ``n_gram`` words from a description.

    Args:
        course_description: Raw cell value from the course table. Anything that
            is not a string (e.g. NaN for a missing cell) is treated as "no text".
        n_gram: Number of words each extracted phrase must have; used as both
            the lower and upper bound of ``keyphrase_ngram_range``.
        top_n: Maximum number of phrases requested from KeyBERT (default 30,
            the value previously hard-coded).

    Returns:
        A list with the first element of each KeyBERT result (the phrase text),
        or an empty list when the cell holds no usable description.
    """
    if not isinstance(course_description, str):
        return []

    # '-' marks a course that was not offered; tolerate surrounding whitespace
    # so values such as ' - ' or '   ' are not sent to the model.
    if course_description.strip() in ('-', ''):
        return []

    keywords = kw_model.extract_keywords(
        course_description,
        stop_words='english',
        top_n=top_n,
        use_mmr=True,
        keyphrase_ngram_range=(n_gram, n_gram),
    )
    return [kw[0] for kw in keywords]


# Suffix of each derived column, mapped to the n-gram length that fills it.
derived_suffixes = {'_keywords': 1, '_2word_phrases': 2, '_3word_phrases': 3}

# Snapshot the source columns and skip columns this cell itself created, so that
# re-running the cell does not try to extract keywords from keyword lists and
# does not add '*_keywords_keywords' style columns.
source_columns = [
    column for column in df.columns[1:]
    if not str(column).endswith(tuple(derived_suffixes))
]

for course in source_columns:
    for suffix, n_gram in derived_suffixes.items():
        # Bind n_gram as a default argument so each lambda keeps its own value.
        df[f'{course}{suffix}'] = df[course].apply(lambda x, n=n_gram: extract_keywords(x, n))

print("Updated DataFrame with keywords and phrases:")
print(df)

In [None]:
# List the column names, which now include the derived keyword/phrase columns.
df.columns

In [None]:
# Preview the first rows together with the derived keyword/phrase columns.
df.head()

In [None]:
from scipy.stats import chisquare

# Course columns included in the chi-squared comparison below. Each name is used
# to index `df` directly and to build the `<name>_keywords`, `<name>_2word_phrases`
# and `<name>_3word_phrases` column names, so it must match a column header exactly.
courses = [
    'Software Architecture',
    'Software Testing',
    'Requirements Engineering',
    'Preparation Masterproject Software Engineering',
    'Software Process',
    'Software Construction',
    'Masterproject Software Engineering',
    'Software Evolution',
    'Software Architecture (VU)',
    'Software Specification, Verification and Testing',
    'Embedded Software and Systems',
    'DevOps and Cloud-based Software',
    'Model-based Design of Cyber-physical Systems'
]


def count_phrases(text, phrases):
    """Count case-insensitive occurrences of every phrase in ``text``.

    Matching is plain substring matching via ``str.count`` (non-overlapping,
    no word-boundary check), so 'test' is also counted inside 'testing'.

    Args:
        text: The description to search. Non-string values (e.g. NaN for a
            missing cell, or None) contain no phrases and yield 0.
        phrases: Iterable of phrases to look for. Empty or non-string entries
            are skipped; an empty phrase would otherwise match at every
            position and inflate the total.

    Returns:
        The summed number of occurrences over all phrases.
    """
    if not isinstance(text, str):
        return 0

    lowered = text.lower()
    return sum(
        lowered.count(phrase.lower())
        for phrase in phrases
        if isinstance(phrase, str) and phrase
    )


results = {}

# Derived column suffixes, in the order the observed totals are compared:
# single keywords, 2-word phrases, 3-word phrases.
phrase_suffixes = ('_keywords', '_2word_phrases', '_3word_phrases')

for course in courses:
    # For each phrase type, total how often the phrases extracted for a row
    # occur in that same row's original text, summed over all rows.
    total_observed = [
        sum(
            count_phrases(original_text, phrases)
            for original_text, phrases in zip(df[course], df[f'{course}{suffix}'])
        )
        for suffix in phrase_suffixes
    ]

    if sum(total_observed) == 0:
        # Nothing was counted for this course: the expected frequencies would
        # all be zero and the statistic undefined (0/0), so skip the test and
        # record NaN explicitly.
        chi2_stat, p_value = float('nan'), float('nan')
    else:
        # With no expected frequencies given, chisquare tests against equal
        # frequencies in every category, i.e. total / number of categories.
        chi2_stat, p_value = chisquare(total_observed)

    results[course] = {
        'Chi-Squared Statistic': chi2_stat,
        'P-value': p_value,
        # A NaN p-value compares False, so untestable courses are reported as
        # not significant.
        'Significant Difference': p_value < 0.05
    }

# Report the outcome of the test for every course, one indented block each.
for course, result in results.items():
    verdict = 'Yes' if result['Significant Difference'] else 'No'
    report_lines = [
        f"Course: {course}",
        f"  Chi-Squared Statistic: {result['Chi-Squared Statistic']:.4f}",
        f"  P-value: {result['P-value']:.4e}",
        f"  Significant Difference: {verdict}\n",
    ]
    for report_line in report_lines:
        print(report_line)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


def create_check_matrix(df, column_name):
    """Build a 0/1 matrix showing in which years each phrase appears.

    Args:
        df: Frame with a 'Year' column and a list-valued ``column_name`` column.
        column_name: Name of the column holding one list of phrases per row.

    Returns:
        A DataFrame indexed by phrase with one column per year; a cell is 1
        when the phrase occurs at least once for that year and 0 otherwise.
    """
    # One row per (year, phrase) pair instead of one list per year.
    long_form = df.explode(column_name)

    # Occurrence counts with years as rows and phrases as columns.
    counts = long_form.groupby(['Year', column_name]).size().unstack(fill_value=0)

    # Collapse counts to presence flags and put phrases on the rows.
    return counts.gt(0).astype(int).T


# Derived columns of the 'Software Testing' course to visualise.
keyword_column = 'Software Testing_keywords'
two_word_column = 'Software Testing_2word_phrases'
three_word_column = 'Software Testing_3word_phrases'

keyword_presence_matrix = create_check_matrix(df, keyword_column)
two_word_presence_matrix = create_check_matrix(df, two_word_column)
three_word_presence_matrix = create_check_matrix(df, three_word_column)

fig, axes = plt.subplots(nrows=3, ncols=1, figsize=(12, 18))

# One stacked panel per phrase length: (matrix, panel title, y-axis label).
panels = [
    (keyword_presence_matrix, 'Keyword Presence Over Years', 'Keywords'),
    (two_word_presence_matrix, '2-Word Phrase Presence Over Years', '2-Word Phrases'),
    (three_word_presence_matrix, '3-Word Phrase Presence Over Years', '3-Word Phrases'),
]

for ax, (matrix, panel_title, y_label) in zip(axes, panels):
    sns.heatmap(matrix, annot=True, cmap='binary', fmt='d', ax=ax)
    ax.set_title(panel_title, fontsize=16)
    ax.set_xlabel('Year', fontsize=14)
    ax.set_ylabel(y_label, fontsize=14)

plt.tight_layout()
plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

# The 13 original course columns; assumes they directly follow the first column
# and that derived keyword columns were appended after them.
course_columns = df.columns[1:14]
relevant_df = df[['Year'] + list(course_columns)]


def _is_offered(description):
    """Return 1 for a real course description, 0 for '-', '' or a missing value.

    Uses the same notion of "no description" as `extract_keywords`, so a NaN or
    empty cell is no longer counted as an offered course.
    """
    return int(isinstance(description, str) and description.strip() not in ('-', ''))


# Rows are years, columns are courses; every cell becomes a 0/1 presence flag.
presence_matrix = relevant_df.set_index('Year').apply(lambda column: column.map(_is_offered))

# Courses on the y-axis, years on the x-axis.
presence_matrix_transposed = presence_matrix.T

plt.figure(figsize=(12, 8))
sns.heatmap(presence_matrix_transposed, annot=False, cmap='YlGn', cbar=False, fmt='d', linewidths=0.5)
plt.title('Course Presence Over Years', fontsize=16)
plt.xlabel('Year', fontsize=14)
plt.ylabel('Course', fontsize=14)

# Later cells reuse `img_folder` when saving their own figures.
img_folder = 'img'
os.makedirs(img_folder, exist_ok=True)

file_path = os.path.join(img_folder, 'course_presence_heatmap.png')

plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"Heatmap saved to {file_path}")


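Reference: pretrained Sentence-Transformers models, including the `all-mpnet-base-v2` model used in the cells below: https://sbert.net/docs/sentence_transformer/pretrained_models.html#original-models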

In [None]:

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Flatten the per-row keyword lists of each course into one flat list per course.
embedded_keywords = []
for row_keywords in df['Embedded Software and Systems_keywords'].dropna().values:
    embedded_keywords.extend(row_keywords)

model_based_keywords = []
for row_keywords in df['Model-based Design of Cyber-physical Systems_keywords'].dropna().values:
    model_based_keywords.extend(row_keywords)

all_keywords = [*embedded_keywords, *model_based_keywords]

# Embed every keyword, then project the embeddings onto two principal components.
model = SentenceTransformer('all-mpnet-base-v2')
keyword_embeddings = model.encode(all_keywords)

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(keyword_embeddings)

# Colour code: 0 for the first course's keywords, 1 for the second's.
labels = [0 for _ in embedded_keywords] + [1 for _ in model_based_keywords]

plt.figure(figsize=(10, 6))

xs = reduced_embeddings[:, 0]
ys = reduced_embeddings[:, 1]
scatter = plt.scatter(xs, ys, c=labels, cmap='coolwarm', edgecolors='k', s=100)

# Write each keyword just above its point.
for keyword, x, y in zip(all_keywords, xs, ys):
    plt.annotate(keyword, (x, y), textcoords="offset points", xytext=(0, 5), ha='center')

handles, _ = scatter.legend_elements()
course_names = ["Embedded Software and Systems", "Model-based Design of Cyber-physical Systems"]
plt.legend(handles, course_names, title="Course")

plt.title("PCA Scatter Plot of Keyword Embeddings by Course")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the full (un-projected) keyword embeddings.
cos_sim_matrix = cosine_similarity(keyword_embeddings)

# Keyword pairs more similar than this are connected by a line.
similarity_threshold = 0.6

plt.figure(figsize=(20, 10))

scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='coolwarm', edgecolors='k',
                      s=100)

for i, keyword in enumerate(all_keywords):
    plt.annotate(keyword, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]),
                 textcoords="offset points", xytext=(0, 5), ha='center')

# Visit each unordered pair once (j > i) and draw an edge between similar keywords.
for i in range(len(all_keywords)):
    for j in range(i + 1, len(all_keywords)):
        if cos_sim_matrix[i, j] > similarity_threshold:
            plt.plot([reduced_embeddings[i, 0], reduced_embeddings[j, 0]],
                     [reduced_embeddings[i, 1], reduced_embeddings[j, 1]],
                     'k-', lw=0.5, alpha=0.6)

handles, _ = scatter.legend_elements()
plt.legend(handles, ["Embedded Software and Systems", "Model-based Design of Cyber-physical Systems"], title="Course")

plt.title("PCA Scatter Plot of Keyword Embeddings with Cosine Similarity")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)

# `img_folder` comes from the course-presence cell; make sure it exists before saving.
os.makedirs(img_folder, exist_ok=True)
file_path = os.path.join(img_folder, 'embedded-cyber-pca.png')

plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

# The figure is a scatter plot, not a heatmap.
print(f"Scatter plot saved to {file_path}")

In [None]:

# Persist the table together with the derived keyword/phrase columns; the row
# index is not written.
# NOTE(review): the derived columns hold Python lists, which end up as text in
# the CSV — confirm that whatever reads this file parses them back.
df.to_csv('data/courses_with_30keywords.csv', index=False)

In [None]:

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Flatten the per-row keyword lists of each course into one flat list per course.
ssvt_keywords = []
for row_keywords in df['Software Specification, Verification and Testing_keywords'].dropna().values:
    ssvt_keywords.extend(row_keywords)

testing_keywords = []
for row_keywords in df['Software Testing_keywords'].dropna().values:
    testing_keywords.extend(row_keywords)

all_keywords = [*ssvt_keywords, *testing_keywords]

# Embed every keyword, then project the embeddings onto two principal components.
model = SentenceTransformer('all-mpnet-base-v2')
keyword_embeddings = model.encode(all_keywords)

pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(keyword_embeddings)

# Colour code: 0 for the first course's keywords, 1 for the second's.
labels = [0 for _ in ssvt_keywords] + [1 for _ in testing_keywords]

plt.figure(figsize=(10, 6))

xs = reduced_embeddings[:, 0]
ys = reduced_embeddings[:, 1]
scatter = plt.scatter(xs, ys, c=labels, cmap='coolwarm', edgecolors='k', s=100)

# Write each keyword just above its point.
for keyword, x, y in zip(all_keywords, xs, ys):
    plt.annotate(keyword, (x, y), textcoords="offset points", xytext=(0, 5), ha='center')

handles, _ = scatter.legend_elements()
course_names = ["Software Specification, Verification and Testing", "Software Testing"]
plt.legend(handles, course_names, title="Course")

plt.title("PCA Scatter Plot of Keyword Embeddings by Course")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the full (un-projected) keyword embeddings.
cos_sim_matrix = cosine_similarity(keyword_embeddings)

# Keyword pairs more similar than this are connected by a line.
similarity_threshold = 0.6

plt.figure(figsize=(20, 10))

scatter = plt.scatter(reduced_embeddings[:, 0], reduced_embeddings[:, 1], c=labels, cmap='coolwarm', edgecolors='k',
                      s=100)

for i, keyword in enumerate(all_keywords):
    plt.annotate(keyword, (reduced_embeddings[i, 0], reduced_embeddings[i, 1]),
                 textcoords="offset points", xytext=(0, 5), ha='center')

# Visit each unordered pair once (j > i) and draw an edge between similar keywords.
for i in range(len(all_keywords)):
    for j in range(i + 1, len(all_keywords)):
        if cos_sim_matrix[i, j] > similarity_threshold:
            plt.plot([reduced_embeddings[i, 0], reduced_embeddings[j, 0]],
                     [reduced_embeddings[i, 1], reduced_embeddings[j, 1]],
                     'k-', lw=0.5, alpha=0.6)

handles, _ = scatter.legend_elements()
plt.legend(handles, ["Software Specification, Verification and Testing", "Software Testing"], title="Course")

plt.title("PCA Scatter Plot of Keyword Embeddings with Cosine Similarity")
plt.xlabel("PCA Component 1")
plt.ylabel("PCA Component 2")
plt.grid(True)

# `img_folder` comes from the course-presence cell; make sure it exists before saving.
os.makedirs(img_folder, exist_ok=True)
file_path = os.path.join(img_folder, 'ssvt-testing-pca.png')

plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

# The figure is a scatter plot, not a heatmap.
print(f"Scatter plot saved to {file_path}")

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# The 13 original course columns; assumes they directly follow the first column.
course_columns = df.columns[1:14]

# Concatenate every year's description of a course into one text per course.
course_contents = {}
for course in course_columns:
    # astype(str) keeps the .str accessor usable even if a column is entirely missing.
    descriptions = df[course].dropna().astype(str)
    # '-' / empty cells mean "not offered that year" (the same convention as
    # extract_keywords); leave them out so they do not end up in the embedded text.
    descriptions = descriptions[~descriptions.str.strip().isin(['-', ''])]
    course_contents[course] = descriptions.str.cat(sep=' ')

courses_df = pd.DataFrame(list(course_contents.items()), columns=['Course', 'Content'])

model = SentenceTransformer('all-mpnet-base-v2')

course_embeddings = model.encode(courses_df['Content'].tolist())

# Course-by-course cosine similarity, labelled with the course names on both axes.
similarity_matrix = cosine_similarity(course_embeddings)

similarity_df = pd.DataFrame(similarity_matrix, index=courses_df['Course'], columns=courses_df['Course'])

plt.figure(figsize=(16, 9))
sns.heatmap(similarity_df, annot=True, cmap='coolwarm', fmt=".2f", square=True, cbar_kws={"shrink": .8})
plt.title('Course Semantic Similarity Heatmap')
plt.xlabel('Courses')
plt.ylabel('Courses')
plt.xticks(rotation=90)

# `os` and `img_folder` come from the course-presence cell earlier in the notebook.
file_path = os.path.join(img_folder, 'similarity-courses.png')

plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"Heatmap saved to {file_path}")

In [None]:

# Reload the raw course table into a separate frame (`c_df`), leaving `df` and
# its derived keyword columns untouched.
c_df = pd.read_csv('data/courses.csv')
c_df.head()

In [None]:

def _join_descriptions(row):
    """Join one row's real descriptions into a single space-separated string.

    Missing values and the '-' / empty placeholders are skipped; previously a
    missing cell was stringified to the literal text 'nan' and placeholders
    were kept, so both ended up in the combined text.
    """
    return ' '.join(
        str(value) for value in row
        if pd.notna(value) and str(value).strip() not in ('-', '')
    )


# Combine every column after the first into one text per row.
c_df['combined'] = c_df.iloc[:, 1:].apply(_join_descriptions, axis=1)

# Keep only the first column and the new 'combined' column.
c_df.drop(c_df.columns[1:-1], axis=1, inplace=True)

c_df.to_csv('data/courses_combined.csv', index=False)

# Last expression of the cell, so the preview is actually displayed.
c_df.head()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

model = SentenceTransformer('all-mpnet-base-v2')

# One embedding per row of `c_df`, computed from its combined course text.
combined_texts = c_df['combined'].tolist()
year_embeddings = model.encode(combined_texts)

# Pairwise cosine similarity, labelled by year on both axes.
similarity_matrix = cosine_similarity(year_embeddings)
year_labels = c_df['Year']
similarity_df = pd.DataFrame(similarity_matrix, index=year_labels, columns=year_labels)

plt.figure(figsize=(16, 9))
heatmap_options = dict(annot=True, cmap='coolwarm', fmt=".2f", square=True, cbar_kws={"shrink": .8})
sns.heatmap(similarity_df, **heatmap_options)
plt.title('Yearly Semantic Similarity Heatmap')
plt.xlabel('Year')
plt.ylabel('Year')
plt.xticks(rotation=90)
plt.yticks()
plt.tight_layout()

# `os` and `img_folder` are defined by an earlier cell.
file_path = os.path.join(img_folder, 'similarity-years.png')
plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"Heatmap saved to {file_path}")


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os

# Learning-outcome statements; each sentence is embedded below and compared with
# every course's combined text. The list order defines the 'Outcome N' column
# labels of the resulting similarity table (Outcome 1 is the first entry).
learning_outcomes = [
    "Graduates are familiar with the most relevant theories, methods and techniques in the domain of Software Engineering.",
    "Graduates have the necessary background knowledge to familiarise themselves with novel methods and techniques for life-long learning.",
    "Graduates can successfully apply theory in practice in order to find innovative solutions for both general and domain-specific software engineering problems.",
    "Graduates can make valuable contributions to complex software engineering projects through the independent and critical application of academic knowledge and skills.",
    "Graduates have sufficient technical understanding and intellectual capacity to play, after some years of practical experience, a managerial or advisory role in software engineering.",
    "Graduates can clearly report their findings, both in oral and in written form, and can explain problems at an audience-focused level of abstraction.",
    "Graduates have research skills at the academic level and are capable to autonomously perform research in the domain of software engineering.",
    "Graduates understand why user needs are difficult to express, capture and understand and graduates are familiar with best practices in requirements engineering as well as their shortcomings.",
    "Graduates are able to produce formal specifications of modest-sized samples of software and to use them for the generation of meaningful tests; they understand the essential concepts of software verification.",
    "Graduates master the methods and techniques for analysing existing software systems and their evolution in the context of changing requirements.",
    "Graduates are familiar with the characteristics of software for embedded systems and know how to accommodate these characteristics in the software design and development phases.",
    "Graduates understand why big software projects are prone to failure and are familiar with software engineering process models, their situation-awareness and their general shortcomings.",
    "Graduates are familiar with the concept of DevOps and their benefits for organisational IT infrastructure and services management; they understand how to build cloud-based applications and how to use cloud automation tools across a wide range of application scenarios."
]

model = SentenceTransformer('all-mpnet-base-v2')

# The 13 original course columns; assumes they directly follow the first column.
course_columns = df.columns[1:14]

# Concatenate every year's description of a course into one text per course.
course_contents = {}
for course in course_columns:
    # astype(str) keeps the .str accessor usable even if a column is entirely missing.
    descriptions = df[course].dropna().astype(str)
    # '-' / empty cells mean "not offered that year" (the same convention as
    # extract_keywords); leave them out so they do not end up in the embedded text.
    descriptions = descriptions[~descriptions.str.strip().isin(['-', ''])]
    course_contents[course] = descriptions.str.cat(sep=' ')

courses_df = pd.DataFrame(list(course_contents.items()), columns=['Course', 'Content'])

course_embeddings = model.encode(courses_df['Content'].tolist())

outcome_embeddings = model.encode(learning_outcomes)

# Rows are courses, columns are learning outcomes in list order.
similarity_scores = cosine_similarity(course_embeddings, outcome_embeddings)

similarity_df = pd.DataFrame(similarity_scores, index=courses_df['Course'],
                             columns=[f'Outcome {i + 1}' for i in range(len(learning_outcomes))])

print("Similarity Scores between Courses and Learning Outcomes:")
print(similarity_df)

plt.figure(figsize=(16, 9))
sns.heatmap(similarity_df, annot=True, cmap='coolwarm', fmt=".2f", square=True, cbar_kws={"shrink": .8})
plt.title('Course-Level Semantic Similarity to Learning Outcomes')
plt.xlabel('Learning Outcomes')
plt.ylabel('Courses')
plt.xticks(rotation=45, ha='right')

# This cell sets its own output folder, so make sure it exists before saving.
img_folder = 'img'
os.makedirs(img_folder, exist_ok=True)
file_path = os.path.join(img_folder, 'similarity-courses-to-outcomes-individual.png')

plt.savefig(file_path, bbox_inches='tight', dpi=300)
plt.close()

print(f"Heatmap saved to {file_path}")
