In [20]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import json

from scripts.helpers import load_courses

In [21]:
courses = load_courses('./data/generated/')[0]
len(courses)

21485

In [27]:

# Extract text data from courses for TF-IDF processing
# We'll combine relevant text fields from each course to create a corpus
corpus = []
course_codes = []

for course in courses:
    # Combine relevant text fields into a single document
    # document = f"{course['NAME']} {course['SYLLABUS']} {course['OBJECTIVES']} {course['LEARNING_OUTCOMES']} {course['DESCRIPTION']}"
    document = json.dumps(course, ensure_ascii=False)

    # Add keywords if available
    if 'KEYWORDS' in course and course['KEYWORDS']:
        document += " " + " ".join(course['KEYWORDS'])

    corpus.append(document)
    course_codes.append(course['CODE'])

# Initialize and fit the TF-IDF vectorizer
# max_features limits the vocabulary size to the most important terms
# stop_words removes common English words that don't carry much meaning
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    lowercase=True,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True  # Apply sublinear tf scaling (1 + log(tf))
)

# Transform the corpus into TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"Number of features (terms): {len(feature_names)}")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


Number of features (terms): 5000
TF-IDF matrix shape: (21485, 5000)


In [28]:
# Example: Get the top 5 terms for the first course
def get_top_terms(doc_idx, top_n=5):
    feature_index = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[doc_idx, x] for x in feature_index])
    sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    return [(feature_names[idx], score) for idx, score in sorted_scores[:top_n]]

# Display top terms for the first course
if len(courses) > 0:
    print(f"\nTop terms for course {course_codes[0]} - {courses[0]['NAME']}:")
    for term, score in get_top_terms(0):
        print(f"  {term}: {score:.4f}")


Top terms for course bk4001 - Methodology:
  renata: 0.1705
  citation: 0.1677
  hendl: 0.1526
  statistical: 0.1363
  kvalitativní: 0.1357


In [29]:
code = "IB111"

for course in courses:
    if course['CODE'] == code:
        print(course)
        break


{'CODE': 'IB111', 'FACULTY': 'FI', 'NAME': 'Foundations of Programming', 'LANGUAGE': 'čeština', 'SEMESTER': 'podzim 2024', 'CREDITS': '5', 'DEPARTMENT': 'KPSK', 'TEACHERS': 'Beneš, N. - Bartek, F. - Bednařík, K. - Borošová, K. - Brdečko, V. - Bukor, O. - Burget, J. - Čepela, S. - Focko, M. - Foltýnek, T. - Glosner, R. - Jedelský, J. - Juračková, N. - Kasprzaková, I. - Lukačovič, B. - Marek, T. - Melkovič, D. - Pastva, S. - Patlevič, M. - Rakšány, P. - Ročkai, P. - Ručka, L. - Sedlák, E. - Stančík, S. - Šutor, D. - Trnavský, P. - Tuček, M. - Tvarožek, M. - Uhlík, V. - Vojnar, T. - Weinberger, F. - Winklerová, A. - Wolek, J. - Záborský, L. - Zatloukal, J. - Žbánek, V. - Balák, T. - Baník, R. - Barna, M. - Béreš, J. - Biačko, P. - Borský, J. - Bukáček, M. - Čech, R. - Čermák, K. - Černá, I. - Davidová, N. - Drkoš, T. - Dvořák, R. - Ergang, M. - Fedorko, F. - Frejlach, J. - Glos, J. - Hadar, A. - Halabala, J. - Halamka, M. - Hejčl, P. - Horák, J. - Jarošová, J. - Judiny, J. - Kamenov, D. -

In [32]:
# Find the index of the course with code 'IB111'
ib111_idx = None
for i, code in enumerate(course_codes):
    if code == 'PB111':
        ib111_idx = i
        break

# Display top 15 terms for IB111 if found
if ib111_idx is not None:
    course_name = courses[ib111_idx]['NAME']
    print(f"\nTop 15 terms for course {course_codes[ib111_idx]} - {course_name}:")
    for term, score in get_top_terms(ib111_idx, top_n=15):
        print(f"  {term}: {score:.4f}")
else:
    print("\nCourse with code 'IB111' not found in the dataset.")



Top 15 terms for course PB111 - Principles of low-level programming:
  programming: 0.2232
  memory: 0.2184
  trees: 0.1974
  allocation: 0.1898
  computational: 0.1805
  tables: 0.1670
  dynamic: 0.1635
  algorithms: 0.1588
  machine: 0.1529
  low: 0.1501
  search: 0.1317
  blocks: 0.1263
  variable: 0.1243
  block: 0.1224
  linked: 0.1203


In [34]:
# Extract top 15 keywords for each course and store in a JSON file
import json
from tqdm import tqdm

# Create a dictionary to store course codes and their top keywords
course_keywords = {}

# Process each course
print("Extracting top keywords for each course...")
for idx, code in enumerate(tqdm(course_codes)):
    # Get the top 15 terms for this course
    top_terms = get_top_terms(idx, top_n=15)

    # Store as a dictionary with term and score
    course_keywords[code] = [{"term": term, "score": float(score)} for term, score in top_terms]

# Save to JSON file
output_file = "course_top_keywords.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(course_keywords, f, indent=2, ensure_ascii=False)

print(f"Top keywords saved to {output_file}")

# Display sample of the data (first 3 courses)
print("\nSample of extracted keywords:")
sample_count = min(3, len(course_codes))
for i, code in enumerate(list(course_keywords.keys())[:sample_count]):
    print(f"\nCourse {code}:")
    for item in course_keywords[code][:5]:  # Show only top 5 for the sample
        print(f"  {item['term']}: {item['score']:.4f}")
    if len(course_keywords[code]) > 5:
        print("  ...")


Extracting top keywords for each course...


100%|██████████| 21485/21485 [00:54<00:00, 394.47it/s]


Top keywords saved to course_top_keywords.json

Sample of extracted keywords:

Course bk4001:
  renata: 0.1705
  citation: 0.1677
  hendl: 0.1526
  statistical: 0.1363
  kvalitativní: 0.1357
  ...

Course bk4003:
  sports: 0.2199
  spelling: 0.1831
  anatomy: 0.1688
  cefr: 0.1614
  sport: 0.1569
  ...

Course bk4005:
  pedagogy: 0.1748
  socialization: 0.1746
  průcha: 0.1691
  pedagogical: 0.1627
  youth: 0.1493
  ...


In [35]:
# Function to calculate similarity between courses based on shared keywords
def calculate_course_similarity(course1_code, course2_code, course_keywords, weight_by_score=True):
    if course1_code not in course_keywords or course2_code not in course_keywords:
        return 0.0

    keywords1 = {item["term"]: item["score"] for item in course_keywords[course1_code]}
    keywords2 = {item["term"]: item["score"] for item in course_keywords[course2_code]}

    shared_keywords = set(keywords1.keys()) & set(keywords2.keys())

    if not shared_keywords:
        return 0.0

    if weight_by_score:
        similarity = sum(keywords1[term] * keywords2[term] for term in shared_keywords)
        total_possible = sum(keywords1.values()) * sum(keywords2.values())
        if total_possible > 0:
            similarity = similarity / total_possible
    else:
        similarity = len(shared_keywords) / len(set(keywords1.keys()) | set(keywords2.keys()))

    return similarity

print("\nTesting course similarity function:")

test_courses = ['IB111', 'IB031', 'PB161', 'IB002', 'IV109']

print("\nSimilarity matrix for sample courses:")
print("Course Codes:", test_courses)
print("-" * 50)

# Create a similarity matrix for the test courses
for i, course1 in enumerate(test_courses):
    similarities = []
    for j, course2 in enumerate(test_courses):
        sim = calculate_course_similarity(course1, course2, course_keywords)
        similarities.append(f"{sim:.4f}")
    print(f"{course1}: {', '.join(similarities)}")

# Find the most similar course to a specific course
def find_most_similar_courses(course_code, course_keywords, top_n=5):
    if course_code not in course_keywords:
        return []

    similarities = []
    for other_code in course_keywords:
        if other_code != course_code:
            sim = calculate_course_similarity(course_code, other_code, course_keywords)
            similarities.append((other_code, sim))

    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_n]

# Test finding similar courses for a specific course
if 'IA161' in course_keywords:  # Natural Language Processing in Practice
    print("\nMost similar courses to IA161 (Natural Language Processing in Practice):")
    similar_courses = find_most_similar_courses('IA161', course_keywords)
    for code, sim in similar_courses:
        # Find the course name if available
        course_name = ""
        for idx, c_code in enumerate(course_codes):
            if c_code == code:
                course_name = courses[idx]['NAME']
                break
        print(f"  {code} - {course_name}: {sim:.4f}")



Testing course similarity function:

Similarity matrix for sample courses:
Course Codes: ['IB111', 'IB031', 'PB161', 'IB002', 'IV109']
--------------------------------------------------
IB111: 0.0732, 0.0000, 0.0096, 0.0314, 0.0047
IB031: 0.0000, 0.0697, 0.0000, 0.0073, 0.0104
PB161: 0.0096, 0.0000, 0.0688, 0.0046, 0.0000
IB002: 0.0314, 0.0073, 0.0046, 0.0709, 0.0058
IV109: 0.0047, 0.0104, 0.0000, 0.0058, 0.0707

Most similar courses to IA161 (Natural Language Processing in Practice):
  PA153 - Natural Language Processing: 0.0311
  F4500 - Python for physicists: 0.0254
  PLIN013 - Proseminar, Pt. I: 0.0250
  Z8154 - Programming in geoinformatics: 0.0245
  PA154 - Language Modeling: 0.0243
