In [1]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import json

from scripts.helpers import load_courses

In [None]:
courses = load_courses('./data/generated/')[0]
len(courses)

21106

In [3]:

# Extract text data from courses for TF-IDF processing
# We'll combine relevant text fields from each course to create a corpus
corpus = []
course_codes = []

for course in courses:
    # Combine relevant text fields into a single document
    # document = f"{course['NAME']} {course['SYLLABUS']} {course['OBJECTIVES']} {course['LEARNING_OUTCOMES']} {course['DESCRIPTION']}"
    document = json.dumps(course, ensure_ascii=False)

    # Add keywords if available
    if 'KEYWORDS' in course and course['KEYWORDS']:
        document += " " + " ".join(course['KEYWORDS'])

    corpus.append(document)
    course_codes.append(course['CODE'])

# Initialize and fit the TF-IDF vectorizer
# max_features limits the vocabulary size to the most important terms
# stop_words removes common English words that don't carry much meaning
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    lowercase=True,
    norm='l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True  # Apply sublinear tf scaling (1 + log(tf))
)

# Transform the corpus into TF-IDF features
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Get feature names (terms)
feature_names = tfidf_vectorizer.get_feature_names_out()

print(f"Number of features (terms): {len(feature_names)}")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")


Number of features (terms): 5000
TF-IDF matrix shape: (21106, 5000)


In [4]:
# Example: Get the top 5 terms for the first course
def get_top_terms(doc_idx, top_n=5):
    feature_index = tfidf_matrix[doc_idx].nonzero()[1]
    tfidf_scores = zip(feature_index, [tfidf_matrix[doc_idx, x] for x in feature_index])
    sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    return [(feature_names[idx], score) for idx, score in sorted_scores[:top_n]]

# Display top terms for the first course
if len(courses) > 0:
    print(f"\nTop terms for course {course_codes[0]} - {courses[0]['NAME']}:")
    for term, score in get_top_terms(0):
        print(f"  {term}: {score:.4f}")


Top terms for course A_SPT - English for court translators and interpreters:
  court: 0.4272
  translation: 0.3135
  english: 0.3018
  legal: 0.2935
  judicial: 0.2780


In [5]:
code = "IB111"

for course in courses:
    if course['CODE'] == code:
        print(course)
        break


{'CODE': 'IB111', 'FACULTY': 'FI', 'NAME': 'Foundations of Programming', 'LANGUAGE': 'čeština', 'SEMESTER': 'podzim 2024', 'CREDITS': '5', 'DEPARTMENT': 'KPSK', 'TEACHERS': 'Beneš, N. - Bartek, F. - Bednařík, K. - Borošová, K. - Brdečko, V. - Bukor, O. - Burget, J. - Čepela, S. - Focko, M. - Foltýnek, T. - Glosner, R. - Jedelský, J. - Juračková, N. - Kasprzaková, I. - Lukačovič, B. - Marek, T. - Melkovič, D. - Pastva, S. - Patlevič, M. - Rakšány, P. - Ročkai, P. - Ručka, L. - Sedlák, E. - Stančík, S. - Šutor, D. - Trnavský, P. - Tuček, M. - Tvarožek, M. - Uhlík, V. - Vojnar, T. - Weinberger, F. - Winklerová, A. - Wolek, J. - Záborský, L. - Zatloukal, J. - Žbánek, V. - Balák, T. - Baník, R. - Barna, M. - Béreš, J. - Biačko, P. - Borský, J. - Bukáček, M. - Čech, R. - Čermák, K. - Černá, I. - Davidová, N. - Drkoš, T. - Dvořák, R. - Ergang, M. - Fedorko, F. - Frejlach, J. - Glos, J. - Hadar, A. - Halabala, J. - Halamka, M. - Hejčl, P. - Horák, J. - Jarošová, J. - Judiny, J. - Kamenov, D. -

In [6]:
# Find the index of the course with code 'IB111'
ib111_idx = None
for i, code in enumerate(course_codes):
    if code == 'PB111':
        ib111_idx = i
        break

# Display top 15 terms for IB111 if found
if ib111_idx is not None:
    course_name = courses[ib111_idx]['NAME']
    print(f"\nTop 15 terms for course {course_codes[ib111_idx]} - {course_name}:")
    for term, score in get_top_terms(ib111_idx, top_n=15):
        print(f"  {term}: {score:.4f}")
else:
    print("\nCourse with code 'IB111' not found in the dataset.")



Top 15 terms for course PB111 - Principles of low-level programming:
  programming: 0.2384
  memory: 0.2221
  allocation: 0.1924
  computational: 0.1864
  tables: 0.1685
  algorithms: 0.1678
  dynamic: 0.1662
  machine: 0.1586
  low: 0.1518
  search: 0.1326
  blocks: 0.1265
  variable: 0.1249
  block: 0.1229
  linked: 0.1206
  computer: 0.1198


In [7]:
len(course_codes)

21106

In [8]:
# Extract top 15 keywords for each course and store in a JSON file
import json
from tqdm import tqdm

# Create a dictionary to store course codes and their top keywords
course_keywords = {}

# Process each course
print("Extracting top keywords for each course...")
for idx, code in enumerate(tqdm(course_codes)):
    # Get the top 15 terms for this course
    top_terms = get_top_terms(idx, top_n=15)

    # Store as a dictionary with term and score
    item = [{"term": term, "score": float(score)} for term, score in top_terms]
    if code not in course_keywords:
        course_keywords[code] = item
    else:
        course_keywords[f"{code}_{i}"] = item

# Save to JSON file
output_file = "course_top_keywords.json"
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(course_keywords, f, indent=2, ensure_ascii=False)

print(f"Top keywords saved to {output_file}")

# Display sample of the data (first 3 courses)
print("\nSample of extracted keywords:")
sample_count = min(3, len(course_codes))
for i, code in enumerate(list(course_keywords.keys())[:sample_count]):
    print(f"\nCourse {code}:")
    for item in course_keywords[code][:5]:  # Show only top 5 for the sample
        print(f"  {item['term']}: {item['score']:.4f}")
    if len(course_keywords[code]) > 5:
        print("  ...")


Extracting top keywords for each course...


  0%|          | 0/21106 [00:00<?, ?it/s]

100%|██████████| 21106/21106 [02:37<00:00, 134.11it/s]


Top keywords saved to course_top_keywords.json

Sample of extracted keywords:

Course A_SPT:
  court: 0.4272
  translation: 0.3135
  english: 0.3018
  legal: 0.2935
  judicial: 0.2780
  ...

Course AUT_TM1:
  habits: 0.3458
  organizational: 0.2883
  autumn: 0.2829
  time: 0.2750
  management: 0.2573
  ...

Course BELONG:
  ma: 0.2808
  phd: 0.2513
  video: 0.2224
  soft: 0.1869
  career: 0.1827
  ...


In [9]:
# Function to calculate similarity between courses based on shared keywords
def calculate_course_similarity(course1_code, course2_code, course_keywords, weight_by_score=True):
    if course1_code not in course_keywords or course2_code not in course_keywords:
        return 0.0

    keywords1 = {item["term"]: item["score"] for item in course_keywords[course1_code]}
    keywords2 = {item["term"]: item["score"] for item in course_keywords[course2_code]}

    shared_keywords = set(keywords1.keys()) & set(keywords2.keys())

    if not shared_keywords:
        return 0.0

    if weight_by_score:
        similarity = sum(keywords1[term] * keywords2[term] for term in shared_keywords)
        total_possible = sum(keywords1.values()) * sum(keywords2.values())
        if total_possible > 0:
            similarity = similarity / total_possible
    else:
        similarity = len(shared_keywords) / len(set(keywords1.keys()) | set(keywords2.keys()))

    return similarity

def find_most_similar_courses(course_code, course_keywords, top_n=5):
    if course_code not in course_keywords:
        return []

    similarities = []
    for other_code in course_keywords:
        if other_code != course_code:
            sim = calculate_course_similarity(course_code, other_code, course_keywords)
            similarities.append((other_code, sim))

    # Sort by similarity score in descending order
    similarities.sort(key=lambda x: x[1], reverse=True)

    return similarities[:top_n]

query = 'IB109'
if query in course_keywords:  # Natural Language Processing in Practice
    print(f"Most similar courses to {query}:")
    similar_courses = find_most_similar_courses(query, course_keywords, top_n=10)
    for code, sim in similar_courses:
        course_name = ""
        for idx, c_code in enumerate(course_codes):
            if c_code == code:
                course_name = courses[idx]['NAME']
                break
        print(f"  {code} - {course_name}: {sim:.4f}")


Most similar courses to IB109:
  PV197 - GPU Programming: 0.0360
  PA039 - Supercomputer Architecture and Intensive Computations: 0.0357
  IV100 - Parallel and distributed computations: 0.0310
  IV003 - Algorithms and Data Structures II: 0.0285
  C2143 - Design of algorithms in life sciences - seminary: 0.0275
  MA015 - Graph Algorithms: 0.0266
  IB002 - Algorithms and data structures I: 0.0254
  C2142 - Design of algorithms in life sciences: 0.0251
  IB114 - Introduction to Programming and Algorithms II: 0.0247
  PV281 - Programming in Rust: 0.0244


In [10]:
len(course_keywords)

21106

In [36]:
from scipy import sparse
import os
import pickle

import scripts.helpers as helpers
helpers.add_backend_to_path()

from app.courses import CourseClient

courseClient = CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))

def create_intersection_matrix(course_keywords):
    course_codes_list = list(course_keywords.keys())
    course_indices = {code: idx for idx, code in enumerate(course_codes_list)}
    n_courses = len(course_codes_list)

    intersection_matrix = np.zeros((n_courses, n_courses), dtype=np.uint8)

    print("Building intersection matrix...")
    keyword_sets = {}
    for code, keywords in course_keywords.items():
        keywords = [item["term"] for item in keywords]
        keyword_sets[code] = set(keywords)

    # keyword_sets = {code: set(keywords) for code, keywords in course_keywords.items()}

    # Create a mapping from ID to course code to avoid repeated lookups
    id_to_code = []
    for id_val in range(len(courseClient.id_df)):
        try:
            idx = courseClient.id_df.loc[id_val, 'index']
            code = courseClient.df.iloc[idx]['CODE']
            id_to_code.append((id_val, code))
        except (KeyError, TypeError):
            continue
    sorted_id_code = sorted(id_to_code, key=lambda x: x[0])
    id_to_code = {id: code for id, code in sorted_id_code}

    for id1 in tqdm(range(n_courses), total=n_courses, desc="Calculating intersections"):
        if id1 not in id_to_code:
            continue
            
        code1 = id_to_code[id1]
        if code1 not in keyword_sets:
            code1 = code1.split(" ")[0]
            if code1 not in keyword_sets:
                continue

        keywords1_set = keyword_sets[code1]
        intersection_matrix[id1, id1] = len(keywords1_set)

        for id2 in range(id1 + 1, n_courses):
            if id2 not in id_to_code:
                continue
                
            code2 = id_to_code[id2]
            if code2 not in keyword_sets:
                code2 = code2.split(" ")[0]
                if code2 not in keyword_sets:
                    continue
                    
            keywords2_set = keyword_sets[code2]

            intersection_count = len(keywords1_set.intersection(keywords2_set))

            intersection_matrix[id1, id2] = intersection_count
            intersection_matrix[id2, id1] = intersection_count

    return intersection_matrix, course_indices

intersection_matrix_u8, course_indices = create_intersection_matrix(course_keywords)
print("Converting to sparse uint8 matrix...")
sparse_intersection_matrix_u8 = sparse.csr_matrix(intersection_matrix_u8)

os.makedirs('assets', exist_ok=True)

matrix_filename = os.path.join('..','web','backend','assets','intersects_tfidf.npz')
print(f"Saving sparse uint8 intersection matrix to {matrix_filename}...")
sparse.save_npz(matrix_filename, sparse_intersection_matrix_u8)

indices_reverse = {idx: code for code, idx in course_indices.items()}
indices_filename = '../web/backend/assets/intersection_course_indices.pkl'
print(f"Saving course indices mapping to {indices_filename}...")
with open(indices_filename, 'wb') as f:
    pickle.dump(course_indices, f)


print("Intersection matrix and indices saved.")


Unnamed: 0_level_0,CODE,FACULTY,NAME_EN,LANGUAGE,SEMESTER,CREDITS,DEPARTMENT,TEACHERS,COMPLETION,PREREQUISITES,...,LITERATURE,STUDENTS_ENROLLED,STUDENTS_PASSED,AVERAGE_GRADE,FOLLOWUP_COURSES,KEYWORDS,DESCRIPTION,RATINGS,ID,NAME
CODE,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A_SPT,A_SPT,CST,English for court translators and interpreters,čeština,jaro 2025,,,"Klabal, O. - Fillinger, V. - Frecerová, K. - H...",z,,...,,8,8,-,,,Enhance your English skills specifically for c...,,0,Angličtina pro soudní překladatele a tlumočníky
AUT_TM1,AUT_TM1,CST,Introduction to scheduling and time management...,čeština,podzim 2024,1.0,Teiresiás,"Lázničková, A. - Žampachová, H. - Oulehlová, I...",z,předp. SOUHLAS,...,,26,24,-,,,Learn practical scheduling and time management...,,1,Základy plánování a organizace času pro stude...
BELONG,BELONG,CST,Professional Writing and Communication with Em...,angličtina,jaro 2025,3.0,,"Beneš, J. - Jamie Hoversen - Linda M. Steyne -...",z,předp. souhlas,...,,6,0,-,,,Develop essential communication skills for car...,,2,Professional Writing and Communication with E...
CJV_AJ_ESL,CJV_AJ_ESL,CST,English Speaking and Listening,angličtina,jaro 2025,2.0,,"Allan, B. - Holasová, M.",z,,...,TUNI's Moodle system (https://moodle.tuni.fi/)...,13,0,-,,,Improve your English speaking and listening sk...,,3,English Speaking and Listening
CJV_A_KPed,CJV_A_KPed,CST,English for Department of Pedagogy,čeština,podzim 2024,,,"Malášková, M. - Frecerová, K. - Holasová, M.",z,,...,New English File Upper-Intermediate,4,0,-,,,Improve your English! This course focuses on u...,,4,Angličtina pro KPed PdF


Building intersection matrix...


Calculating intersections:   6%|▌         | 1194/21106 [00:27<07:32, 43.96it/s]


KeyboardInterrupt: 

In [12]:
indices_reverse = {idx: code for code, idx in course_indices.items()}
indices_filename = '../web/backend/assets/course_indices_tfidf.pkl'
print(f"Saving course indices mapping to {indices_filename}...")
with open(indices_filename, 'wb') as f:
    pickle.dump(indices_reverse, f)

Saving course indices mapping to ../web/backend/assets/course_indices_tfidf.pkl...


In [13]:
course_indices = pickle.load(open("../web/backend/assets/course_indices_tfidf.pkl", "rb"))


In [14]:
import scipy.sparse as sp
from typing import List, Tuple

# Load the sparse similarity matrix
print("Loading the sparse similarity matrix...")
sparse_similarity_matrix = sparse.load_npz('../web/backend/assets/intersects_tfidf.npz')

# course_indices = pickle.load(open(indices_filename, 'rb'))

similarity_matrix = sparse_similarity_matrix.toarray()

def find_top_courses_multiple(idx_liked: List[int], idx_disliked: List[int], matrix: sp.csr_matrix, m: int) -> List[Tuple[int, float]]:
    matrix = matrix.toarray().astype(np.float32)
    liked_scores = matrix[idx_liked]
    disliked_scores = matrix[idx_disliked]

    summed = liked_scores.sum(axis=0) - disliked_scores.sum(axis=0)

    course_scores = [(i, score) for i, score in enumerate(summed)]
    course_scores.sort(key=lambda x: x[1], reverse=True)

    return course_scores[:m]

liked_codes = ['IB111', 'IB109']
disliked_codes = ['IB000']

ctoi = {code: idx for idx, code in enumerate(course_codes)}

liked_ids = [ctoi[code] for code in liked_codes]
disliked_ids = [ctoi[code] for code in disliked_codes]

n = 15

top_course_ids = find_top_courses_multiple(liked_ids, disliked_ids, sparse_similarity_matrix, n)
res = []
for idx, score  in top_course_ids:
    if idx in liked_ids or idx in disliked_ids:
        continue
    course = courses[idx]
    res.append((course['CODE'], course['NAME'], score))

# Print the results
print(f"Top {n} most similar courses to {liked_codes}:")
for code, name, score in res:
    print(f"{code}  {name}: Similarity score = {score:.4f}")


Loading the sparse similarity matrix...
Top 15 most similar courses to ['IB111', 'IB109']:
IB113  Introduction to Programming and Algorithms: Similarity score = 9.0000
IB114  Introduction to Programming and Algorithms II: Similarity score = 9.0000
IV003  Algorithms and Data Structures II: Similarity score = 9.0000
IA101  Algorithmics for Hard Problems: Similarity score = 8.0000
PA039  Supercomputer Architecture and Intensive Computations: Similarity score = 8.0000
PV197  GPU Programming: Similarity score = 8.0000
C2142  Design of algorithms in life sciences: Similarity score = 8.0000
C2143  Design of algorithms in life sciences - seminary: Similarity score = 8.0000
ISKM72  Basics of Algorithmic Thinking: Similarity score = 7.0000
IB002  Algorithms and data structures I: Similarity score = 7.0000
TI2011  Didactics - Electrical and Electronics: Similarity score = 7.0000
ISKM80  Python for non-programmers: Similarity score = 6.0000
PLIN081  Advanced machine learning methods: Similarity sc

In [15]:
import scipy.sparse as sp
from typing import List, Tuple
import numpy as np
import pickle


loaded_sparse_matrix = sp.load_npz("../web/backend/assets/intersects_tfidf.npz")
kwd_intersects = loaded_sparse_matrix.toarray().astype(np.float32)
course_indices = pickle.load(open("../web/backend/assets/course_indices_tfidf.pkl", "rb"))

def find_top_courses(
    idx_liked: List[int], idx_disliked: List[int], matrix: sp.csr_matrix
) -> List[Tuple[int, float]]:
    liked_scores = matrix[idx_liked]
    disliked_scores = matrix[idx_disliked]

    summed = liked_scores.sum(axis=0) - disliked_scores.sum(axis=0)

    course_scores = [(i, score) for i, score in enumerate(summed)]
    course_scores.sort(key=lambda x: x[1], reverse=True)

    return course_scores


def recommend_courses_keywords_tfidf(
    liked_ids: List[str],
    disliked_ids: List[str],
    skipped_ids: List[str],
    n: int,
):
    top_courses = find_top_courses(liked_ids, disliked_ids, kwd_intersects)

    res = []
    for idx, _ in top_courses:
        if idx in liked_ids or idx in disliked_ids or idx in skipped_ids:
            continue
        # course = courseClient.get_course_by_id(idx)
        course_code = course_indices[idx]
        course = course_code
        if course is not None:
            res.append(course)
        if len(res) == n:
            break

    return res

liked_codes = ['IB111', 'IB109']
disliked_codes = ['IB000']

ctoi = {code: idx for idx, code in enumerate(course_codes)}

liked_ids = [ctoi[code] for code in liked_codes]
disliked_ids = [ctoi[code] for code in disliked_codes]
skipped_ids = []

rec_codes = recommend_courses_keywords_tfidf(liked_ids, disliked_ids, skipped_ids, n)

for code in rec_codes:
    print(code)


IB113
IB114
IV003
IA101
PA039
PV197
C2142
C2143
ISKM72
IB002
TI2011
ISKM80
PLIN081
IA012
IV104
