In [1]:
import numpy as np
from tqdm import tqdm
from scripts.helpers import load_courses, stem_keywords, keyword_intersection

In [4]:
courses, ctoi = load_courses('../data/generated')

In [5]:
# kwd_intersects = keyword_intersection(courses)

In [6]:
# import scipy.sparse as sp

# sparse = sp.csr_matrix(kwd_intersects)
# sp.save_npz("../data/intersects/intersects_sparse.npz", sparse)

In [7]:
import scipy.sparse as sp

loaded_sparse_matrix = sp.load_npz("../data/intersects/intersects_sparse.npz")

kwd_intersects = loaded_sparse_matrix.toarray()

In [8]:
kwd_intersects


array([[15.,  0.,  1., ...,  0.,  0.,  0.],
       [ 0., 15.,  0., ...,  0.,  0.,  0.],
       [ 1.,  0., 15., ...,  0.,  0.,  0.],
       ...,
       [ 0.,  0.,  0., ..., 15.,  0.,  2.],
       [ 0.,  0.,  0., ...,  0., 15.,  0.],
       [ 0.,  0.,  0., ...,  2.,  0., 15.]])

In [9]:
stemmed = stem_keywords(courses)
courses[0]

100%|██████████| 21106/21106 [00:10<00:00, 1977.47it/s]


{'CODE': ' A_SPT ',
 'FACULTY': ' CST ',
 'NAME': ' English for court translators and interpreters ',
 'LANGUAGE': ' čeština ',
 'SEMESTER': ' jaro 2025 ',
 'CREDITS': ' ',
 'DEPARTMENT': ' ',
 'TEACHERS': ' Klabal, O. - Fillinger, V. - Frecerová, K. - Holasová, M. ',
 'COMPLETION': ' z ',
 'PREREQUISITES': ' ',
 'FIELDS_OF_STUDY': ' ASTP ',
 'TYPE_OF_STUDY': ' celoživotní ',
 'LECTURES_SEMINARS_HOMEWORK': ' 0/80/0 ',
 'SYLLABUS': ' ',
 'OBJECTIVES': ' ',
 'TEXT_PREREQUISITS': ' ',
 'ASSESMENT_METHODS': ' ',
 'TEACHING_METHODS': ' ',
 'TEACHER_INFO': ' ',
 'LEARNING_OUTCOMES': ' ',
 'LITERATURE': ' ',
 'STUDENTS_ENROLLED': ' 8 ',
 'STUDENTS_PASSED': ' 8 ',
 'AVERAGE_GRADE': ' - ',
 'FOLLOWUP_COURSES': ' ',
 'KEYWORDS': ['english',
  'court translators',
  'interpreters',
  'legal english',
  'translation',
  'interpretation',
  'legal settings',
  'linguistic abilities',
  'court interpretation',
  'legal translation',
  'terminology',
  'judicial system',
  'communication skills',
  '

In [10]:
def find_top_courses(course_idx, matrix, n=10):
    intersection_scores = matrix[course_idx]
    course_scores = [(i, score) for i, score in enumerate(intersection_scores)]
    sorted_courses = sorted(course_scores, key=lambda x: x[1], reverse=True)

    return sorted_courses[1 : n + 1]


# Find course index with given code
course_idx = ctoi['IV109']
top_similar_courses = find_top_courses(course_idx, kwd_intersects)

print("Looking for similar courses to course", courses[course_idx]['CODE'], " - ", courses[course_idx]['NAME'])

print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


Looking for similar courses to course IV109  -  Modeling and Simulation
Top 10 courses with highest keyword intersection with course 7447:
1. Course  BSSn4489  -  Models and simulations of complex systems  - Intersection score: 4
2. Course  CORE114  -  Biological Mathematics  - Intersection score: 4
3. Course  ENSb1315  -  Network analysis: social, ecological, and social-ecological approaches  - Intersection score: 3
4. Course  MPE_MATL  -  MATLAB  - Intersection score: 2
5. Course  MPE_ZMAT  -  Basics of MATLAB  - Intersection score: 2
6. Course  MPH_ACMS  -  Corporate Management System  - Intersection score: 2
7. Course  MPH_SYRP  -  Corporate Management Systems  - Intersection score: 2
8. Course  MPR_MVRR  -  Research methods in regional development  - Intersection score: 2
9. Course  MPR_QQRM  -  Qualitative and Quantitative Research Methods  - Intersection score: 2
10. Course  DESB14  -  Sensitivity to the important issues of today  - Intersection score: 2


In [11]:
# def ratings_similarity(courses):
#     """
#     Calculate similarity between courses based on their ratings.
#     Lower score means more similar (less difference in ratings).
#     """
#     n_courses = len(courses)
#     similarity_matrix = np.zeros((n_courses, n_courses))

#     for i, course1 in tqdm(enumerate(courses), total=n_courses):
#         for j, course2 in enumerate(courses):
#             diff_sum = 0
#             for rating_key in course1["RATINGS"]:
#                 if rating_key in course2["RATINGS"]:
#                     rating1 = int(course1["RATINGS"][rating_key])
#                     rating2 = int(course2["RATINGS"][rating_key])
#                     diff_sum += abs(rating1 - rating2)

#             similarity_matrix[i, j] = diff_sum

#     return similarity_matrix


# ratings_sim = ratings_similarity(courses)

In [12]:
# course_idx = ctoi['PV021']

# ratings_similar_courses = find_top_courses(course_idx, -ratings_sim)

# print(
#     f"Searching similar courses to course {courses[course_idx]['CODE']} - {courses[course_idx]['NAME']}"
# )

# print("\nTop 10 courses with most similar ratings profile:")
# for idx, (course_id, score) in enumerate(ratings_similar_courses):
#     print(
#         f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Rating difference: {int(-score)}"
#     )

In [13]:
# PA017_ratings = courses[ctoi['PA017']]['RATINGS']
# IB114_ratings = courses[ctoi['IB114']]['RATINGS']

# PA017_ratings, IB114_ratings

In [14]:
def find_top_courses_multiple(idx_liked, idx_disliked, matrix, n=10):
    liked_scores = matrix[idx_liked]
    disliked_scores = matrix[idx_disliked]
    summed = liked_scores.sum(axis=0) - disliked_scores.sum(axis=0)
    course_scores = [(i, score) for i, score in enumerate(summed)]
    course_scores.sort(key=lambda x: x[1], reverse=True)
    filtered_scores = [(i, score) for i, score in course_scores]

    return filtered_scores[:n]

In [16]:
liked_codes = ['PB161']
disliked_codes = ['PB152']

liked = [ctoi[code] for code in liked_codes]
disliked = [ctoi[code] for code in disliked_codes]

# Find course index with given code
top_similar_courses = find_top_courses_multiple(liked, disliked, kwd_intersects, n=15)

# print("Looking for similar courses to:")
# for course in liked:
#     print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")
# print("and not similar to:")
# for course in disliked:
#     print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")
# print()

for idx, (course_id, score) in enumerate(top_similar_courses):
    if courses[course_id]['CODE'].strip() not in liked_codes and courses[course_id]['CODE'].strip() not in disliked_codes:
        print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


2. Course PV294 - Advanced C++ - Intersection score: 3
3. Course  C3220  -  Advanced C Programming for Chemists  - Intersection score: 3
4. Course PB006 - Principles of Programming Languages and OOP - Intersection score: 2
5. Course PB111 - Principles of low-level programming - Intersection score: 2
6. Course  ZPIT01  -  Efficient usage of learning and information resources for visually impaired  - Intersection score: 1
7. Course  BPR_MGCR  -  Tourism management  - Intersection score: 1
8. Course  MPE_MATL  -  MATLAB  - Intersection score: 1
9. Course  AI001  -  Elements of AI  - Intersection score: 1
10. Course  ISKM72  -  Basics of Algorithmic Thinking  - Intersection score: 1
11. Course  JSFF_Blok  -  Blokace poslucháren pro JŠ FF MU  - Intersection score: 1
12. Course  PG_Kombi  -  timetable  - Intersection score: 1
13. Course  PHV444en  -   - Intersection score: 1
14. Course  PLIN041  -  History of Computational Linguistics  - Intersection score: 1
15. Course  TIM_BK_015  -  Bache

In [45]:
#MB152 keywords
kw = courses[ctoi['MB152']]['KEYWORDS']
kw

['calculus',
 'derivatives',
 'integrals',
 'infinite series',
 'mathematical analysis',
 'limits',
 'elementary functions',
 'riemann integral',
 'differential equations',
 'real variable functions',
 'power series',
 'applications of calculus',
 'concrete problems',
 'theoretical methods',
 'practical methods']