In [4]:
import numpy as np
from tqdm import tqdm
from helpers import load_courses, stem_keywords, keyword_intersection

In [5]:
courses, ctoi = load_courses('../data/generated')
kwd_intersects = keyword_intersection(courses)

  0%|          | 0/379 [00:00<?, ?it/s]

100%|██████████| 379/379 [00:00<00:00, 2338.98it/s]


In [6]:
stemmed = stem_keywords(courses)
courses[0]

100%|██████████| 379/379 [00:00<00:00, 6882.73it/s]


{'CODE': ' CORE012 ',
 'FACULTY': ' FI ',
 'NAME': ' Information Society ',
 'LANGUAGE': ' angličtina ',
 'SEMESTER': ' podzim 2024 ',
 'CREDITS': ' 3 ',
 'DEPARTMENT': ' KTP ',
 'TEACHERS': ' Zlatuška, J. ',
 'COMPLETION': ' k ',
 'PREREQUISITES': ' předp. typ_studia ( BM ) && forma ( P ) ',
 'FIELDS_OF_STUDY': ' ',
 'TYPE_OF_STUDY': ' ',
 'LECTURES_SEMINARS_HOMEWORK': ' 2/0/0 ',
 'SYLLABUS': ' This course deals with the impact of Information Technologies on society, with the nature of computer (information) revolution, and the advent of an information society.\n    Informatics in historical perspective.\n    Computer revolution.\n    Productivity paradox.\n    The Internet and WWW.\n    Digital economy.\n    Network economy and virtual communities.\n    Organizational and company structure.\n    Organizational transformation.\n    Teleceoomunications and information infrastructure.\n    Legal aspects of an information society.\n    Ethical problems.\n    Riskc of computing technology

In [7]:
def find_top_courses(course_idx, matrix, n=10):
    intersection_scores = matrix[course_idx]
    course_scores = [(i, score) for i, score in enumerate(intersection_scores)]
    sorted_courses = sorted(course_scores, key=lambda x: x[1], reverse=True)

    return sorted_courses[1 : n + 1]


# Find course index with given code
course_idx = ctoi['PV021']
top_similar_courses = find_top_courses(course_idx, kwd_intersects)

print("Looking for similar courses to course", courses[course_idx]['CODE'], " - ", courses[course_idx]['NAME'])

print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


Looking for similar courses to course  PV021   -   Neural Networks 
Top 10 courses with highest keyword intersection with course 196:
1. Course  IB031  -  Introduction to Machine Learning  - Intersection score: 4
2. Course  PB016  -  Introduction to Artificial Intelligence  - Intersection score: 3
3. Course  PV115  -  Laboratory of Knowledge Discovery  - Intersection score: 3
4. Course  PA026  -  Artificial Intelligence Project  - Intersection score: 2
5. Course  PA153  -  Natural Language Processing  - Intersection score: 2
6. Course  PA228  -  Machine Learning in Image Processing  - Intersection score: 2
7. Course  PV056  -  Machine Learning and Data Mining  - Intersection score: 2
8. Course  PV061  -  Machine Translation  - Intersection score: 2
9. Course  PV211  -  Introduction to Information Retrieval  - Intersection score: 2
10. Course  PV287  -  Artificial Intelligence and Machine Learning in Healthcare  - Intersection score: 2


In [8]:
def ratings_similarity(courses):
    """
    Calculate similarity between courses based on their ratings.
    Lower score means more similar (less difference in ratings).
    """
    n_courses = len(courses)
    similarity_matrix = np.zeros((n_courses, n_courses))

    for i, course1 in tqdm(enumerate(courses), total=n_courses):
        for j, course2 in enumerate(courses):
            diff_sum = 0
            for rating_key in course1["RATINGS"]:
                if rating_key in course2["RATINGS"]:
                    rating1 = int(course1["RATINGS"][rating_key])
                    rating2 = int(course2["RATINGS"][rating_key])
                    diff_sum += abs(rating1 - rating2)

            similarity_matrix[i, j] = diff_sum

    return similarity_matrix


ratings_sim = ratings_similarity(courses)

100%|██████████| 379/379 [00:00<00:00, 1456.98it/s]


In [9]:
course_idx = ctoi['PV021']

ratings_similar_courses = find_top_courses(course_idx, -ratings_sim)

print(
    f"Searching similar courses to course {courses[course_idx]['CODE']} - {courses[course_idx]['NAME']}"
)

print("\nTop 10 courses with most similar ratings profile:")
for idx, (course_id, score) in enumerate(ratings_similar_courses):
    print(
        f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Rating difference: {int(-score)}"
    )

Searching similar courses to course  PV021  -  Neural Networks 

Top 10 courses with most similar ratings profile:
1. Course  PV079  -  Applied Cryptography  - Rating difference: 0
2. Course  PA010  -  Intermediate Computer Graphics  - Rating difference: 1
3. Course  PA013  -  Software Testing and Analysis  - Rating difference: 1
4. Course  PA166  -  Advanced Methods of Digital Image Processing  - Rating difference: 1
5. Course  PA193  -  Seminar on secure coding principles and practices  - Rating difference: 1
6. Course  IB016  -  Seminar on Functional Programming  - Rating difference: 2
7. Course  PA018  -  Advanced Topics in Information Technology Security  - Rating difference: 2
8. Course  PA164  -  Machine learning and natural language processing  - Rating difference: 2
9. Course  PA192  -  Secure hardware-based system design  - Rating difference: 2
10. Course  PA217  -  Artificial Intelligence for Computer Games  - Rating difference: 2


In [10]:
PA017_ratings = courses[ctoi['PA017']]['RATINGS']
IB114_ratings = courses[ctoi['IB114']]['RATINGS']

PA017_ratings, IB114_ratings

({'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '3',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '4',
  'project_based': '5',
  'creative': '4'},
 {'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '2',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '3',
  'project_based': '5',
  'creative': '4'})

In [11]:
def find_top_courses_multiple(idx_liked, idx_disliked, matrix, n=10):
    liked_scores = matrix[idx_liked]
    disliked_scores = matrix[idx_disliked]
    summed = liked_scores.sum(axis=0) - disliked_scores.sum(axis=0)
    course_scores = [(i, score) for i, score in enumerate(summed)]
    course_scores.sort(key=lambda x: x[1], reverse=True)
    print(course_scores)
    filtered_scores = [
        (i, score)
        for i, score in course_scores
        if i not in idx_liked and i not in idx_disliked
    ]

    return filtered_scores[:n]

In [17]:
liked = ['PV021']
disliked = ['PB007']

liked = [ctoi[code] for code in liked]
disliked = [ctoi[code] for code in disliked]

# Find course index with given code
top_similar_courses = find_top_courses_multiple(liked, disliked, kwd_intersects)

print("Looking for similar courses to:")
for course in liked:
    print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")
print("and not similar to:")
for course in disliked:
    print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")

print()
print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


[(196, 15.0), (41, 4.0), (162, 3.0), (220, 3.0), (103, 2.0), (117, 2.0), (155, 2.0), (200, 2.0), (201, 2.0), (256, 2.0), (304, 2.0), (54, 1.0), (57, 1.0), (67, 1.0), (70, 1.0), (84, 1.0), (123, 1.0), (257, 1.0), (266, 1.0), (279, 1.0), (290, 1.0), (0, 0.0), (2, 0.0), (3, 0.0), (4, 0.0), (5, 0.0), (6, 0.0), (7, 0.0), (8, 0.0), (9, 0.0), (10, 0.0), (11, 0.0), (12, 0.0), (13, 0.0), (14, 0.0), (15, 0.0), (16, 0.0), (17, 0.0), (18, 0.0), (19, 0.0), (20, 0.0), (21, 0.0), (22, 0.0), (23, 0.0), (24, 0.0), (25, 0.0), (26, 0.0), (27, 0.0), (28, 0.0), (29, 0.0), (30, 0.0), (31, 0.0), (32, 0.0), (33, 0.0), (34, 0.0), (35, 0.0), (36, 0.0), (37, 0.0), (38, 0.0), (39, 0.0), (40, 0.0), (42, 0.0), (43, 0.0), (44, 0.0), (45, 0.0), (46, 0.0), (47, 0.0), (48, 0.0), (49, 0.0), (50, 0.0), (51, 0.0), (52, 0.0), (53, 0.0), (55, 0.0), (56, 0.0), (58, 0.0), (59, 0.0), (60, 0.0), (61, 0.0), (62, 0.0), (64, 0.0), (65, 0.0), (66, 0.0), (68, 0.0), (69, 0.0), (71, 0.0), (72, 0.0), (73, 0.0), (74, 0.0), (75, 0.0), (7

In [16]:
#MV008 keywords
kw = courses[ctoi['MB152']]['KEYWORDS']
kw

['calculus',
 'derivatives',
 'integrals',
 'infinite series',
 'mathematical analysis',
 'limits',
 'elementary functions',
 'riemann integral',
 'differential equations',
 'real variable functions',
 'power series',
 'applications of calculus',
 'concrete problems',
 'theoretical methods',
 'practical methods']