In [179]:
import os
import json
import numpy as np
from nltk.stem import PorterStemmer
from tqdm import tqdm
from helpers import load_courses, stem_keywords, keyword_intersection

In [186]:
def load_courses(dir_path):
    files = os.listdir(dir_path)
    courses = []
    for file in files:
        with open(f'{dir_path}/{file}', 'r') as f:
            courses.append(json.load(f))
    courses = courses[0]
    ctoi = {course['CODE'].strip(): i for i, course in enumerate(courses)}
    return courses, ctoi

def stem_keywords(courses):
    ps = PorterStemmer()
    for course in tqdm(courses, total=len(courses)):
        course['KEYWORDS'] = {ps.stem(keyword.lower()) for keyword in course['KEYWORDS']}
    return courses

def keyword_intersection(courses):
    intersects = np.zeros((len(courses), len(courses)))

    for i, course in tqdm(enumerate(courses), total=len(courses)):
        for j, other_course in enumerate(courses):
            intersects[i, j] = len(set(course['KEYWORDS']) & set(other_course['KEYWORDS']))

    return intersects

In [188]:
courses, ctoi = load_courses('../data/generated')
kwd_intersects = keyword_intersection(courses)

100%|██████████| 379/379 [00:00<00:00, 2556.03it/s]


In [182]:
def find_top_courses(course_idx, matrix, n=10):
    intersection_scores = matrix[course_idx]
    course_scores = [(i, score) for i, score in enumerate(intersection_scores)]
    sorted_courses = sorted(course_scores, key=lambda x: x[1], reverse=True)

    return sorted_courses[1 : n + 1]


# Find course index with given code
course_idx = ctoi['PV021']
top_similar_courses = find_top_courses(course_idx, kwd_intersects)

print("Looking for similar courses to course", courses[course_idx]['CODE'], " - ", courses[course_idx]['NAME'])

print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


Looking for similar courses to course  PV021   -   Neural Networks 
Top 10 courses with highest keyword intersection with course 196:
1. Course  IB031  -  Introduction to Machine Learning  - Intersection score: 4
2. Course  PB016  -  Introduction to Artificial Intelligence  - Intersection score: 3
3. Course  PV115  -  Laboratory of Knowledge Discovery  - Intersection score: 3
4. Course  PA026  -  Artificial Intelligence Project  - Intersection score: 2
5. Course  PA153  -  Natural Language Processing  - Intersection score: 2
6. Course  PA228  -  Machine Learning in Image Processing  - Intersection score: 2
7. Course  PV056  -  Machine Learning and Data Mining  - Intersection score: 2
8. Course  PV061  -  Machine Translation  - Intersection score: 2
9. Course  PV211  -  Introduction to Information Retrieval  - Intersection score: 2
10. Course  PV287  -  Artificial Intelligence and Machine Learning in Healthcare  - Intersection score: 2


In [183]:
def ratings_similarity(courses):
    """
    Calculate similarity between courses based on their ratings.
    Lower score means more similar (less difference in ratings).
    """
    n_courses = len(courses)
    similarity_matrix = np.zeros((n_courses, n_courses))

    for i, course1 in tqdm(enumerate(courses), total=n_courses):
        for j, course2 in enumerate(courses):
            diff_sum = 0
            for rating_key in course1["RATINGS"]:
                if rating_key in course2["RATINGS"]:
                    rating1 = int(course1["RATINGS"][rating_key])
                    rating2 = int(course2["RATINGS"][rating_key])
                    diff_sum += abs(rating1 - rating2)

            similarity_matrix[i, j] = diff_sum

    return similarity_matrix


ratings_sim = ratings_similarity(courses)

100%|██████████| 379/379 [00:00<00:00, 1441.37it/s]


In [185]:
course_idx = ctoi['PV021']

ratings_similar_courses = find_top_courses(course_idx, -ratings_sim)

print(
    f"Searching similar courses to course {courses[course_idx]['CODE']} - {courses[course_idx]['NAME']}"
)

print("\nTop 10 courses with most similar ratings profile:")
for idx, (course_id, score) in enumerate(ratings_similar_courses):
    print(
        f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Rating difference: {int(-score)}"
    )

Searching similar courses to course  PV021  -  Neural Networks 

Top 10 courses with most similar ratings profile:
1. Course  PV079  -  Applied Cryptography  - Rating difference: 0
2. Course  PA010  -  Intermediate Computer Graphics  - Rating difference: 1
3. Course  PA013  -  Software Testing and Analysis  - Rating difference: 1
4. Course  PA166  -  Advanced Methods of Digital Image Processing  - Rating difference: 1
5. Course  PA193  -  Seminar on secure coding principles and practices  - Rating difference: 1
6. Course  IB016  -  Seminar on Functional Programming  - Rating difference: 2
7. Course  PA018  -  Advanced Topics in Information Technology Security  - Rating difference: 2
8. Course  PA164  -  Machine learning and natural language processing  - Rating difference: 2
9. Course  PA192  -  Secure hardware-based system design  - Rating difference: 2
10. Course  PA217  -  Artificial Intelligence for Computer Games  - Rating difference: 2


In [167]:
PA017_ratings = courses[ctoi['PA017']]['RATINGS']
IB114_ratings = courses[ctoi['IB114']]['RATINGS']

PA017_ratings, IB114_ratings

({'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '3',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '4',
  'project_based': '5',
  'creative': '4'},
 {'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '2',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '3',
  'project_based': '5',
  'creative': '4'})

In [168]:
def find_top_courses_multiple(idx_liked, idx_disliked, matrix, n=10):
    liked_scores = matrix[idx_liked]
    disliked_scores = matrix[idx_disliked]
    summed = liked_scores.sum(axis=0) - disliked_scores.sum(axis=0)
    course_scores = [(i, score) for i, score in enumerate(summed)]
    course_scores.sort(key=lambda x: x[1], reverse=True)
    print(course_scores)
    filtered_scores = [
        (i, score)
        for i, score in course_scores
        if i not in idx_liked and i not in idx_disliked
    ]

    return filtered_scores[:n]

In [169]:
liked = ['MV008', 'MA010', 'MA018', 'PV080', 'PB007', 'PV021', 'IB111']
disliked = ['IB000']

liked = [ctoi[code] for code in liked]
disliked = [ctoi[code] for code in disliked]

# Find course index with given code
top_similar_courses = find_top_courses_multiple(liked, disliked, kwd_intersects)

print("Looking for similar courses to:")
for course in liked:
    print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")
print("and not similar to:")
for course in disliked:
    print(f"{courses[course]['CODE']} - {courses[course]['NAME']}")

print()
print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


[(80, 16.0), (46, 15.0), (88, 15.0), (160, 15.0), (196, 15.0), (209, 15.0), (76, 13.0), (47, 7.0), (41, 6.0), (36, 5.0), (48, 5.0), (220, 5.0), (256, 5.0), (52, 4.0), (87, 4.0), (189, 4.0), (200, 4.0), (257, 4.0), (53, 3.0), (78, 3.0), (99, 3.0), (101, 3.0), (102, 3.0), (155, 3.0), (161, 3.0), (169, 3.0), (201, 3.0), (237, 3.0), (294, 3.0), (1, 2.0), (2, 2.0), (30, 2.0), (33, 2.0), (40, 2.0), (43, 2.0), (45, 2.0), (54, 2.0), (56, 2.0), (58, 2.0), (63, 2.0), (70, 2.0), (81, 2.0), (84, 2.0), (103, 2.0), (104, 2.0), (114, 2.0), (117, 2.0), (118, 2.0), (121, 2.0), (128, 2.0), (136, 2.0), (143, 2.0), (171, 2.0), (173, 2.0), (181, 2.0), (182, 2.0), (195, 2.0), (226, 2.0), (239, 2.0), (244, 2.0), (255, 2.0), (264, 2.0), (293, 2.0), (304, 2.0), (309, 2.0), (310, 2.0), (323, 2.0), (0, 1.0), (10, 1.0), (17, 1.0), (18, 1.0), (29, 1.0), (38, 1.0), (39, 1.0), (49, 1.0), (57, 1.0), (67, 1.0), (71, 1.0), (74, 1.0), (83, 1.0), (98, 1.0), (100, 1.0), (105, 1.0), (110, 1.0), (111, 1.0), (113, 1.0), (120

In [171]:
#MV008 keywords
kw = courses[ctoi['MB152']]['KEYWORDS']
kw

['calculus',
 'derivatives',
 'integrals',
 'infinite series',
 'mathematical analysis',
 'limits',
 'elementary functions',
 'riemann integral',
 'differential equations',
 'real variable functions',
 'power series',
 'applications of calculus',
 'concrete problems',
 'theoretical methods',
 'practical methods']