In [63]:
import os
import json
import numpy as np
from nltk.stem import PorterStemmer
from tqdm import tqdm

In [64]:
# Directory that holds the generated course JSON files.
DATA_DIR = '../data/generated'

# os.listdir() returns entries in arbitrary order; sort them so that the file
# selected by `courses[0]` below is the same on every run.
files = sorted(os.listdir(DATA_DIR))
courses = []
for file in files:
    # Read as UTF-8 explicitly instead of relying on the platform default encoding.
    with open(f'{DATA_DIR}/{file}', 'r', encoding='utf-8') as f:
        courses.append(json.load(f))

# Only the payload of the first file is kept; it is treated below as a
# sequence of course dicts that each carry a 'CODE' entry.
courses = courses[0]
# Map each whitespace-stripped course code to its position in `courses`.
ctoi = {course['CODE'].strip(): i for i, course in enumerate(courses)}

def stem_keywords(courses):
    """
    Attach a set of stemmed keywords to every course.

    Each entry of ``course['KEYWORDS']`` is stripped of surrounding
    whitespace, lower-cased and passed through a Porter stemmer; the
    resulting set is stored under ``course['STEMMED_KEYWORDS']``. Keywords
    that are empty after stripping are skipped.

    The course dicts are modified in place, and the same ``courses`` object
    is returned so the call can be used in an assignment.
    """
    ps = PorterStemmer()
    for course in tqdm(courses, total=len(courses)):
        # Strip before stemming: a keyword padded with spaces would otherwise
        # count as a different keyword from its unpadded form when the sets
        # are intersected later.
        cleaned = (keyword.strip().lower() for keyword in course['KEYWORDS'])
        course['STEMMED_KEYWORDS'] = {ps.stem(keyword) for keyword in cleaned if keyword}
    return courses

# Adds a 'STEMMED_KEYWORDS' set to every course dict; the dicts are updated in
# place and the same list is rebound to `courses`.
courses = stem_keywords(courses)

100%|██████████| 379/379 [00:00<00:00, 7371.67it/s]


In [65]:
def keyword_intersection(courses):
    """
    Count shared stemmed keywords for every pair of courses.

    Returns a square float array in which entry ``[i, j]`` is the size of the
    intersection of ``courses[i]['STEMMED_KEYWORDS']`` and
    ``courses[j]['STEMMED_KEYWORDS']``. The diagonal holds each course's own
    keyword count.
    """
    n_courses = len(courses)
    overlap = np.zeros((n_courses, n_courses))

    for row, current in tqdm(enumerate(courses), total=n_courses):
        current_keywords = current['STEMMED_KEYWORDS']
        # Fill one whole row at a time.
        overlap[row, :] = [
            len(current_keywords & other['STEMMED_KEYWORDS']) for other in courses
        ]

    return overlap

# Square matrix: entry [i, j] is the number of stemmed keywords shared by
# courses i and j.
kwd_intersects = keyword_intersection(courses)

100%|██████████| 379/379 [00:00<00:00, 5571.32it/s]


In [66]:
def find_top_courses(course_idx, matrix, n=10):
    """
    Return the ``n`` best-scoring courses for a query course, never the query itself.

    Parameters
    ----------
    course_idx : int
        Row of ``matrix`` that belongs to the query course.
    matrix : 2-D indexable (e.g. a square numpy array)
        ``matrix[course_idx][i]`` is the score of course ``i`` against the
        query course; a higher score ranks earlier.
    n : int, default 10
        Maximum number of results to return.

    Returns
    -------
    list of ``(index, score)`` tuples, sorted by descending score. Courses
    with equal scores keep their index order (the sort is stable).
    """
    scores = matrix[course_idx]
    # Normalise a negative row index so it can be compared with column indices.
    if course_idx < 0:
        course_idx += len(matrix)

    # Exclude the query course explicitly. Dropping "whatever sorted first"
    # is wrong when another course ties with the query's self-score: the tied
    # course would be discarded and the query returned as its own match.
    ranked = sorted(
        ((i, score) for i, score in enumerate(scores) if i != course_idx),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]


# Find course index with given code. Looking it up through `ctoi` keeps the
# cell valid if the order of `courses` changes (PA017 was at index 101 in the
# recorded run).
course_idx = ctoi['PA017']
top_similar_courses = find_top_courses(course_idx, kwd_intersects)

print("Looking for similar courses to course", courses[course_idx]['CODE'], " - ", courses[course_idx]['NAME'])

print(f"Top 10 courses with highest keyword intersection with course {course_idx}:")
for idx, (course_id, score) in enumerate(top_similar_courses):
    # Scores come from a float matrix; show them as whole keyword counts.
    print(f"{idx+1}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


Looking for similar courses to course  PA017   -   Information Systems Management 
Top 10 courses with highest keyword intersection with course 101:
1. Course  CORE013  -  Software Development: from an idea to working solution  - Intersection score: 4
2. Course  PA013  -  Software Testing and Analysis  - Intersection score: 3
3. Course  PA187  -  Project managment and project  - Intersection score: 3
4. Course  PB007  -  Software Engineering I  - Intersection score: 3
5. Course  PV260  -  Software Quality  - Intersection score: 3
6. Course  PB175  -  Project managment and project  - Intersection score: 2
7. Course  PV167  -  Seminar on Design and Architecture Patterns  - Intersection score: 2
8. Course  SA200  -  Internship - Software Engineering  - Intersection score: 2
9. Course  SB100  -  Bachelor Internship - Programming and Development  - Intersection score: 2
10. Course  IA159  -  Formal Methods for Software Analysis  - Intersection score: 1


In [67]:
def ratings_similarity(courses):
    """
    Build a pairwise rating-difference matrix for the given courses.

    Entry ``[i, j]`` is the sum of absolute differences between the integer
    ratings of course ``i`` and course ``j``, taken over the rating keys that
    both courses have. Keys present in only one of the two courses are
    skipped and add nothing to the total.

    A smaller value means more alike rating profiles; the diagonal is 0.
    """
    count = len(courses)
    distances = np.zeros((count, count))

    for row, left in tqdm(enumerate(courses), total=count):
        left_ratings = left["RATINGS"]
        for col, right in enumerate(courses):
            right_ratings = right["RATINGS"]
            # Ratings are stored as strings, so convert before subtracting.
            distances[row, col] = sum(
                abs(int(left_ratings[key]) - int(right_ratings[key]))
                for key in left_ratings
                if key in right_ratings
            )

    return distances

# Despite the name, this holds differences: entry [i, j] is the summed
# absolute rating difference between courses i and j, so lower means more similar.
ratings_sim = ratings_similarity(courses)

100%|██████████| 379/379 [00:00<00:00, 1337.74it/s]


In [68]:
# find_top_courses ranks by descending score, so pass the negated difference
# matrix to get the smallest rating differences first.
ratings_similar_courses = find_top_courses(course_idx, -ratings_sim)

query_course = courses[course_idx]
print(f"Searching similar courses to course {query_course['CODE']} - {query_course['NAME']}")

print("\nTop 10 courses with most similar ratings profile:")
for rank, (course_id, score) in enumerate(ratings_similar_courses, start=1):
    similar = courses[course_id]
    # Undo the negation applied above before showing the difference.
    print(f"{rank}. Course {similar['CODE']} - {similar['NAME']} - Rating difference: {int(-score)}")


Searching similar courses to course  PA017  -  Information Systems Management 

Top 10 courses with most similar ratings profile:
1. Course  IB114  -  Introduction to Programming and Algorithms II  - Rating difference: 2
2. Course  CORE013  -  Software Development: from an idea to working solution  - Rating difference: 3
3. Course  MA012  -  Statistics II  - Rating difference: 3
4. Course  MV013  -  Statistics for Computer Science  - Rating difference: 3
5. Course  PA176  -  Architecture of Digital Systems II  - Rating difference: 3
6. Course  PV269  -  Advanced methods in bioinformatics  - Rating difference: 3
7. Course  MB143  -  Design and analysis of statistical experiments  - Rating difference: 4
8. Course  MB153  -  Statistics I  - Rating difference: 4
9. Course  PV027  -  Optimization  - Rating difference: 4
10. Course  PV241  -  Enterprise and Financial Management  - Rating difference: 4


In [70]:
# Fetch the raw rating dicts of the query course and of the top entry in the
# ratings ranking, to compare them side by side.
PA017_ratings, IB114_ratings = (
    courses[ctoi[code]]['RATINGS'] for code in ('PA017', 'IB114')
)

PA017_ratings, IB114_ratings

({'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '3',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '4',
  'project_based': '5',
  'creative': '4'},
 {'theoretical_vs_practical': '5',
  'usefulness': '8',
  'interest': '7',
  'stem_vs_humanities': '2',
  'abstract_vs_specific': '6',
  'difficulty': '6',
  'multidisciplinary': '3',
  'project_based': '5',
  'creative': '4'})