In [19]:
import os
import json
import numpy as np
from nltk.stem import PorterStemmer
from tqdm import tqdm

In [20]:
# Load every generated course file. The listing is sorted because os.listdir
# returns entries in arbitrary order, which would otherwise make the
# `courses[0]` selection below depend on the filesystem.
files = sorted(os.listdir('../data/generated'))
courses = []
for file in files:
    # JSON text is UTF-8; do not rely on the platform's default encoding.
    with open(f'../data/generated/{file}', 'r', encoding='utf-8') as f:
        courses.append(json.load(f))

# Only the payload of the first file is used from here on.
# NOTE(review): presumably the directory holds a single file containing a
# list of course dicts -- confirm; any additional files are loaded but ignored.
courses = courses[0]

def stem_keywords(courses):
    """Attach a set of stemmed keywords to every course.

    For each course, every entry of ``course['KEYWORDS']`` is lower-cased and
    passed through a Porter stemmer; the resulting set is stored under
    ``course['STEMMED_KEYWORDS']``.

    Args:
        courses: Sized iterable of dict-like courses, each with a 'KEYWORDS'
            entry whose items are strings.

    Returns:
        The same ``courses`` object; the course dicts are modified in place.
    """
    stemmer = PorterStemmer()
    for entry in tqdm(courses, total=len(courses)):
        stemmed = set()
        for raw_keyword in entry['KEYWORDS']:
            stemmed.add(stemmer.stem(raw_keyword.lower()))
        # A set removes duplicate stems and allows `&` intersections later.
        entry['STEMMED_KEYWORDS'] = stemmed
    return courses

# stem_keywords adds 'STEMMED_KEYWORDS' to each course dict in place and
# returns the same list, so this rebinding does not create a copy.
courses = stem_keywords(courses)

100%|██████████| 379/379 [00:00<00:00, 6894.13it/s]


In [21]:
def keyword_intersection(courses):
    """Build the pairwise keyword-overlap matrix for a list of courses.

    Args:
        courses: Sequence of dict-like courses, each carrying a
            'STEMMED_KEYWORDS' set.

    Returns:
        A square numpy array where entry ``[i, j]`` is the number of stemmed
        keywords shared by ``courses[i]`` and ``courses[j]``. The diagonal
        therefore holds each course's own keyword count.
    """
    n_courses = len(courses)
    intersects = np.zeros((n_courses, n_courses))

    for row, left in tqdm(enumerate(courses), total=n_courses):
        left_keywords = left['STEMMED_KEYWORDS']
        # Fill one full row at a time: overlap of `left` with every course.
        intersects[row] = [
            len(left_keywords & right['STEMMED_KEYWORDS']) for right in courses
        ]

    return intersects

# Square matrix of shared stemmed-keyword counts: row i, column j compares
# courses[i] with courses[j].
kwd_intersects = keyword_intersection(courses)

100%|██████████| 379/379 [00:00<00:00, 6361.90it/s]


In [24]:
def find_top_courses(course_idx, matrix, n=10):
    """Return the ``n`` courses most similar to ``course_idx``.

    Args:
        course_idx: Row index of the query course in ``matrix``.
        matrix: Square similarity matrix; ``matrix[i][j]`` is the score
            between course ``i`` and course ``j``.
        n: Maximum number of neighbours to return (default 10).

    Returns:
        A list of ``(index, score)`` tuples sorted by descending score, with
        ties kept in ascending index order. The query course itself is never
        included.
    """
    intersection_scores = matrix[course_idx]

    # Normalise a negative index so the self-exclusion test below still matches.
    self_idx = course_idx if course_idx >= 0 else course_idx + len(intersection_scores)

    # Exclude the query course explicitly. Dropping the first sorted element
    # is wrong whenever another course ties the self-score (e.g. its keywords
    # are a superset of the query's, or the query has no keywords at all):
    # the stable sort may then rank that other course first and the query
    # course would be returned as its own neighbour.
    course_scores = [
        (i, score) for i, score in enumerate(intersection_scores) if i != self_idx
    ]
    # sorted() is stable, also with reverse=True, so ties keep index order.
    sorted_courses = sorted(course_scores, key=lambda x: x[1], reverse=True)

    return sorted_courses[:max(n, 0)]


# Position in `courses` of the course to find neighbours for.
# NOTE: this is a hard-coded list index, not a lookup by course code.
course_idx = 101
top_similar_courses = find_top_courses(course_idx, kwd_intersects)

print("Looking for similar courses to course", courses[course_idx]['CODE'], " - ", courses[course_idx]['NAME'])

# Report the number of results actually returned instead of a literal 10, so
# the header stays correct if fewer courses exist or `n` is changed.
print(f"Top {len(top_similar_courses)} courses with highest keyword intersection with course {course_idx}:")
for rank, (course_id, score) in enumerate(top_similar_courses, start=1):
    # int() so the count prints without a trailing ".0".
    print(f"{rank}. Course {courses[course_id]['CODE']} - {courses[course_id]['NAME']} - Intersection score: {int(score)}")


Looking for similar courses to course  PA017   -   Information Systems Management 
Top 10 courses with highest keyword intersection with course 101:
1. Course  CORE013  -  Software Development: from an idea to working solution  - Intersection score: 4
2. Course  PA013  -  Software Testing and Analysis  - Intersection score: 3
3. Course  PA187  -  Project managment and project  - Intersection score: 3
4. Course  PB007  -  Software Engineering I  - Intersection score: 3
5. Course  PV260  -  Software Quality  - Intersection score: 3
6. Course  PB175  -  Project managment and project  - Intersection score: 2
7. Course  PV167  -  Seminar on Design and Architecture Patterns  - Intersection score: 2
8. Course  SA200  -  Internship - Software Engineering  - Intersection score: 2
9. Course  SB100  -  Bachelor Internship - Programming and Development  - Intersection score: 2
10. Course  IA159  -  Formal Methods for Software Analysis  - Intersection score: 1
