In [1]:
import os
import numpy as np
import pandas as pd
import os
import logging

import scripts.helpers as helpers
helpers.add_backend_to_path()

import app.courses as courses

# Course catalogue client; `df` is the dataframe it exposes and is reused by later cells.
course_client = courses.CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))
df = course_client.df

# Embedding matrix under evaluation. The commented-out path is an alternative embeddings file.
embeddings_file = os.path.join("data", "embeddings", "embeddings_tomas_02.npy")
#embeddings_file = os.path.join("data", "embeddings", "embeds_from_catalogue.npy")
embeddings = np.load(embeddings_file)

print(embeddings.shape)
print(df.shape)

# recommend_average indexes `embeddings` with row positions taken from `df`,
# so a row-count mismatch would silently pair courses with the wrong vectors.
assert embeddings.shape[0] == df.shape[0], (
    f"Embeddings rows ({embeddings.shape[0]}) do not match course rows ({df.shape[0]})"
)

(21106, 768)
(21106, 30)


In [7]:
from IPython.display import display_html

def compare_results(df1, df2, title1="Method 1", title2="Method 2", columns=["CODE", "FACULTY", "NAME"]):
    """
    Render two dataframes next to each other, each under its own heading.

    Args:
        df1: Dataframe rendered in the first grid column
        df2: Dataframe rendered in the second grid column
        title1: Heading placed above the first dataframe
        title2: Heading placed above the second dataframe
        columns: Columns selected from both dataframes before rendering
    """
    def _panel(title, frame):
        # Select the requested columns, left-align the cells and wrap the
        # rendered table in a <div> that carries the heading.
        styler = frame[columns].style.set_properties(**{'text-align': 'left'})
        table_html = styler._repr_html_()
        return f'<div><h3>{title}:</h3>{table_html}</div>'

    # Two equally wide grid columns hold the two panels.
    grid_open = '<div style="display: grid; grid-template-columns: 1fr 1fr; width: 100%; gap: 20px;">'
    panels = _panel(title1, df1) + _panel(title2, df2)

    display_html(grid_open + panels + '</div>', raw=True)

In [2]:
def recommend_average(
    liked_codes: list[str],
    disliked_codes: list[str],
    skipped_codes: list[str],
    all_embeds: np.ndarray,
    courseClient,
    n: int = 10,
    disliked_weight: float = 0.5,
) -> list[dict]:
    """
    Recommends courses closest (by Euclidean distance) to the average liked embedding,
    shifted away from the average disliked embedding.

    The course codes are read from ``courseClient.df['CODE']`` rather than from a
    notebook-level global, so the function only depends on its arguments.
    Row ``i`` of ``all_embeds`` is treated as the embedding of row ``i`` of that dataframe.

    Args:
        liked_codes: List of course codes that the user likes
        disliked_codes: List of course codes that the user dislikes
        skipped_codes: List of course codes to skip in recommendations
        all_embeds: Array of all course embeddings, one row per course
        courseClient: Client exposing ``df`` (with a ``CODE`` column) and
            ``get_course_by_code(code)``
        n: Number of recommendations to return
        disliked_weight: Factor applied to the average disliked embedding before it
            is subtracted from the average liked embedding

    Returns:
        Up to ``n`` objects returned by ``courseClient.get_course_by_code``, ordered by
        ascending distance, each with a ``SIMILARITY`` attribute set to
        ``1 / (1 + distance)``. Empty if none of the liked codes is in the dataset.
    """
    codes = courseClient.df['CODE'].tolist()

    # Sets give constant-time membership tests while scanning the whole catalogue
    liked_set = set(liked_codes)
    disliked_set = set(disliked_codes)

    # Row positions of liked and disliked courses
    liked_indices = [i for i, code in enumerate(codes) if code in liked_set]
    disliked_indices = [i for i, code in enumerate(codes) if code in disliked_set]

    # Without any liked course there is no target to search around
    if not liked_indices:
        logging.warning("No liked courses found in the dataset")
        return []

    target_embedding = np.mean(all_embeds[liked_indices], axis=0)

    # Push the target away from the disliked courses, if there are any
    if disliked_indices:
        disliked_avg = np.mean(all_embeds[disliked_indices], axis=0)
        target_embedding = target_embedding - disliked_avg * disliked_weight

    # Euclidean distance of every course to the target
    distances = np.linalg.norm(all_embeds - target_embedding, axis=1)

    # Courses the user already rated or asked to skip are never recommended
    excluded_codes = liked_set | disliked_set | set(skipped_codes)
    recommendations = []

    # Stable sort keeps catalogue order among equally distant courses
    for idx in np.argsort(distances, kind="stable"):
        if len(recommendations) >= n:
            break

        code = codes[idx]
        if code in excluded_codes:
            continue
        course = courseClient.get_course_by_code(code)
        if not course:
            continue

        # Convert distance to similarity (lower distance = higher similarity)
        course.SIMILARITY = 1.0 / (1.0 + distances[idx])
        recommendations.append(course)

    return recommendations


In [9]:
from app.recommend.embeddings import recommend_courses

# Preference profile fed to both recommenders.
LIKED_CODES = ["PV197", "IB031", "CORE047"]
DISLIKED_CODES = ["MB151", "MB152", "MB154"]
SKIPPED_CODES = []

# Both recommenders are called with the same keyword arguments, so build them once.
shared_args = dict(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=10,
)

rec_base = pd.DataFrame(recommend_courses(**shared_args))
rec_avg = pd.DataFrame(recommend_average(**shared_args))

# Show the two result tables side by side.
compare_results(
    rec_base,
    rec_avg,
    title1="Standard recommendation method",
    title2="Average embedding recommendation method",
)


Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,BKM_OPRO,ESF,Optimalizace a rozhodování
2,BKM_DAMI,ESF,Datamining
3,PV115,FI,Laboratoř dobývání znalostí
4,M5170,PřF,Matematické programování
5,E0034,PřF,Analýza a klasifikace dat
6,F1400,PřF,Programování
7,PV242,FI,Inovace a podnikání
8,MPH_EVPP,ESF,Empirický výzkum pro podnikovou praxi
9,PV021,FI,Neural Networks

Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,PV242,FI,Inovace a podnikání
2,Bi9680en,PřF,"Artificial Intelligence in Biology, Chemistry, and Bioengineering"
3,PV115,FI,Laboratoř dobývání znalostí
4,PA164,FI,Machine learning and natural language processing
5,PV227,FI,GPU Rendering
6,PA228,FI,Machine Learning in Image Processing
7,MPH_PODN,ESF,Podnikání
8,BKM_DAMI,ESF,Datamining
9,MKH_PODN,ESF,Business
