In [9]:
import os
import numpy as np
import pandas as pd
import os
import logging

import scripts.helpers as helpers
helpers.add_backend_to_path()

import app.courses as courses

# Load the course catalogue through the backend client; `df` is the table it exposes.
course_client = courses.CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))
df = course_client.df

# Embedding matrix under test; swap the commented line to load the alternative file.
embeddings_file = os.path.join("data", "embeddings", "embeddings_tomas_03.npy")
#embeddings_file = os.path.join("data", "embeddings", "embeds_from_catalogue.npy")
embeddings = np.load(embeddings_file)

print(embeddings.shape)
print(df.shape)

# The recommenders below index `embeddings` with ids returned by `course_client`,
# so the two must have the same number of rows. Fail here instead of silently
# recommending the wrong courses further down.
assert embeddings.ndim == 2, f"expected a 2-D embedding matrix, got shape {embeddings.shape}"
assert embeddings.shape[0] == df.shape[0], (
    f"{embeddings_file} has {embeddings.shape[0]} rows but the course table has {df.shape[0]}"
)

(21106, 768)
(21106, 30)


In [10]:
from IPython.display import display_html

def compare_results(df1, df2, title1="Method 1", title2="Method 2", columns=None):
    """
    Display two dataframes side by side with titles for comparison.

    Args:
        df1: First dataframe to display
        df2: Second dataframe to display
        title1: Title for the first dataframe
        title2: Title for the second dataframe
        columns: Columns to display from the dataframes. Any iterable of column
            names (or a single name); defaults to ["CODE", "FACULTY", "NAME"].

    Returns:
        None. The comparison is rendered via IPython's ``display_html``.
    """
    # Resolve the default per call instead of sharing one mutable list between calls.
    if columns is None:
        shown_columns = ["CODE", "FACULTY", "NAME"]
    elif isinstance(columns, str):
        shown_columns = [columns]
    else:
        shown_columns = list(columns)

    def _to_html(frame):
        # Styler.to_html() always yields markup, whereas _repr_html_() may return
        # None depending on the `styler.render.repr` option.
        styled = frame[shown_columns].style.set_properties(**{'text-align': 'left'})
        return styled.to_html()

    html1 = _to_html(df1)
    html2 = _to_html(df2)

    # Display side by side with titles (two equal-width grid columns)
    display_html(
        f'<div style="display: grid; grid-template-columns: 1fr 1fr; width: 100%; gap: 20px;">'
        f'<div>'
        f'<h3>{title1}:</h3>'
        f'{html1}'
        f'</div>'
        f'<div>'
        f'<h3>{title2}:</h3>'
        f'{html2}'
        f'</div>'
        f'</div>',
        raw=True
    )

In [25]:
def recommend_mmr(
  liked_codes: list[str],
  disliked_codes: list[str],
  skipped_codes: list[str],
  all_embeds: np.ndarray,
  courseClient,
  n: int = 10,
  lambda_param: float = 0.7
) -> list[dict]:
  """
  Recommend courses by trading relevance to a target embedding against
  similarity to the liked courses.

  The target embedding is the mean of the liked embeddings, minus half the mean
  of the disliked embeddings when any disliked code resolves to a course.
  Similarity is ``1 / (1 + euclidean distance)``. Each candidate is scored as
  ``lambda_param * sim(candidate, target)
    - (1 - lambda_param) * max(sim(candidate, liked_i))``.

  Note: the penalty term is measured against the *liked* courses, not against
  the courses already selected (as textbook MMR does). The score of a candidate
  therefore never changes during selection, so it is computed once and the
  top ``n`` candidates are taken.

  Args:
    liked_codes: Codes of liked courses; they define the target embedding.
    disliked_codes: Codes of disliked courses; they push the target away.
    skipped_codes: Codes that must not be recommended.
    all_embeds: 2-D embedding matrix indexed by the ids that
      ``courseClient.get_course_ids_by_codes`` returns.
    courseClient: Object providing ``get_course_ids_by_codes(codes)`` and
      ``get_course_by_id(idx)``.
    n: Maximum number of recommendations.
    lambda_param: Weight of the relevance term (0..1).

  Returns:
    Up to ``n`` course objects as returned by ``courseClient.get_course_by_id``,
    best first, each with a ``SIMILARITY`` attribute holding its similarity to
    the target embedding. Empty when no liked code resolves to a course.
  """
  liked_indices = courseClient.get_course_ids_by_codes(liked_codes)
  if not liked_indices:
    return []
  liked_embeds = all_embeds[liked_indices]
  target_embed = np.mean(liked_embeds, axis=0)

  # Only apply the dislike term when at least one code resolved: the mean of an
  # empty selection is NaN and would poison every distance below.
  disliked_indices = (
    courseClient.get_course_ids_by_codes(disliked_codes) if disliked_codes else []
  )
  if len(disliked_indices) > 0:
    target_embed = target_embed - 0.5 * np.mean(all_embeds[disliked_indices], axis=0)

  # 1) distance / similarity of every course to the target
  distances = np.linalg.norm(all_embeds - target_embed, axis=1)
  sim_to_target = 1.0 / (1.0 + distances)

  # 2) candidate pool: the closest courses, minus everything already rated/skipped
  excluded = set(liked_codes + disliked_codes + skipped_codes)
  excluded_idxs = set(courseClient.get_course_ids_by_codes(list(excluded)))
  pool_size = max(n, 100) + len(excluded)
  candidate_idxs = [
    i for i in np.argsort(-sim_to_target)[:pool_size]
    if i not in excluded_idxs
  ]
  if not candidate_idxs:
    return []

  # 3) score every candidate once
  # Shape of the pairwise distances: (len(candidate_idxs), len(liked_indices))
  dist_to_liked = np.linalg.norm(
    all_embeds[candidate_idxs][:, None, :] - liked_embeds[None, :, :],
    axis=2
  )
  max_sim_to_liked = np.max(1.0 / (1.0 + dist_to_liked), axis=1)
  mmr_scores = (
    lambda_param * sim_to_target[candidate_idxs]
    - (1 - lambda_param) * max_sim_to_liked
  )

  # A stable sort keeps the earlier (more relevant) candidate on ties, which is
  # what repeatedly taking the first argmax would do. max(n, 0) keeps a negative
  # n from slicing from the end.
  best_first = np.argsort(-mmr_scores, kind="stable")[:max(n, 0)]
  selected_idxs = [candidate_idxs[j] for j in best_first]

  # 4) fetch the courses in the final order
  recommendations: list[dict] = []
  for idx in selected_idxs:
    course = courseClient.get_course_by_id(idx)
    if course:
      # expose the similarity to the target on the returned course object
      course.SIMILARITY = 1 / (1 + float(distances[idx]))
      recommendations.append(course)

  return recommendations


In [31]:
def recommend_max(
  liked_codes: list[str],
  disliked_codes: list[str],
  skipped_codes: list[str],
  all_embeds: np.ndarray,
  courseClient,
  n: int = 10,
  disliked_threshold: float = 0.9,
) -> list[dict]:
  """
  Recommend the courses most similar (cosine) to any single liked course.

  Every course is scored by its highest cosine similarity to a liked course.
  Courses whose highest cosine similarity to a disliked course exceeds
  ``disliked_threshold`` are dropped, as are liked, disliked and skipped
  courses themselves.

  Args:
    liked_codes: Codes of liked courses.
    disliked_codes: Codes of disliked courses.
    skipped_codes: Codes that must not be recommended.
    all_embeds: 2-D embedding matrix indexed by the ids that
      ``courseClient.get_course_ids_by_codes`` returns.
    courseClient: Object providing ``get_course_ids_by_codes(codes)`` and
      ``get_course_by_id(idx)``.
    n: Maximum number of recommendations.
    disliked_threshold: Cosine similarity to a disliked course above which a
      candidate is filtered out.

  Returns:
    At most ``n`` course objects as returned by ``courseClient.get_course_by_id``,
    best first, each with a ``SIMILARITY`` attribute holding its best cosine
    similarity to a liked course. Empty when no liked code resolves to a course.
  """
  def _unit_rows(matrix: np.ndarray) -> np.ndarray:
    # L2-normalise rows; all-zero rows stay zero instead of becoming NaN
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)

  excluded = set(liked_codes + disliked_codes + skipped_codes)

  liked_indices = courseClient.get_course_ids_by_codes(liked_codes)
  # Without liked courses there is nothing to compare against (and the max over
  # an empty axis below would raise).
  if len(liked_indices) == 0:
    return []
  disliked_indices = courseClient.get_course_ids_by_codes(disliked_codes)
  excluded_indices = set(courseClient.get_course_ids_by_codes(list(excluded)))

  liked_embeds = all_embeds[liked_indices]
  disliked_embeds = all_embeds[np.asarray(disliked_indices, dtype=np.intp)]

  # 1. cosine similarity of every course to every liked course
  # Shape: (len(all_embeds), len(liked_indices))
  candidate_embeds_norm = _unit_rows(all_embeds)
  similarity_liked = np.dot(candidate_embeds_norm, _unit_rows(liked_embeds).T)

  # 2. best liked match per course
  best_match_liked = np.max(similarity_liked, axis=1)

  # 3. knock out courses that are too close to something disliked
  if disliked_embeds.shape[0] > 0:
    similarity_disliked = np.dot(candidate_embeds_norm, _unit_rows(disliked_embeds).T)
    best_match_disliked = np.max(similarity_disliked, axis=1)
    best_match_liked[best_match_disliked > disliked_threshold] = -np.inf

  # 4. walk the ranking and keep the first n admissible courses, so the result
  #    never exceeds n and never contains excluded or filtered-out courses
  selected_idxs = []
  for idx in np.argsort(-best_match_liked, kind="stable"):
    if len(selected_idxs) >= n:
      break
    if idx in excluded_indices or not np.isfinite(best_match_liked[idx]):
      continue
    selected_idxs.append(idx)

  # 5. fetch the courses in the final order
  recommendations: list[dict] = []
  for idx in selected_idxs:
    course = courseClient.get_course_by_id(idx)
    if course:
      # attach the similarity score to the returned course object
      course.SIMILARITY = float(best_match_liked[idx])
      recommendations.append(course)

  return recommendations

In [33]:
from app.recommend.embeddings import recommend_courses

# Experiment inputs. DISLIKED_CODES is first set to an empty list and then
# overwritten by the "base" scenario two lines below; comment that line out to
# run without dislikes.
DISLIKED_CODES = []

LIKED_CODES = ["PV197", "IB031", "CORE047"] # base
DISLIKED_CODES = ["MB151", "MB152", "MB154"] # base
# Alternative liked sets, kept for quick switching:
#LIKED_CODES = ["C7073", "Bi4010", "IB002"] # bioinformatics
#LIKED_CODES = ["IB111"]
SKIPPED_CODES = []

# Left-hand table: recommend_mmr with lambda_param=0.8.
# NOTE: despite the name `rec_base`, this holds the MMR results.
rec_base = pd.DataFrame(recommend_mmr(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=10,
    lambda_param=0.8
))

# Right-hand table: recommend_max on the same inputs.
# NOTE: despite the name `rec_avg`, this holds the max-similarity results.
rec_avg = pd.DataFrame(recommend_max(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=10
))

# Render both result sets side by side.
compare_results(
    rec_base, 
    rec_avg, 
    title1="mmr", 
    title2="max",
    columns=["CODE", "FACULTY", "NAME"]
)


Unnamed: 0,CODE,FACULTY,NAME
0,PB173,FI,Tematicky zaměřený vývoj aplikací
1,PV278,FI,Vývoj intuitivních uživatelských rozhraní
2,MDAX01,PřF,Bachelor's practice projects
3,PV182,FI,Human-Computer Interaction
4,PLIN081,FF,Pokročilé metody strojového učení
5,ISKB82,FF,AI v praxi: Dovednosti pro budoucnost a lepší služby
6,PV177,FI,Laboratory of Advanced Network Technologies
7,PV207,FI,Business Process Management
8,PV258,FI,Software Requirements Engineering
9,BKM_VIBD,ESF,Vizualizace businessových dat

Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,MPH_PODN,ESF,Podnikání
2,MKH_PODN,ESF,Business
3,BKH_ZAPO,ESF,Základy podnikání
4,BPH_ZAPO,ESF,Základy podnikání
5,PV242,FI,Inovace a podnikání
6,Bi9680en,PřF,"Artificial Intelligence in Biology, Chemistry, and Bioengineering"
7,ENTRE01,FF,Od UČO k IČO: Úvod do podnikání
8,E0034,PřF,Analýza a klasifikace dat
9,np4059,FSpS,Podnikání ve sportu


In [14]:
# Compare embeddings
# Runs the same recommender on two embedding files and shows the results side by side.
# NOTE(review): `recommend_average` is not defined in any cell visible in this
# notebook (only `recommend_mmr` and `recommend_max` are), and this cell's
# execution count (14) predates those definitions. It presumably came from a
# cell that was since removed, so this cell would raise NameError on
# Restart & Run All - TODO confirm and restore or replace the function.

embeddings_1 = np.load(os.path.join("data", "embeddings", "embeddings_tomas_03.npy"))
embeddings_2 = np.load(os.path.join("data", "embeddings", "embeds_from_catalogue.npy"))

# This rebinds the notebook-wide LIKED_CODES / DISLIKED_CODES / SKIPPED_CODES.
LIKED_CODES = ["PV197", "IB031", "CORE047", "RLB666"]
DISLIKED_CODES = ["MB151", "MB152", "MB154"]
SKIPPED_CODES = []

# Results for embeddings_tomas_03.npy
rec_avg_1 = pd.DataFrame(recommend_average(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings_1,
    courseClient=course_client,
    n=10
))

# Results for embeds_from_catalogue.npy
rec_avg_2 = pd.DataFrame(recommend_average(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings_2,
    courseClient=course_client,
    n=10   
))

# Uses compare_results' default columns.
compare_results(
    rec_avg_1,  
    rec_avg_2,
    title1="Embeddings 03",
    title2="Embeddings base"
)

Unnamed: 0,CODE,FACULTY,NAME
0,PV242,FI,Inovace a podnikání
1,XV004,PřF,Od nápadu k podnikání
2,MKH_PODN,ESF,Business
3,MPH_PODN,ESF,Podnikání
4,BKH_ZAPO,ESF,Základy podnikání
5,BPH_ZAPO,ESF,Základy podnikání
6,BKF_TEZP,ESF,Teze závěrečné práce
7,BPE_TEBP,ESF,Teze bakalářské práce
8,BPH_BAS2,ESF,Bakalářský seminář 2
9,BPF_TEBP,ESF,Teze bakalářské práce

Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,SLAV01,FF,Seminář k magisterské diplomové práci I
2,SLAV01S,FF,Seminář k magisterské diplomové práci I
3,PV242,FI,Inovace a podnikání
4,BPF_TEBP,ESF,Teze bakalářské práce
5,BPE_TEBP,ESF,Teze bakalářské práce
6,SLAV02,FF,Seminář k magisterské diplomové práci II
7,BKF_TEZP,ESF,Teze závěrečné práce
8,BKF_SZP1,ESF,Seminář k závěrečné práci 1
9,RLBcA014,FF,Seminář k bakalářské práci I
