In [2]:
import os
import numpy as np
import pandas as pd
import os
import logging

import scripts.helpers as helpers
helpers.add_backend_to_path()

import app.courses as courses

course_client = courses.CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))
df = course_client.df

embeddings_file = os.path.join("data", "embeddings", "embeddings_tomas_03.npy")
#embeddings_file = os.path.join("data", "embeddings", "embeds_from_catalogue.npy")
embeddings = np.load(embeddings_file)

print(embeddings.shape)
print(df.shape)

(21106, 768)
(21106, 30)


In [3]:
from IPython.display import display_html

def compare_results(df1, df2, title1="Method 1", title2="Method 2", columns=["CODE", "FACULTY", "NAME"]):
    """
    Display two dataframes side by side with titles for comparison.
    
    Args:
        df1: First dataframe to display
        df2: Second dataframe to display
        title1: Title for the first dataframe
        title2: Title for the second dataframe
        columns: Columns to display from the dataframes
    """
    # Format both DataFrames with styling
    styled_df1 = df1[columns].style.set_properties(**{'text-align': 'left'})
    styled_df2 = df2[columns].style.set_properties(**{'text-align': 'left'})

    # Convert to HTML strings
    html1 = styled_df1._repr_html_()
    html2 = styled_df2._repr_html_()

    # Display side by side with titles
    display_html(
        f'<div style="display: grid; grid-template-columns: 1fr 1fr; width: 100%; gap: 20px;">'
        f'<div>'
        f'<h3>{title1}:</h3>'
        f'{html1}'
        f'</div>'
        f'<div>'
        f'<h3>{title2}:</h3>'
        f'{html2}'
        f'</div>'
        f'</div>',
        raw=True
    )

In [4]:
def recommend_average(
    liked_codes: list[str],
    disliked_codes: list[str],
    skipped_codes: list[str],
    all_embeds: np.ndarray,
    courseClient,
    n: int = 10
) -> list[dict]:
    """
    Recommends courses based on the average of liked embeddings minus the average of disliked embeddings.
    
    Args:
        liked_codes: List of course codes that the user likes
        disliked_codes: List of course codes that the user dislikes
        skipped_codes: List of course codes to skip in recommendations
        all_embeds: Array of all course embeddings
        courseClient: Client for retrieving course information
        n: Number of recommendations to return
        
    Returns:
        List of recommended courses with similarity scores
    """
    # Get indices of liked and disliked courses
    liked_indices = [i for i, code in enumerate(df['CODE']) if code in liked_codes]
    disliked_indices = [i for i, code in enumerate(df['CODE']) if code in disliked_codes]
    
    # Skip empty sets
    if not liked_indices:
        logging.warning("No liked courses found in the dataset")
        return []
    
    # Calculate average embeddings
    liked_avg = np.mean(all_embeds[liked_indices], axis=0)
    
    # If there are disliked courses, subtract their average from the liked average
    if disliked_indices:
        disliked_avg = np.mean(all_embeds[disliked_indices], axis=0)
        target_embedding = liked_avg - disliked_avg*0.5
    else:
        target_embedding = liked_avg
    
    distances = np.linalg.norm(all_embeds - target_embedding, axis=1)
    
    # Create a list of (index, distance) tuples and sort by distance (ascending)
    indices_with_distances = [(i, dist) for i, dist in enumerate(distances)]
    indices_with_distances.sort(key=lambda x: x[1])
    
    # Filter out liked, disliked, and skipped courses
    excluded_codes = set(liked_codes + disliked_codes + skipped_codes)
    recommendations = []
    
    for i in range(len(indices_with_distances)):
        if len(recommendations) >= n:
            break
            
        idx, distance = indices_with_distances[i]
        code = df['CODE'].iloc[idx]
        if code in excluded_codes:
            continue
        course = courseClient.get_course_by_code(code)
        if not course:
            continue

        # Convert distance to similarity (lower distance = higher similarity)
        similarity = 1.0 / (1.0 + distance)  # Simple conversion to a 0-1 scale
        course.SIMILARITY = similarity
        recommendations.append(course)
    
    return recommendations


In [5]:
def recommend_with_mmr(
  liked_codes: list[str],
  disliked_codes: list[str],
  skipped_codes: list[str],
  all_embeds: np.ndarray,
  courseClient,
  n: int = 10,
  lambda_param: float = 0.7
) -> list[dict]:
  # … same setup as before …
  liked_indices = [i for i, code in enumerate(df['CODE']) if code in liked_codes]
  if not liked_indices:
    logging.warning("No liked courses found in the dataset")
    return []
  liked_avg = np.mean(all_embeds[liked_indices], axis=0)
  if disliked_codes:
    dis_indices = [i for i, code in enumerate(df['CODE']) if code in disliked_codes]
    disliked_avg = np.mean(all_embeds[dis_indices], axis=0)
    target_embed = liked_avg - 0.5 * disliked_avg
  else:
    target_embed = liked_avg

  # 1) compute raw distances and raw target‐similarities
  distances = np.linalg.norm(all_embeds - target_embed, axis=1)
  sim_to_target = 1.0 / (1.0 + distances)

  excluded = set(liked_codes + disliked_codes + skipped_codes)

  # 2) build initial candidate list, sorted by descending sim_to_target
  candidate_idxs = [
    i for i in np.argsort(-sim_to_target)
    if df['CODE'].iloc[i] not in excluded
  ][:100]

  # 3) MMR re‐ranking loop
  selected_idxs: list[int] = []
  while len(selected_idxs) < n and candidate_idxs:
    mmr_scores: list[tuple[int, float]] = []
    for i in candidate_idxs:
      # relevance term
      rel = sim_to_target[i]
      # diversity term: max similarity to any already selected
      sims_to_sel = 1.0 / (
        1.0 + np.linalg.norm(
          all_embeds[i][None, :] - all_embeds[liked_indices],
          axis=1
        )
      )
      div = np.max(sims_to_sel)
      score = lambda_param * rel - (1 - lambda_param) * div
      mmr_scores.append((i, score))

    # pick the one with largest MMR score
    next_idx, _ = max(mmr_scores, key=lambda x: x[1])
    selected_idxs.append(next_idx)
    candidate_idxs.remove(next_idx)

  # 4) fetch the courses in the final order
  recommendations: list[dict] = []
  for idx in selected_idxs:
    code = df['CODE'].iloc[idx]
    course = courseClient.get_course_by_code(code)
    if course:
      # you can still store the original distance or sim in an attribute
      course.SIMILARITY = float(distances[idx])
      recommendations.append(course)

  return recommendations


In [None]:
from app.recommend.embeddings import recommend_courses

DISLIKED_CODES = []

#LIKED_CODES = ["PV197", "IB031", "CORE047"] # base
#DISLIKED_CODES = ["MB151", "MB152", "MB154"] # base
LIKED_CODES = ["C7073", "Bi4010", "IB002"] # bioinformatics
#LIKED_CODES = ["IB111"]
SKIPPED_CODES = []

rec_base = pd.DataFrame(recommend_with_mmr(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=10,
    lambda_param=0.8
))

rec_avg = pd.DataFrame(recommend_average(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=10
))

compare_results(
    rec_base, 
    rec_avg, 
    title1="base", 
    title2="average",
    columns=["CODE", "FACULTY", "NAME", "SIMILARITY"]
)


Unnamed: 0,CODE,FACULTY,NAME,SIMILARITY
0,PB173,FI,Tematicky zaměřený vývoj aplikací,0.730219
1,PV278,FI,Vývoj intuitivních uživatelských rozhraní,0.743375
2,MDAX01,PřF,Bachelor's practice projects,0.740629
3,PV182,FI,Human-Computer Interaction,0.741724
4,PLIN081,FF,Pokročilé metody strojového učení,0.720745
5,ISKB82,FF,AI v praxi: Dovednosti pro budoucnost a lepší služby,0.741155
6,PV177,FI,Laboratory of Advanced Network Technologies,0.732999
7,PV207,FI,Business Process Management,0.738564
8,PV258,FI,Software Requirements Engineering,0.742187
9,BKM_VIBD,ESF,Vizualizace businessových dat,0.73837

Unnamed: 0,CODE,FACULTY,NAME,SIMILARITY
0,XV004,PřF,Od nápadu k podnikání,0.589702
1,PV242,FI,Inovace a podnikání,0.589418
2,Bi9680en,PřF,"Artificial Intelligence in Biology, Chemistry, and Bioengineering",0.58719
3,PV227,FI,GPU Rendering,0.584771
4,PV115,FI,Laboratoř dobývání znalostí,0.583738
5,MKH_PODN,ESF,Business,0.583437
6,MPH_PODN,ESF,Podnikání,0.583311
7,PA228,FI,Machine Learning in Image Processing,0.583171
8,PA164,FI,Machine learning and natural language processing,0.583115
9,BKM_DAMI,ESF,Datamining,0.582491


In [17]:
# Compare embeddings

embeddings_1 = np.load(os.path.join("data", "embeddings", "embeddings_tomas_03.npy"))
embeddings_2 = np.load(os.path.join("data", "embeddings", "embeds_from_catalogue.npy"))

LIKED_CODES = ["PV197", "IB031", "CORE047", "RLB666"]
DISLIKED_CODES = ["MB151", "MB152", "MB154"]
SKIPPED_CODES = []

rec_avg_1 = pd.DataFrame(recommend_average(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings_1,
    courseClient=course_client,
    n=10
))

rec_avg_2 = pd.DataFrame(recommend_average(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings_2,
    courseClient=course_client,
    n=10   
))

compare_results(
    rec_avg_1,  
    rec_avg_2,
    title1="Embeddings 03",
    title2="Embeddings base"
)

Unnamed: 0,CODE,FACULTY,NAME
0,PV242,FI,Inovace a podnikání
1,XV004,PřF,Od nápadu k podnikání
2,MKH_PODN,ESF,Business
3,MPH_PODN,ESF,Podnikání
4,BKH_ZAPO,ESF,Základy podnikání
5,BPH_ZAPO,ESF,Základy podnikání
6,BKF_TEZP,ESF,Teze závěrečné práce
7,BPE_TEBP,ESF,Teze bakalářské práce
8,BPH_BAS2,ESF,Bakalářský seminář 2
9,BPF_TEBP,ESF,Teze bakalářské práce

Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,SLAV01,FF,Seminář k magisterské diplomové práci I
2,SLAV01S,FF,Seminář k magisterské diplomové práci I
3,PV242,FI,Inovace a podnikání
4,BPF_TEBP,ESF,Teze bakalářské práce
5,BPE_TEBP,ESF,Teze bakalářské práce
6,SLAV02,FF,Seminář k magisterské diplomové práci II
7,BKF_TEZP,ESF,Teze závěrečné práce
8,BKF_SZP1,ESF,Seminář k závěrečné práci 1
9,RLBcA014,FF,Seminář k bakalářské práci I
