In [1]:
import os
import numpy as np
import pandas as pd
import os
import logging

import scripts.helpers as helpers
helpers.add_backend_to_path()

import app.courses as courses

# Course client backed by the backend's course assets; `df` is its course table.
course_client = courses.CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))
df = course_client.df

# Embedding matrix under evaluation; swap the commented line to try the other file.
embeddings_file = os.path.join("data", "embeddings", "embeddings_tomas_03.npy")
#embeddings_file = os.path.join("data", "embeddings", "embeds_from_catalogue.npy")
embeddings = np.load(embeddings_file)

print(embeddings.shape)
print(df.shape)

# The embeddings and the course client are always handed to the recommenders
# together; presumably vectors are looked up by course row position (TODO confirm),
# in which case a row-count mismatch would silently pair courses with the wrong
# vectors. Fail early instead.
assert embeddings.shape[0] == df.shape[0], (
    f"{embeddings_file} has {embeddings.shape[0]} rows but the course table has {df.shape[0]}"
)

(21106, 768)
(21106, 30)


In [None]:
from IPython.display import display_html

def compare_results(df1, df2, title1="Method 1", title2="Method 2", columns=["CODE", "FACULTY", "NAME"]):
    """
    Display two dataframes side by side with titles for comparison.

    An empty dataframe (for example a recommender that returned no courses,
    which yields a frame without any columns) is rendered as a "No results."
    note instead of raising ``KeyError`` on the column selection.

    Args:
        df1: First dataframe to display
        df2: Second dataframe to display
        title1: Title for the first dataframe
        title2: Title for the second dataframe
        columns: Columns to display from the dataframes. Any sequence of
            column names is accepted; it is copied to a list before use.

    Raises:
        KeyError: If a non-empty dataframe lacks one of ``columns``.
    """
    # pandas treats a tuple passed to df[...] as a single key, so normalise to
    # a list. Copying also guarantees the shared default list is never touched.
    selected_columns = list(columns)

    def _panel(frame, title):
        """Return the HTML for one titled table."""
        if frame.empty:
            table_html = '<p><em>No results.</em></p>'
        else:
            styled = frame[selected_columns].style.set_properties(**{'text-align': 'left'})
            # Public rendering API. The _repr_html_() hook can return None
            # depending on pandas' "styler.render.repr" option.
            table_html = styled.to_html()
        return (
            f'<div>'
            f'<h3>{title}:</h3>'
            f'{table_html}'
            f'</div>'
        )

    # Two equally wide grid columns, one panel in each.
    display_html(
        '<div style="display: grid; grid-template-columns: 1fr 1fr; width: 100%; gap: 20px;">'
        + _panel(df1, title1)
        + _panel(df2, title2)
        + '</div>',
        raw=True
    )

In [54]:
def compare_results_general(*dfs_with_titles, columns=["CODE", "FACULTY", "NAME"]):
    """
    Display multiple dataframes side by side with titles for comparison.

    An empty dataframe (for example a recommender that returned no courses,
    which yields a frame without any columns) is rendered as a "No results."
    note instead of raising ``KeyError`` on the column selection.

    Args:
        *dfs_with_titles: A variable number of tuples, where each tuple
                          contains a dataframe and its title (e.g., (df1, "Title 1"), (df2, "Title 2")).
        columns: Columns to display from the dataframes. Any sequence of
                 column names is accepted; it is copied to a list before use.

    Raises:
        KeyError: If a non-empty dataframe lacks one of ``columns``.
    """
    if not dfs_with_titles:
        print("No dataframes to display.")
        return

    # pandas treats a tuple passed to df[...] as a single key, so normalise to
    # a list. Copying also guarantees the shared default list is never touched.
    selected_columns = list(columns)

    panels = []
    for frame, title in dfs_with_titles:
        if frame.empty:
            table_html = '<p><em>No results.</em></p>'
        else:
            styled = frame[selected_columns].style.set_properties(**{'text-align': 'left'})
            # Public rendering API. The _repr_html_() hook can return None
            # depending on pandas' "styler.render.repr" option.
            table_html = styled.to_html()
        panels.append(
            f'<div>'
            f'<h3>{title}:</h3>'
            f'{table_html}'
            f'</div>'
        )

    # One equally wide grid column per dataframe.
    display_html(
        f'<div style="display: grid; grid-template-columns: repeat({len(panels)}, 1fr); width: 100%; gap: 20px;">'
        f'{"".join(panels)}'
        f'</div>',
        raw=True
    )

In [3]:
# def recommend_mmr(
#   liked_codes: list[str],
#   disliked_codes: list[str],
#   skipped_codes: list[str],
#   all_embeds: np.ndarray,
#   courseClient,
#   n: int = 10,
#   lambda_param: float = 0.7
# ) -> list[dict]:
#   # … same setup as before …
#   liked_indices = courseClient.get_course_ids_by_codes(liked_codes)
#   if not liked_indices:
#     return []
#   liked_avg = np.mean(all_embeds[liked_indices], axis=0)
#   if disliked_codes:
#     disliked_indices = courseClient.get_course_ids_by_codes(disliked_codes)
#     disliked_avg = np.mean(all_embeds[disliked_indices], axis=0)
#     target_embed = liked_avg - 0.5 * disliked_avg
#   else:
#     target_embed = liked_avg

#   # 1) compute raw distances and raw target‐similarities
#   distances = np.linalg.norm(all_embeds - target_embed, axis=1)
#   sim_to_target = 1.0 / (1.0 + distances)

#   excluded = set(liked_codes + disliked_codes + skipped_codes)

#   # 2) build initial candidate list, sorted by descending sim_to_target
#   candidate_idxs = [
#     i for i in np.argsort(-sim_to_target)
#   ][:(max(n, 100) + len(excluded))]

#   excluded_idxs = courseClient.get_course_ids_by_codes(excluded)
#   candidate_idxs = [
#     c for c in candidate_idxs
#     if c not in excluded_idxs
#   ]

#   # 3) MMR re‐ranking loop
#   selected_idxs: list[int] = []
#   while len(selected_idxs) < n and candidate_idxs:
#     # Get current candidate and liked embeddings
#     current_candidate_embeds = all_embeds[candidate_idxs]
#     # liked_embeds can be calculated once outside the loop if liked_indices is static
#     liked_embeds = all_embeds[liked_indices]

#     # 1) Relevance term (vectorized)
#     rel_vector = sim_to_target[candidate_idxs]

#     # 2) Diversity term (vectorized)
#     # Calculate distances between each candidate and all liked embeddings
#     # Shape: (len(candidate_idxs), len(liked_indices))
#     distances_cl = np.linalg.norm(
#         current_candidate_embeds[:, None, :] - liked_embeds[None, :, :],
#         axis=2
#     )
#     # Convert distances to similarities
#     similarities_cl = 1.0 / (1.0 + distances_cl)
#     # Calculate diversity for each candidate (max similarity to any liked item)
#     # Shape: (len(candidate_idxs),)
#     div_vector = np.max(similarities_cl, axis=1)

#     # 3) Calculate MMR scores (vectorized)
#     mmr_scores_vector = lambda_param * rel_vector - (1 - lambda_param) * div_vector

#     # 4) Find the index *within candidate_idxs* corresponding to the max score
#     max_score_local_idx = np.argmax(mmr_scores_vector)

#     # 5) Get the actual course index (ID) with the highest score
#     next_idx = candidate_idxs[max_score_local_idx]

#     # 6) Add the best candidate to selected list and remove from candidates
#     selected_idxs.append(next_idx)
#     candidate_idxs.pop(max_score_local_idx) # More efficient than remove() when we have the index


#   # 4) fetch the courses in the final order
#   recommendations: list[dict] = []
#   for idx in selected_idxs:
#     course = courseClient.get_course_by_id(idx)
#     if course:
#       # you can still store the original distance or sim in an attribute
#       course.SIMILARITY = 1 / (1 + float(distances[idx]))
#       recommendations.append(course)

#   return recommendations


In [4]:
# def recommend_max(
#   liked_codes: list[str],
#   disliked_codes: list[str],
#   skipped_codes: list[str],
#   all_embeds: np.ndarray,
#   courseClient,
#   n: int = 10,
# ) -> list[dict]:
#   """
#   Most similar to any of the liked courses, based on cosine similarity
#   """
#   excluded = set(liked_codes + disliked_codes + skipped_codes)

#   liked_indices = courseClient.get_course_ids_by_codes(liked_codes)
#   disliked_indices = courseClient.get_course_ids_by_codes(disliked_codes)
#   excluded_indices = courseClient.get_course_ids_by_codes(excluded)

#   liked_embeds = all_embeds[liked_indices]
#   disliked_embeds = all_embeds[disliked_indices]

#   # 1. calculate overall similarity
#   candidate_embeds_norm = all_embeds / np.linalg.norm(all_embeds, axis=1, keepdims=True)
#   liked_embeds_norm = liked_embeds / np.linalg.norm(liked_embeds, axis=1, keepdims=True)
#   # Shape: (len(candidate_idxs), len(liked_indices))
#   similarity_liked = np.dot(candidate_embeds_norm, liked_embeds_norm.T)

#   # 2. select best match for each course
#   best_match_liked = np.max(similarity_liked, axis=1)

#   # 3. filter out courses that are too similar
#   if disliked_embeds.shape[0] > 0:
#     disliked_embeds_norm = disliked_embeds / np.linalg.norm(disliked_embeds, axis=1, keepdims=True)
#     similarity_disliked = np.dot(candidate_embeds_norm, disliked_embeds_norm.T)
#     best_match_disliked = np.max(similarity_disliked, axis=1)

#     to_filter_idx = np.where(best_match_disliked > 0.9)[0]
#     best_match_liked[to_filter_idx] = -np.inf

#   # 4. get indices of top n courses
#   selected_idxs = np.argsort(-best_match_liked)[:(n + len(excluded))]
#   selected_idxs = [i for i in selected_idxs if i not in excluded_indices]

#   # 5. fetch the courses in the final order
#   recommendations: list[dict] = []
#   for idx in selected_idxs:
#     course = courseClient.get_course_by_id(idx)
#     if course:
#       # Optionally, attach the similarity score
#       course.SIMILARITY = float(best_match_liked[idx])
#       recommendations.append(course)

#   return recommendations

In [5]:
from app.recommend.embeddings import recommend_mmr_cos
from app.recommend.baseline import recommend_courses_baseline

# Scenario: a single liked course and a handful of disliked ones; nothing skipped.
SKIPPED_CODES = []

# The baseline can do very well in some situations, e.g. Brazdil: same teaching
# methods, and all of the courses are AI courses.
LIKED_CODES = ["IB031"]
# It can also do very badly when:
# - a teacher teaches a wider spectrum of courses (e.g. IB015)
# - the courses taught by that teacher run out
DISLIKED_CODES = ["PV027", "PV021", "IA168", "IV125", "IV133"]

# Notes on the BASELINE results:
# - not surprising at all, zero serendipity, not very useful
# - overall no better than simply looking the teacher up in IS

# Embedding-based recommendations (MMR over cosine similarity).
rec_mmr = pd.DataFrame(
    recommend_mmr_cos(
        liked_codes=LIKED_CODES,
        disliked_codes=DISLIKED_CODES,
        skipped_codes=SKIPPED_CODES,
        all_embeds=embeddings,
        courseClient=course_client,
        n=10,
        lambda_param=0.8,
    )
)

# Baseline recommendations; note that no embeddings are passed to this call.
rec_baseline = pd.DataFrame(
    recommend_courses_baseline(
        liked=LIKED_CODES,
        disliked=DISLIKED_CODES,
        skipped=SKIPPED_CODES,
        courseClient=course_client,
        n=10,
    )
)

compare_results(
    rec_mmr,
    rec_baseline,
    "mmr",
    "baseline",
    ["CODE", "FACULTY", "NAME", "TEACHERS", "DEPARTMENT"],
)

Unnamed: 0,CODE,FACULTY,NAME,TEACHERS,DEPARTMENT
0,MKF_AIIF,ESF,AI in Finance,"Lyócsa, Š.",KFin
1,MPF_AIIF,ESF,AI in Finance,"Lyócsa, Š. - Lyócsa, Š. - Vondráček, D.",KFin
2,Bi9680en,PřF,"Artificial Intelligence in Biology, Chemistry, and Bioengineering","Damborský, J. - Mazurenko, S. - Haddadi, F. - Kohout, P. - Velecký, J.",ÚEB
3,E0034,PřF,Analýza a klasifikace dat,"Koriťáková, E. - Holčík, J. - Vyškovský, R. - Jurková, T.",RECETOX
4,PA164,FI,Machine learning and natural language processing,"Nováček, V.",KSUZD
5,M8DM1,PřF,Data mining I,"Navrátil, R.",ÚMS
6,PV115,FI,Laboratoř dobývání znalostí,"Popelínský, L.",KSUZD
7,E7490,PřF,Pokročilé neparametrické metody,"Komprdová, K.",RECETOX
8,PV211,FI,Introduction to Information Retrieval,"Sojka, P. - Fajčík, M. - Vrabcová, T. - Ščavnická, Š. - Štefánik, M. - Toma, M. - Starý Novotný, V.",KVI
9,dc4004,FSpS,Aplikovaná matematická statistika,"Sebera, M.",HEALTH

Unnamed: 0,CODE,FACULTY,NAME,TEACHERS,DEPARTMENT
0,PA053,FI,Distributed Systems and Middleware,"Batko, M.",KSUZD
1,IV129,FI,Laboratoř Sybila,"Šafránek, D. - Brim, L.",KSUZD
2,PV254,FI,Recommender Systems,"Pelánek, R.",KSUZD
3,PV290,FI,Chemoinformatics,"Svobodová, R. - Horský, V. - Raček, T.",KSUZD
4,PA195,FI,NoSQL Databases,"Dohnal, V. - Bártek, L.",KSUZD
5,PV072,FI,Seminář z asistivních technologií,"Plhák, J.",KSUZD
6,IV127,FI,Seminář laboratoře adaptabilní výuky,"Pelánek, R.",KSUZD
7,PA036,FI,Projekt z databázových systémů,"Dohnal, V. - Procházka, D.",KSUZD
8,PA107,FI,Corpus Tools Project,"Rychlý, P.",KSUZD
9,PA154,FI,Language Modeling,"Rychlý, P. - Nevěřilová, Z.",KSUZD


In [6]:
# Fall back to a default code when a list is empty (an empty list crashes here).
LIKED_CODES = LIKED_CODES or ["MV008"]
DISLIKED_CODES = DISLIKED_CODES or ["PB007"]

# Fetch the course for every liked / disliked code through the course client.
liked_courses_with_scores = pd.DataFrame(
    list(map(course_client.get_course_by_code, LIKED_CODES))
)
disliked_courses_with_scores = pd.DataFrame(
    list(map(course_client.get_course_by_code, DISLIKED_CODES))
)

compare_results(
    liked_courses_with_scores,
    disliked_courses_with_scores,
    "liked",
    "disliked",
    ["CODE", "FACULTY", "NAME", "TEACHERS", "DEPARTMENT"],
)

Unnamed: 0,CODE,FACULTY,NAME,TEACHERS,DEPARTMENT
0,IB031,FI,Úvod do strojového učení,"Brázdil, T. - Čechák, J. - Čechová, M. - Foltýnek, T. - Gregora, F. - Kecskésová, M. - Nováček, V. - Pavlík, T. - Wernerová, P.",KSUZD

Unnamed: 0,CODE,FACULTY,NAME,TEACHERS,DEPARTMENT
0,PV027,FI,Optimization,"Brázdil, T. - Musil, V. - Kurečka, M.",KSUZD
1,PV021,FI,Neural Networks,"Brázdil, T. - Foltýnek, T. - Gallo, M. - Bajger, A. - Ivora, A. - Zelina, P. - Kubín, J. - Kraus, J. - Lejdar, L. - Šimurka, A.",KSUZD
2,IA168,FI,Algorithmic game theory,"Brázdil, T. - Balabán, J. - Klaška, D. - Kurečka, M. - Žáček, M.",KSUZD
3,IV125,FI,Formela lab seminar,"Kučera, A. - Brázdil, T. - Řehák, V. - Blumensath, A. - Obdržálek, J. - Novotný, P. - Jonáš, M. - Musil, V. - Klaška, D. - Anselm Paulus",KTP
4,IV133,FI,Seminář laboratoře RationAI,"Brázdil, T. - Musil, V.",KTP


In [51]:
# Imported here because this cell is the first one (top to bottom) that uses `sp`;
# without it a fresh "Restart & Run All" stops on this cell with a NameError.
import scipy.sparse as sp

assets = "assets"  # NOTE(review): not used by this cell; kept in case other cells rely on it

# Sparse keyword-intersection matrices, one per keyword source (Gemini / TF-IDF).
kwd_intersects_gemini = sp.load_npz("../web/backend/assets/intersects_sparse.npz")
kwd_intersects_tfidf = sp.load_npz("../web/backend/assets/intersects_tfidf.npz")

In [218]:
import importlib

import app.recommend.embeddings as rec_embeddings
# rec_keywords is used further down in this cell; import it here so the cell
# does not depend on some other cell having been run first (NameError on a
# fresh kernel otherwise).
import app.recommend.keywords as rec_keywords

# Reload so that edits to the recommender modules are picked up without a kernel restart.
importlib.reload(rec_embeddings)
importlib.reload(rec_keywords)

# Defaults; a scenario below only needs to override what it uses.
LIKED_CODES = []
DISLIKED_CODES = []
SKIPPED_CODES = []

# --- Scenarios tried so far (uncomment one block to re-run it) ---

# LIKED_CODES = ["IB015", "IB016", "IB002", "MB152", "AEB_A14b", "DUCIT", "p952"] # Tereza
# DISLIKED_CODES = ["IB000", "PV080"]
# Very diverse set of interests - Functional programming, Algorithms, Mathematical Analysis, Archeology, Teaching Lab, Juggling

# LIKED_CODES = ["IB015", "IB016", "IB002", "MB152", "p952", "IB111", "PB156", "PB162", "IB005", "IB031", "AEB_A14b"]
# DISLIKED_CODES = ["IB000", "PV080"]
# A lot of courses:
#  - The Average_embed algorithm becomes useless - it starts recommending nonsensical courses that have nothing to do with the liked ones (Proseminar on the study of language, Methodology of Czech language 2, Theory of Instruction...)
#  - The other algorithms are OK (even keywords)

# LIKED_CODES = ["MB151", "IV111", "MB153", "PV027", "PB156", "PB152", "PB152cv"]
# DISLIKED_CODES = ["MB154", "MB152"]

# DISLIKED_CODES = ["PB156", "PB162", "PB152", "IB031", "MB152", "PB152cv", "PV005", "MB151", "MB153" ,"ISKM72", "IB005", "p983", "IB000ext", "PB071", "IB114"]
# LIKED_CODES = ["PB161"]
# DISLIKED_CODES = ["PB162", "C2160"]

# LIKED_CODES = ["C7073", "Bi4010", "IB111"] # bioinformatics

# LIKED_CODES = ["C7073", "IB111"] # bioinformatics
# Max cannot combine interests, even when the combination makes sense
# Average can combine, but has problems when applied to too many liked courses
# LIKED_CODES = ["IB015", "IB016", "IB002", "MB152", "p952", "IB111", "PB156", "PB162", "IB005", "IB031", "AEB_A14b"]
# DISLIKED_CODES = ["IB000", "PV080"]
# It starts recommending courses that are not related to the liked ones at all (Theory of Instruction, Using Corpora in Teaching Czech as a Foreign Language, Didactics of Czech Language and Literature II...)

# These two assignments were live but immediately overwritten by the active
# scenario, so they never took effect; kept as alternatives.
# LIKED_CODES = ["CORE059", "np4412", "BSPC011", "BZPO011c", "VLPP3X1"]
# LIKED_CODES = ["IB031", "PV021", "PB016" ,"PV061"]

# LIKED_CODES = ["Bi4010", "IB111", "IB015", "MB152", "p952", "IB111", "PB156", "PB162", "IB005", "IB031", "AEB_A14b"]

# LIKED_CODES = ["BIp001", "IB002", "IB005"] # bioinformatics
# LIKED_CODES = ["IB005", "MA010", "IA159", "PV021", "IB031"]
# LIKED_CODES = ["IB031", "CORE016"]
# LIKED_CODES = ["IV124", "PV021", "IV109"]
# LIKED_CODES = ["PV021", "NLI_04"]

# LIKED_CODES = ["PB071", "CORE052"]
# DISLIKED_CODES = ["C2160", "PV065"]

# LIKED_CODES = ['IB111', "MV008", "IB031"]
# DISLIKED_CODES = ["PB007"]

# --- Active scenario ---
LIKED_CODES = ["IB111"]

# lambda_param=1 is what the "Average_embed" column label below refers to.
rec_mmr = pd.DataFrame(rec_embeddings.recommend_mmr_cos(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=30,
    lambda_param=1
))

rec_max_with_combs = pd.DataFrame(rec_embeddings.recommend_max_with_combinations(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=30,
))

rec_max = pd.DataFrame(rec_embeddings.recommend_max(
    liked_codes=LIKED_CODES,
    disliked_codes=DISLIKED_CODES,
    skipped_codes=SKIPPED_CODES,
    all_embeds=embeddings,
    courseClient=course_client,
    n=30,
))

# Same keyword recommender, fed with the two different keyword-intersection matrices.
rec_kwd_gemini = pd.DataFrame(rec_keywords.recommend_courses_keywords(
    liked=LIKED_CODES,
    disliked=DISLIKED_CODES,
    skipped=SKIPPED_CODES,
    courseClient=course_client,
    n=30,
    kwd_intersects=kwd_intersects_gemini
))

rec_kwd_tfidf = pd.DataFrame(rec_keywords.recommend_courses_keywords(
    liked=LIKED_CODES,
    disliked=DISLIKED_CODES,
    skipped=SKIPPED_CODES,
    courseClient=course_client,
    n=30,
    kwd_intersects=kwd_intersects_tfidf
))

# rec_mmr and rec_max are still computed above but currently left out of the view.
compare_results_general(
    # (rec_mmr, "Average_embed"), 
    # (rec_max, "Max"),
    (rec_max_with_combs, "Max with combinations"),
    (rec_kwd_gemini, "Keywords Gemini"),
    (rec_kwd_tfidf, "Keywords TF-IDF"),
    columns=["CODE", "NAME_EN", "SIMILARITY", "RECOMMENDED_FROM"]
)

[KYS] Filtered out 21 courses that are too similar to liked ones
Filtered out 2 courses that are too similar to liked ones


Unnamed: 0,CODE,NAME_EN,SIMILARITY,RECOMMENDED_FROM
0,C2184,Introduction to programming in Python,0.848422,['IB111']
1,F1420,Programming in Python,0.828709,['IB111']
2,PLIN048,Introduction to programming for humanities,0.819631,['IB111']
3,IB002,Algorithms and data structures I,0.813036,['IB111']
4,E3011,Algorithmization and programming,0.810996,['IB111']
5,ISKM72,Basics of Algorithmic Thinking,0.805376,['IB111']
6,ISKM80,Python for non-programmers,0.804476,['IB111']
7,IB114,Introduction to Programming and Algorithms II,0.797866,['IB111']
8,MDA204,Introduction to Python,0.789193,['IB111']
9,IV104,Programming Seminar,0.785168,['IB111']

Unnamed: 0,CODE,NAME_EN,SIMILARITY,RECOMMENDED_FROM
0,IB113,Introduction to Programming and Algorithms,7,['IB111']
1,ISKM80,Python for non-programmers,5,['IB111']
2,PLIN048,Introduction to programming for humanities,5,['IB111']
3,IB002,Algorithms and data structures I,4,['IB111']
4,IB114,Introduction to Programming and Algorithms II,4,['IB111']
5,E3011,Algorithmization and programming,4,['IB111']
6,ISKM72,Basics of Algorithmic Thinking,3,['IB111']
7,IV104,Programming Seminar,3,['IB111']
8,PB175,Project managment and project,3,['IB111']
9,SZ6103,,3,['IB111']

Unnamed: 0,CODE,NAME_EN,SIMILARITY,RECOMMENDED_FROM
0,IB113,Introduction to Programming and Algorithms,9,['IB111']
1,ISKM72,Basics of Algorithmic Thinking,6,['IB111']
2,IB002,Algorithms and data structures I,6,['IB111']
3,IB114,Introduction to Programming and Algorithms II,6,['IB111']
4,IV003,Algorithms and Data Structures II,6,['IB111']
5,ISKM80,Python for non-programmers,5,['IB111']
6,C2142,Design of algorithms in life sciences,5,['IB111']
7,C2184,Introduction to programming in Python,5,['IB111']
8,F1420,Programming in Python,5,['IB111']
9,PLIN081,Advanced machine learning methods,4,['IB111']


In [50]:
import scipy.sparse as sp
import app.recommend.embeddings as rec_embeddings
import app.recommend.keywords as rec_keywords
import importlib

# Refresh both recommender modules so recent source edits take effect.
for _module in (rec_embeddings, rec_keywords):
    importlib.reload(_module)

# Active scenario ("Tereza"); nothing is skipped.
LIKED_CODES = ["IB015", "IB016", "IB002", "MB152", "AEB_A14b", "DUCIT", "p952"] # Tereza
DISLIKED_CODES = ["IB000", "PV080"]
SKIPPED_CODES = []

# Other scenarios (uncomment to use):
# LIKED_CODES = ["C7073", "Bi4010", "IB111"] # bioinformatics
# LIKED_CODES = ["BIp001", "IB002", "IB005"] # bioinformatics
# LIKED_CODES = ["IB005", "MA010", "IA159", "PV021", "PV027"]
# LIKED_CODES = ["IB031", "CORE016"]

# LIKED_CODES = ["PV021", "NLI_04"]

# LIKED_CODES = ["IV124", "PV021", "IV109"]

# LIKED_CODES = ["PB071", "CORE052"]
# DISLIKED_CODES = ["C2160", "PV065"]

# LIKED_CODES = ["F3170", "F3100"]

# LIKED_CODES = ['IB111', "MV008", "IB031"]
# DISLIKED_CODES = ["PB007"]

# Run the keyword recommender once per keyword-intersection matrix
# (Gemini first, then TF-IDF), with otherwise identical arguments.
rec_kwd_gemini, rec_kwd_tfidf = (
    pd.DataFrame(
        rec_keywords.recommend_courses_keywords(
            liked=LIKED_CODES,
            disliked=DISLIKED_CODES,
            skipped=SKIPPED_CODES,
            courseClient=course_client,
            n=30,
            kwd_intersects=intersects,
        )
    )
    for intersects in (kwd_intersects_gemini, kwd_intersects_tfidf)
)

compare_results(
    rec_kwd_gemini,
    rec_kwd_tfidf,
    "gemini_keywords",
    "tf_idf_keywords",
    ["CODE", "FACULTY", "NAME", "RECOMMENDED_FROM", "SIMILARITY"],
)

Unnamed: 0,CODE,FACULTY,NAME,RECOMMENDED_FROM,SIMILARITY
0,IB114,FI,Úvod do programování a algoritmizace II,"['IB002', 'IB015']",11
1,C2143,PřF,Seminář k návrhu algoritmů pro přírodovědce,"['IB002', 'IB015']",9
2,IV003,FI,Algorithms and Data Structures II,"['IB002', 'IB015']",8
3,MB142,FI,Aplikovaná matematická analýza,"['MB152', 'IB015']",8
4,MIN201,PřF,Matematika II,"['MB152', 'IB015']",7
5,C2142,PřF,Návrh algoritmů pro přírodovědce,"['IB002', 'IB015']",6
6,MA015,FI,Graph Algorithms,"['IB002', 'IB015']",6
7,IB111,FI,Základy programování,"['IB002', 'IB015']",5
8,AEB_A06a,FF,Základy studia pravěku a rané doby dějinné,"['AEB_A14b', 'IB015']",5
9,AEB_A14c,FF,Doba bronzová ve střední Evropě,"['AEB_A14b', 'IB015']",5

Unnamed: 0,CODE,FACULTY,NAME,RECOMMENDED_FROM,SIMILARITY
0,IB114,FI,Úvod do programování a algoritmizace II,"['IB002', 'IB015']",15
1,IA014,FI,Advanced Functional Programming,"['IB015', 'IB016']",13
2,MB142,FI,Aplikovaná matematická analýza,"['MB152', 'IB015']",12
3,IB111,FI,Základy programování,"['IB002', 'IB015']",12
4,C2142,PřF,Návrh algoritmů pro přírodovědce,"['IB002', 'IB015']",12
5,IV003,FI,Algorithms and Data Structures II,"['IB002', 'IB016']",11
6,IA010,FI,Principles of Programming Languages,"['IB015', 'IB016']",10
7,MIN201,PřF,Matematika II,"['MB152', 'IB015']",10
8,C2143,PřF,Seminář k návrhu algoritmů pro přírodovědce,"['IB002', 'IB015']",10
9,F6060,PřF,Programování zkouška,"['IB002', 'IB015']",9


In [None]:
# Compare embeddings

from app.recommend.embeddings import recommend_average

# Load the two embedding files that are compared against each other.
embeddings_1, embeddings_2 = (
    np.load(os.path.join("data", "embeddings", file_name))
    for file_name in ("embeddings_tomas_03.npy", "embeds_from_catalogue.npy")
)

# Base scenario:
# LIKED_CODES = ["PV197", "IB031", "CORE047"]
# DISLIKED_CODES = ["MB151", "MB152", "MB154"]
# Other scenarios:
# LIKED_CODES = ["C7073", "Bi4010", "IB002"] # bioinformatics
# LIKED_CODES = ["IB111"]

# Active scenario: the base lists with RLB666 added to the liked courses.
LIKED_CODES = ["PV197", "IB031", "CORE047", "RLB666"]
DISLIKED_CODES = ["MB151", "MB152", "MB154"]
SKIPPED_CODES = []

# Same averaging recommender and same inputs; only the embedding matrix differs.
rec_avg_1, rec_avg_2 = (
    pd.DataFrame(
        recommend_average(
            liked_codes=LIKED_CODES,
            disliked_codes=DISLIKED_CODES,
            skipped_codes=SKIPPED_CODES,
            all_embeds=embedding_matrix,
            courseClient=course_client,
            n=10,
        )
    )
    for embedding_matrix in (embeddings_1, embeddings_2)
)

compare_results(rec_avg_1, rec_avg_2, "Embeddings 03", "Embeddings base")

Unnamed: 0,CODE,FACULTY,NAME
0,PV242,FI,Inovace a podnikání
1,XV004,PřF,Od nápadu k podnikání
2,MKH_PODN,ESF,Business
3,MPH_PODN,ESF,Podnikání
4,BKH_ZAPO,ESF,Základy podnikání
5,BPH_ZAPO,ESF,Základy podnikání
6,BKF_TEZP,ESF,Teze závěrečné práce
7,BPE_TEBP,ESF,Teze bakalářské práce
8,BPH_BAS2,ESF,Bakalářský seminář 2
9,BPF_TEBP,ESF,Teze bakalářské práce

Unnamed: 0,CODE,FACULTY,NAME
0,XV004,PřF,Od nápadu k podnikání
1,SLAV01,FF,Seminář k magisterské diplomové práci I
2,SLAV01S,FF,Seminář k magisterské diplomové práci I
3,PV242,FI,Inovace a podnikání
4,BPF_TEBP,ESF,Teze bakalářské práce
5,BPE_TEBP,ESF,Teze bakalářské práce
6,SLAV02,FF,Seminář k magisterské diplomové práci II
7,BKF_TEZP,ESF,Teze závěrečné práce
8,BKF_SZP1,ESF,Seminář k závěrečné práci 1
9,RLBcA014,FF,Seminář k bakalářské práci I
