In [1]:

import os
import sys

# Resolve ../web/backend (relative to the current working directory) to an
# absolute path so the backend's `app` package can be imported below.
module_path = os.path.abspath(
    os.path.join('..', 'web', 'backend')
)

# Register the directory on the import path once; re-running the cell is a no-op.
already_registered = module_path in sys.path
if not already_registered:
    sys.path.append(module_path)

In [3]:

import numpy as np
import app.courses as courses
# Load the course catalogue through the backend's CourseClient.
cs_client = courses.CourseClient("../web/backend/assets/courses")
data = cs_client.all_courses()

# Load the precomputed course embeddings.
# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle arbitrary
# objects, which can execute code from a tampered file. It is kept here so an
# object-dtype file still loads; if embeds_all.npy is a plain numeric array
# (its 2-D shape suggests so), switch to allow_pickle=False -- confirm.
embeddings = np.load("../web/backend/assets/embeds_all.npy", allow_pickle=True)

# Later cells pair data[i] with row i of the embeddings (colours, hover labels),
# so a length mismatch would silently mislabel every point.
assert len(data) == embeddings.shape[0], (
    f"{len(data)} courses but {embeddings.shape[0]} embedding rows"
)

print(f'Loaded {len(data)} courses')
print(f'Embedding shape: {embeddings.shape}')

Loaded 21106 courses
Embedding shape: (21106, 768)


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Project the embeddings onto their first two principal components.
# random_state is pinned because svd_solver='auto' (the default) may select the
# randomized solver for an input of this size, in which case the projection
# would differ slightly from run to run.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(embeddings)

# Scatter plot of the 2-D projection, one small marker per embedding row.
plt.scatter(X_pca[:, 0], X_pca[:, 1], s=2)
plt.title("PCA Visualization of Embeddings")
plt.xlabel("PC 1")
plt.ylabel("PC 2")
plt.show()


In [4]:
from sklearn.manifold import TSNE
import pandas as pd

# Reduce to 2D using t-SNE.
# t-SNE is stochastic: without a fixed random_state every run yields a
# different layout, so the plots below could not be regenerated after a
# kernel restart. Pin the seed to make the embedding reproducible.
tsne = TSNE(n_components=2, perplexity=30, verbose=1, random_state=42)
X_tsne = tsne.fit_transform(embeddings)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 21106 samples in 0.016s...
[t-SNE] Computed neighbors for 21106 samples in 4.145s...
[t-SNE] Computed conditional probabilities for sample 1000 / 21106
[t-SNE] Computed conditional probabilities for sample 2000 / 21106
[t-SNE] Computed conditional probabilities for sample 3000 / 21106
[t-SNE] Computed conditional probabilities for sample 4000 / 21106
[t-SNE] Computed conditional probabilities for sample 5000 / 21106
[t-SNE] Computed conditional probabilities for sample 6000 / 21106
[t-SNE] Computed conditional probabilities for sample 7000 / 21106
[t-SNE] Computed conditional probabilities for sample 8000 / 21106
[t-SNE] Computed conditional probabilities for sample 9000 / 21106
[t-SNE] Computed conditional probabilities for sample 10000 / 21106
[t-SNE] Computed conditional probabilities for sample 11000 / 21106
[t-SNE] Computed conditional probabilities for sample 12000 / 21106
[t-SNE] Computed conditional probabilities for sam

In [None]:
import math
# Back-of-the-envelope n * ln(n) figure for roughly 21k samples.
approx_sample_count = 21000
print(approx_sample_count * math.log(approx_sample_count))

208997.83205081674
4.301029995663981


In [12]:
# Display the shape of the t-SNE output as a quick sanity check.
X_tsne.shape

(21106, 2)

In [9]:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd

# Local import: converts matplotlib RGB(A) tuples to '#rrggbb' strings.
from matplotlib.colors import to_hex

# Pull the per-course attributes used for colouring and hover labels.
faculties = [item.FACULTY for item in data]
codes = [item.CODE for item in data]
names = [item.NAME for item in data]

# Sort the distinct faculties. A bare set() iterates in hash order, which
# changes between kernel sessions for strings and would reshuffle both the
# legend and the fallback colours on every run. key=str keeps the sort from
# failing should a course carry a non-string faculty (e.g. None).
unique_faculties = sorted(set(faculties), key=str)

# Fixed colour for each known faculty.
custom_colors = {
    'FI': '#f2d45c',
    'FF': '#4bc8ff',
    'FSS': '#008c78',
    'ESF': '#b9006e',
    'PrF': '#9100dc',
    'LF': '#f01928',
    'PdF': '#ff7300',
    'FaF': '#56788d',
    'FSpS': '#5ac8af',
    'CST': '#0031e7',
    'PřF': '#00af3f'
}

# Faculties without a fixed colour get one from the tab20 palette. Palette
# entries are RGB float tuples, so they are converted to hex strings: every
# value in custom_colors is then the same type and can be passed both to
# matplotlib and to Plotly's color_discrete_map.
default_colors = plt.cm.tab20.colors
unmapped_faculties = [f for f in unique_faculties if f not in custom_colors]
for i, faculty in enumerate(unmapped_faculties):
    custom_colors[faculty] = to_hex(default_colors[i % len(default_colors)])

# One colour per course, in the same order as `data`.
colors = [custom_colors[faculty] for faculty in faculties]

In [None]:

# Static scatter of the t-SNE layout, one marker per row, coloured by faculty.
plt.figure(figsize=(12, 8))
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=colors, s=5, alpha=0.7)

# Proxy handles: the scatter is drawn with per-point colours, so it has no
# per-faculty artists to build a legend from. One dummy marker per faculty
# stands in for it.
legend_elements = [
    plt.Line2D([0], [0], marker='o', color='w',
               markerfacecolor=custom_colors[faculty],
               markersize=8, label=faculty)
    for faculty in unique_faculties
]

# loc names the legend corner that is pinned to bbox_to_anchor. 'upper left'
# at (1.05, 1) places the legend just outside the right edge of the axes.
# loc='best' would let matplotlib choose the corner (so the legend could end up
# over the data) and is slow with this many points.
plt.legend(handles=legend_elements, title="Faculties", loc='upper left',
           bbox_to_anchor=(1.05, 1), fontsize='small')

plt.title("t-SNE Visualization of Embeddings by Faculty")
plt.xlabel("Dimension 1")
plt.ylabel("Dimension 2")
plt.tight_layout()
plt.show()

In [11]:
import plotly.express as px

# Assemble one row per course: the two t-SNE coordinates plus the metadata
# shown on hover (requires X_tsne, faculties, codes and names from earlier cells).
df = pd.DataFrame(
    X_tsne[:, :2],
    columns=['Dimension 1', 'Dimension 2'],
).assign(Faculty=faculties, Codes=codes, Names=names)

# Plotly wants a plain {category: colour} mapping; a shallow copy of
# custom_colors is exactly that.
color_discrete_map = dict(custom_colors)

# Interactive scatter, coloured by faculty, with course code and name on hover.
fig = px.scatter(
    df,
    x='Dimension 1',
    y='Dimension 2',
    color='Faculty',
    hover_data=['Codes', 'Names'],
    color_discrete_map=color_discrete_map,
    height=1200,
    title="t-SNE Visualization of Embeddings by Faculty (Interactive)",
)

fig.show()





In [None]:
import pandas as pd

# Export course labels and embedding vectors as two row-aligned TSV files.
# Reads `data` and `embeddings` (loaded near the top of the notebook) so the
# cell runs after Restart & Run All; no cell defines `course_data` or
# `course_embeddings`.
course_labels = [f"{course.CODE}-{course.NAME}" for course in data]
course_data_df = pd.DataFrame(course_labels)

# NOTE(review): index=False drops the row index, but the file still starts with
# a header line "0" (the auto-generated column name), so it has one more line
# than course_embeddings.tsv. If the consumer expects exactly one label per
# vector, pass header=False as well -- confirm against the consumer.
course_data_df.to_csv('course_data.tsv', sep='\t', index=False)

# One tab-separated vector per line, in the same order as the labels.
np.savetxt('course_embeddings.tsv', embeddings, delimiter='\t')