In [10]:
import numpy as np
import os

import scripts.helpers as helpers
helpers.add_backend_to_path()

import app.courses as courses

# Course client pointed at the backend's course assets directory.
cs_client = courses.CourseClient(os.path.join("..", "web", "backend", "assets", "courses"))

data = cs_client.all_courses()

# SECURITY NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects,
# which can execute code if the .npy file has been tampered with. It is kept here so the
# existing file keeps loading; if the array is plain numeric (TODO confirm), switch to
# allow_pickle=False.
embeddings = np.load(os.path.join("data", "embeddings", "embeddings_tomas_03.npy"), allow_pickle=True)

# Later cells pair embeddings[i] with data[i] purely by position, so fail early
# (with a clear message) if the two sources are out of sync.
if len(data) != embeddings.shape[0]:
    raise ValueError(
        f'Course/embedding mismatch: {len(data)} courses vs {embeddings.shape[0]} embedding rows'
    )

print(f'Loaded {len(data)} courses')
print(f'Embedding shape: {embeddings.shape}')

Loaded 21106 courses
Embedding shape: (21106, 768)


In [11]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Reduce to 2D with PCA.
# NOTE: `pca` holds the projected coordinates (one 2-D point per embedding row),
# not the fitted PCA estimator.
# random_state pins the result for the cases where scikit-learn selects a randomized
# SVD solver; it has no effect when a deterministic solver is used.
pca = PCA(n_components=2, random_state=42).fit_transform(embeddings)

In [12]:
from sklearn.decomposition import PCA

# Reduce to 50 dimensions with PCA.
# random_state keeps the projection reproducible if a randomized SVD solver is selected.
pca_50 = PCA(n_components=50, random_state=42)
X_pca_50 = pca_50.fit_transform(embeddings)
# NOTE(review): X_pca_50 does not appear to be consumed by the other cells visible in
# this notebook (t-SNE is fitted on the raw embeddings) - confirm whether that is intended.

print(f'Original shape: {embeddings.shape}')
print(f'Reduced shape: {X_pca_50.shape}')

Original shape: (21106, 768)
Reduced shape: (21106, 50)


In [13]:
from sklearn.manifold import TSNE

# Reduce to 2D using t-SNE.
# t-SNE is stochastic: without a fixed random_state every run yields a different layout,
# so the plot and the exported coordinates could not be regenerated.
tsne = TSNE(n_components=2, perplexity=30, random_state=42, verbose=1)
X_tsne = tsne.fit_transform(embeddings)

[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 21106 samples in 0.027s...
[t-SNE] Computed neighbors for 21106 samples in 9.691s...
[t-SNE] Computed conditional probabilities for sample 1000 / 21106
[t-SNE] Computed conditional probabilities for sample 2000 / 21106
[t-SNE] Computed conditional probabilities for sample 3000 / 21106
[t-SNE] Computed conditional probabilities for sample 4000 / 21106
[t-SNE] Computed conditional probabilities for sample 5000 / 21106
[t-SNE] Computed conditional probabilities for sample 6000 / 21106
[t-SNE] Computed conditional probabilities for sample 7000 / 21106
[t-SNE] Computed conditional probabilities for sample 8000 / 21106
[t-SNE] Computed conditional probabilities for sample 9000 / 21106
[t-SNE] Computed conditional probabilities for sample 10000 / 21106
[t-SNE] Computed conditional probabilities for sample 11000 / 21106
[t-SNE] Computed conditional probabilities for sample 12000 / 21106
[t-SNE] Computed conditional probabilities for sam

In [14]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import math

from matplotlib.colors import to_hex

# Extract per-course fields; every list is index-aligned with `data`
faculties = [item.FACULTY for item in data]
codes = [item.CODE for item in data]
names = [item.NAME for item in data]
student_count = [int(item.STUDENTS_ENROLLED) for item in data]
# Sorted (set iteration order is not stable between kernel restarts) so the fallback
# colour assignment below is reproducible. key=str tolerates non-string faculty values.
unique_faculties = sorted(set(faculties), key=str)

# Define custom colors for each faculty
custom_colors = {
    'FI': '#f2d45c',
    'FF': '#4bc8ff',
    'FSS': '#008c78',
    'ESF': '#b9006e',
    'PrF': '#9100dc',
    'LF': '#f01928',
    'PdF': '#ff7300',
    'FaF': '#56788d',
    'FSpS': '#5ac8af',
    'CST': '#0031e7',
    'PřF': '#00af3f'
}

# For any faculties not explicitly defined above, assign colors from a default palette.
# The palette entries are RGB float tuples; convert them to hex strings so every value in
# custom_colors has the same format as the hand-picked ones (the mapping is later handed
# to Plotly and written to JSON).
default_colors = plt.cm.tab20.colors
unmapped_faculties = [faculty for faculty in unique_faculties if faculty not in custom_colors]
for i, faculty in enumerate(unmapped_faculties):
    custom_colors[faculty] = to_hex(default_colors[i % len(default_colors)])

# Map faculties to their colors
colors = [custom_colors[faculty] for faculty in faculties]

In [15]:

import plotly.express as px

# Marker size: square root of the enrolment clamped to [0, 1000], so that very large
# courses do not dwarf the rest. Despite the `_log` suffix this is a sqrt scale.
# The lower clamp (same as in the t-SNE plot cell) keeps math.sqrt from raising
# ValueError should a negative count ever appear.
student_count_log = [math.sqrt(max(0, min(1000, count))) for count in student_count]

# One row per course: the 2-D PCA coordinates (`pca`) plus the metadata lists built earlier
df = pd.DataFrame({
    'Dimension 1': pca[:, 0],
    'Dimension 2': pca[:, 1],
    'Faculty': faculties,
    'Codes': codes,
    'Names': names,
    'Student Count': student_count,
    'size': student_count_log,
})

# Faculty name -> color code, in the form Plotly expects (a copy of custom_colors)
color_discrete_map = dict(custom_colors)

# Create an interactive scatter plot
fig = px.scatter(
    df,
    x='Dimension 1',
    y='Dimension 2',
    color='Faculty',
    size='size',
    color_discrete_map=color_discrete_map,
    title="PCA Visualization of Embeddings by Faculty and Student Count (Interactive)",
    height=1000,
    hover_data=['Codes', 'Names', "Student Count"]
)

fig.show()





In [16]:
import plotly.express as px

# Bubble size per course: sqrt of the enrolment after clamping it to [0, 1000]
# (the name says "log", but the transform applied is a square root).
student_count_log = [
    math.sqrt(min(max(int(course.STUDENTS_ENROLLED), 0), 1000))
    for course in data
]

# Assemble the plotting frame: t-SNE coordinates alongside the per-course metadata
df = pd.DataFrame(
    {
        'Dimension 1': X_tsne[:, 0],
        'Dimension 2': X_tsne[:, 1],
        'Faculty': faculties,
        'Codes': codes,
        'Names': names,
        'Student Count': student_count,
        'size': student_count_log,
    }
)

# Plotly takes a plain {faculty name: color code} mapping
color_discrete_map = dict(custom_colors)

# Interactive scatter: colored by faculty, sized by (scaled) enrolment
fig = px.scatter(
    df,
    x='Dimension 1',
    y='Dimension 2',
    size='size',
    color='Faculty',
    color_discrete_map=color_discrete_map,
    hover_data=['Codes', 'Names', "Student Count"],
    height=1000,
    title="t-SNE Visualization of Embeddings by Faculty and Student Count (Interactive)",
)

fig.show()





In [19]:
# Export the t-SNE data to JSON for D3.js visualization
import json

# Create a list of dictionaries with the necessary data for D3.js, one record per course.
# float() converts the NumPy coordinate values into plain Python floats, which json can encode.
d3_data = [
    {
        'x': float(point[0]),
        'y': float(point[1]),
        'faculty': faculty,
        'code': code,
        'name': name,
        'studentCount': int(count),
        'color': custom_colors.get(faculty, '#000000')  # Use the faculty color or default to black
    }
    for point, faculty, code, name, count in zip(X_tsne, faculties, codes, names, student_count)
]

# Save to a JSON file. The target directory is created first so the export also works
# on a fresh checkout where data/generated does not exist yet.
output_path = 'data/generated/tsne_visualization_data.json'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters (e.g. Czech course names) readable in the file
    json.dump(d3_data, f, ensure_ascii=False, indent=2)

print(f"Exported {len(d3_data)} data points to '{output_path}'")

# Example of the first few records for reference
print("\nSample of exported data:")
print(json.dumps(d3_data[:3], ensure_ascii=False, indent=2))

Exported 21106 data points to 'data/generated/tsne_visualization_data.json'

Sample of exported data:
[
  {
    "x": 75.79875183105469,
    "y": -58.92048263549805,
    "faculty": "CST",
    "code": "A_SPT",
    "name": " Angličtina pro soudní překladatele a tlumočníky ",
    "studentCount": 8,
    "color": "#0031e7"
  },
  {
    "x": 83.08512878417969,
    "y": 25.172754287719727,
    "faculty": "CST",
    "code": "AUT_TM1",
    "name": " Základy plánování a organizace času pro studenty se specifickými nároky ",
    "studentCount": 26,
    "color": "#0031e7"
  },
  {
    "x": 28.713438034057617,
    "y": -35.499366760253906,
    "faculty": "CST",
    "code": "BELONG",
    "name": " Professional Writing and Communication with Employers ",
    "studentCount": 6,
    "color": "#0031e7"
  }
]
