In [None]:
# Install the `datasets` package, which provides the `load_dataset` function
# imported in the next cell.
!pip install datasets


In [None]:
from datasets import load_dataset


In [6]:
# Download the precomputed syllabus embeddings and expose the 'train' split as
# a pandas DataFrame; the 'Course content' column holds the embedding for each row.
reference_embeddings = load_dataset("anordkvist/gu-course-syllabus-embeddings")
train_split = reference_embeddings['train']
df_embeddings = train_split.to_pandas()
course_content_embeddings = df_embeddings['Course content']

# Sanity check: print the size of the column, then preview its first entries.
print(course_content_embeddings.shape)
course_content_embeddings.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt

# Stack the embeddings held in `course_content_embeddings` into one numpy array.
# Assumes every embedding has the same length, so the result is 2-D
# (n_rows, embedding_dim) as scikit-learn expects.
embeddings_array = np.array(course_content_embeddings.tolist())

# scikit-learn 1.5 renamed TSNE's `n_iter` argument to `max_iter` and later
# removed the old name, so use whichever keyword the installed version accepts.
_tsne_iter_kwarg = 'max_iter' if 'max_iter' in TSNE().get_params() else 'n_iter'

# Initialize t-SNE. `random_state` is pinned so the projection is repeatable
# across kernel restarts; the remaining hyperparameters are unchanged.
tsne = TSNE(
    n_components=2,
    verbose=1,
    perplexity=40,
    random_state=42,
    **{_tsne_iter_kwarg: 300},
)

# Fit and transform with t-SNE
tsne_results = tsne.fit_transform(embeddings_array)

In [None]:
# Course codes, taken from the same DataFrame as the embeddings so the two stay row-aligned.
course_codes = df_embeddings['course_code']

# Department prefixes to highlight, mapped to the number of characters in each prefix.
# NOTE: matching is done on the prefix string itself, so the lengths are
# informational only; the mapping is kept so existing references keep working.
code_char_mapping = {
    'DIT': 3,
    'NEK': 3,
    'MM': 2,
    'P': 1
}

# One colour per prefix, in the same order as the keys of `code_char_mapping`.
colors = ['red', 'blue', 'green', 'purple']
if len(colors) != len(code_char_mapping):
    # zip() would otherwise silently drop the unmatched prefixes or colours.
    raise ValueError(
        f"Expected {len(code_char_mapping)} colors (one per department prefix), "
        f"got {len(colors)}"
    )
color_map = dict(zip(code_char_mapping.keys(), colors))


def determine_color(code):
    """Return the highlight colour for a course code.

    Parameters
    ----------
    code : str
        Course code whose leading characters identify the department.
        Non-string values (e.g. None or NaN for a missing code) are accepted.

    Returns
    -------
    str
        The colour registered in ``color_map`` for the matching department
        prefix, or ``'grey'`` when the code is missing or matches no prefix.
    """
    if not isinstance(code, str):
        # Missing codes have no prefix to match; give them the default colour.
        return 'grey'
    # Try longer prefixes first so that overlapping prefixes (e.g. 'P' and a
    # hypothetical 'PS') resolve to the most specific department.
    for prefix in sorted(color_map, key=len, reverse=True):
        if code.startswith(prefix):
            return color_map[prefix]
    return 'grey'  # Default color

# Colour each point according to the department prefix of its course code.
point_colors = course_codes.apply(determine_color)

# Draw the two t-SNE components on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(16,10))
scatter = ax.scatter(tsne_results[:, 0], tsne_results[:, 1], c=point_colors, alpha=0.5)

# Title and axis labels
ax.set_title('t-SNE visualization with multiple departments highlighted')
ax.set_xlabel('Component 1')
ax.set_ylabel('Component 2')

# Legend: empty scatter series serve as proxy handles, one per highlighted department.
for dept_prefix, dept_color in color_map.items():
    ax.scatter([], [], color=dept_color, label=dept_prefix)
ax.legend()

plt.show()
