In [1]:
# 2. Imports
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer


In [3]:
# Read the book titles to classify.
books_df = pd.read_csv('../data/test.csv')

# The categories file is read with header=None, so the column is named explicitly here.
categories_df = pd.read_csv(
    '../data/categories.csv',
    header=None,
    names=['category'],
)

print(f"Loaded {len(books_df)} books and {len(categories_df)} categories")

# Preview the first rows of both frames as the cell's output.
(books_df.head(), categories_df.head())


Loaded 199 books and 582 categories


(                                               Title
 0  What It Takes: Lessons in the Pursuit of Excel...
 1  The Practice: Shipping Creative Work by Seth G...
 2  The Ideal Team Player: How to Recognize and Cu...
 3  4 Essential Keys to Effective Communication in...
 4  The Undocumented Americans by Karla Cornejo Vi...,
               category
 0  Books on Psychology
 1  Books on Leadership
 2   Books on Investing
 3       Books on Sales
 4     Books on Finance)

In [4]:
# Flatten both text columns into plain Python lists of strings;
# every value is cast to str before being handed to the encoder.
titles = (
    books_df['Title']
    .astype(str)
    .to_list()
)
cats_all = (
    categories_df['category']
    .astype(str)
    .to_list()
)


KeyError: 'anchor'

In [5]:
# Identifier of the sentence-embedding checkpoint used for both titles and categories.
model_name = "yuriivoievidka/microsoft_mpnet-base-librarian"

# Instantiate the encoder from that identifier.
model = SentenceTransformer(model_name_or_path=model_name)


In [None]:
# Embed the titles first, then the categories, with identical encoder settings.
# Each call returns a NumPy array (convert_to_numpy=True) and shows a progress bar.
title_emb, cat_emb = (
    model.encode(texts, convert_to_numpy=True, show_progress_bar=True)
    for texts in (titles, cats_all)
)

In [None]:
# L2-normalise every embedding row so that dot products between rows equal
# cosine similarities.
#
# The norms are floored at a tiny epsilon: an all-zero embedding would
# otherwise be divided by 0, producing a NaN row (plus a RuntimeWarning) that
# poisons every similarity computed from it. With the floor, such a row simply
# stays all-zero; rows with a normal-sized norm are unaffected.
title_norms = np.linalg.norm(title_emb, axis=1, keepdims=True)
cat_norms = np.linalg.norm(cat_emb, axis=1, keepdims=True)

title_emb = title_emb / np.maximum(title_norms, 1e-12)
cat_emb = cat_emb / np.maximum(cat_norms, 1e-12)

In [None]:
# Pairwise similarity matrix: one row per title, one column per category.
sims = title_emb @ cat_emb.T

In [None]:
# Number of best-matching categories to keep for each title.
top_k = 15

# Never request more categories than actually exist; indexing a fixed
# range(top_k) into a shorter ranked list would raise IndexError.
n_ranked = min(top_k, len(cats_all))

results = []
for title, title_sims in zip(titles, sims):
    # Category indices ordered from most to least similar, cut to the best n_ranked.
    best_idx = np.argsort(title_sims)[::-1][:n_ranked]

    # One output row per title: the title itself plus cat_1 .. cat_N columns,
    # where cat_1 is the most similar category.
    row = {'anchor': title}
    row.update(
        {f'cat_{rank}': cats_all[j] for rank, j in enumerate(best_idx, start=1)}
    )
    results.append(row)

# Build the frame once from the list of row dicts.
output_df = pd.DataFrame(results)


In [None]:
# Persist the ranked categories (without the index column), then preview them.
output_df.to_csv('books_with_top15_categories.csv', index=False)
print("Saved books_with_top15_categories.csv")

# The preview has to be the cell's final expression: a bare expression placed
# earlier in a cell is evaluated and thrown away, so nothing would be shown.
output_df.head()
