In [48]:
import json
from pathlib import Path
import numpy as np
import pyarrow as pa

In [49]:
# Load the PQ codebooks from JSON as a float32 array.
# The file is read as raw bytes so json.loads applies JSON's own encoding
# detection (UTF-8/16/32, BOM-aware) rather than the platform locale encoding
# that Path.read_text() would fall back to.
codewords = np.array(json.loads(Path("codewords.json").read_bytes()), dtype=np.float32)

In [50]:
# Read the Arrow IPC file and convert it to a NumPy array by way of pandas.
input_path = 'embeddings.arrow'

table = pa.ipc.open_file(input_path).read_all()

df = table.to_pandas()

# 2-D integer array (see the preview output of the next cell).
# Presumably one row of PQ codes per article -- TODO confirm against how
# embeddings.arrow was produced.
embeddings = df.to_numpy()

In [51]:
# Preview the loaded array; NumPy abbreviates large arrays with "...".
embeddings

array([[ 19, 109, 105, ...,  98,  87,  38],
       [  0,  42,  57, ...,  38,  35,   9],
       [ 12,   8,  54, ...,  22,  40,  38],
       ...,
       [ 28, 101,  45, ...,  56,  24, 107],
       [ 46,  84, 118, ...,  28, 104,  47],
       [ 64,  61,  20, ..., 101,  75,  47]])

In [52]:
# Read the titles Arrow IPC file and convert it to a NumPy array by way of pandas.
# NOTE(review): this reassigns `input_path`, which pointed at
# 'embeddings.arrow' in the earlier loading cell.
input_path = 'titles.arrow'

table1 = pa.ipc.open_file(input_path).read_all()

df1 = table1.to_pandas()

# Object array with one column of title strings (shape (N, 1) per the preview
# output), so selecting rows yields one-element arrays rather than bare strings.
# Later cells index `titles` with row positions computed from `embeddings`,
# which assumes both files share the same row order -- TODO confirm.
titles = df1.to_numpy()

In [53]:
# Preview the titles array; NumPy abbreviates large arrays with "...".
titles

array([['List of Supernatural and The Winchesters characters'],
       ['List of characters in mythology novels by Rick Riordan'],
       ['1943 Birthday Honours'],
       ...,
       ['List of fictional alien species: W'],
       ['List of fictional alien species: X'],
       ['List of fictional alien species: Y']], dtype=object)

In [54]:
from sentence_transformers import SentenceTransformer
# Load the all-MiniLM-L6-v2 sentence-embedding model; later cells call its
# encode() method to turn query text into vectors.
minilml6v2 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")



In [55]:
# Encode the search term into a query vector with the MiniLM model.
illuminati_query = minilml6v2.encode("illuminati")

In [56]:
# Check the query's dimensionality (the recorded output is (384,)).
illuminati_query.shape

(384,)

In [57]:
import nanopq

In [58]:
# Unpack the three axes of the codebook array and report them.
cwM, cwKs, cwDs = codewords.shape
print(
    f"Number of subspaces (cwM): {cwM}",
    f"Number of codewords per subspace (cwKs): {cwKs}",
    f"Dimensionality of each subspace (cwDs): {cwDs}",
    sep="\n",
)

Number of subspaces (cwM): 48
Number of codewords per subspace (cwKs): 128
Dimensionality of each subspace (cwDs): 8


In [59]:
# Build the quantizer from the dimensions read off the loaded codebooks
# (cwM, cwKs) instead of repeating 48 / 128 as literals, so the quantizer
# cannot silently drift out of sync with codewords.json.
pq48x7 = nanopq.PQ(M=cwM, Ks=cwKs)
# No fit() call is made in this section of the notebook, so the subspace
# dimensionality is set by hand from the codebook shape.
# NOTE(review): nanopq normally sets Ds during fit() -- confirm.
pq48x7.Ds = cwDs

M: 48, Ks: 128, metric : <class 'numpy.uint8'>, code_dtype: l2


In [60]:
# Use the codebooks loaded from codewords.json as the quantizer's codewords
# (no fit() call appears in this section of the notebook).
pq48x7.codewords = codewords

In [61]:
# Build the distance table for the query, then score every row of `embeddings`.
illuminati_dtable = pq48x7.dtable(illuminati_query)
illuminati_dists = illuminati_dtable.adist(embeddings)

# Row positions of the ten smallest distances, nearest first.
illuminati_top10 = np.argsort(illuminati_dists)[:10]
print(titles[illuminati_top10])

[['Illuminati (disambiguation)']
 ['Illuminati in popular culture']
 ['Illuminati (game)']
 ['Shadow government (conspiracy theory)']
 ['GURPS Illuminati']
 ['Illuminatus of Arce']
 ['The New World Order (Robertson book)']
 ['New World Order (conspiracy theory)']
 ['Illuminati (comics)']
 ['Illuminati']]


In [62]:
def recommend_top10(article_name, k=10):
    """Return the titles and distances of the articles nearest to a text query.

    The query text is encoded with the notebook-level ``minilml6v2`` model and
    scored against every row of the notebook-level ``embeddings`` array through
    the ``pq48x7`` quantizer's distance table. Smaller distances rank first.

    Args:
        article_name: Text to search for; passed unchanged to
            ``minilml6v2.encode``.
        k: Number of results to return. Defaults to 10, which reproduces the
            original fixed behaviour. If ``k`` exceeds the number of rows, all
            rows are returned.

    Returns:
        A ``(titles, distances)`` tuple: the selected rows of ``titles`` and
        the corresponding entries of the distance vector, both ordered from
        nearest to farthest.

    Raises:
        ValueError: If ``k`` is less than 1 (a zero or negative slice bound
            would otherwise silently return the wrong number of results).
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")

    query = minilml6v2.encode(article_name)
    query_dists = pq48x7.dtable(query).adist(embeddings)

    # Sort all distances in ascending order and keep the k smallest.
    nearest = np.argsort(query_dists)[:k]
    return titles[nearest], query_dists[nearest]


# Example query: fetch the ten nearest articles and print them as a ranked list.
get_rec = "Was hitler misunderstood"
top10_articles, top10_distances = recommend_top10(get_rec)

print(f"Most Similar Articles to {get_rec}")
ranked_results = zip(top10_articles, top10_distances)
for rank, (article, distance) in enumerate(ranked_results, start=1):
    print(f"{rank}. {article} (Distance: {distance:.4f})")

Most Similar Articles to Was hitler misunderstood
1. ['Young Hitler'] (Distance: 0.6662)
2. ['Nazi analogies'] (Distance: 0.6787)
3. ["Hitler's War"] (Distance: 0.6800)
4. ['Political views of Adolf Hitler'] (Distance: 0.6952)
5. ["Adolf Hitler's cult of personality"] (Distance: 0.6963)
6. ['Sexuality of Adolf Hitler'] (Distance: 0.6990)
7. ["Hitler's prophecy"] (Distance: 0.7047)
8. ["La part de l'autre"] (Distance: 0.7050)
9. ['30 January 1939 Reichstag speech'] (Distance: 0.7121)
10. ['Adolf Hitler in popular culture'] (Distance: 0.7172)
