In [1]:
import pyarrow as pa
import pandas as pd
import numpy as np

In [2]:
import json
from pathlib import Path

In [3]:
def _read_arrow_stream(path):
    """Read an Arrow IPC stream file and return its contents as one DataFrame.

    Args:
        path: Location of the Arrow stream file to open.

    Returns:
        A pandas DataFrame holding every record batch in the stream.
    """
    with pa.input_stream(path) as source:
        # read_all() collects every batch into a single Table, so there is no
        # need to wrap each batch in its own Table and concatenate afterwards
        # (which also failed on a stream containing zero batches).
        return pa.ipc.open_stream(source).read_all().to_pandas()


embedding_dfs = []
title_dfs = []

# NOTE(review): both paths are placeholders and neither uses `i`, so as written
# the same file would be read on every iteration -- substitute the real
# per-file paths before running.
for i in range(33):
    embedding_filename = "insertpathtoarrowfiles"
    title_filename = "insertpathtoarrowfiles"

    embedding_dfs.append(_read_arrow_stream(embedding_filename))
    title_dfs.append(_read_arrow_stream(title_filename))


In [4]:
# Stack the per-file frames row-wise. The original row labels are kept here
# (ignore_index=False), so labels restart for every file until the index is reset.
embedding_df = pd.concat(objs=embedding_dfs, axis=0, ignore_index=False)
title_df = pd.concat(objs=title_dfs, axis=0, ignore_index=False)

In [5]:
# Preview the concatenated codes; row labels still come from the individual files.
embedding_df

Unnamed: 0,embedding
0,19
1,109
2,105
3,38
4,61
...,...
9299467,30
9299468,90
9299469,101
9299470,75


In [6]:
# Preview the concatenated titles; row labels still come from the individual files.
title_df

Unnamed: 0,title
0,List of Supernatural and The Winchesters chara...
1,List of characters in mythology novels by Rick...
2,1943 Birthday Honours
3,List of people from Illinois
4,List of Pokémon anime characters
...,...
193734,List of fictional alien species: U
193735,List of fictional alien species: V
193736,List of fictional alien species: W
193737,List of fictional alien species: X


In [7]:
# Replace the repeated per-file row labels with one continuous 0..N-1 range.
# Done in place on each frame rather than by rebinding to a new object.
for frame in (embedding_df, title_df):
    frame.reset_index(drop=True, inplace=True)

In [8]:
# Preview the codes again after the index reset (labels are now continuous).
embedding_df

Unnamed: 0,embedding
0,19
1,109
2,105
3,38
4,61
...,...
316499467,30
316499468,90
316499469,101
316499470,75


In [9]:
# Preview the titles again after the index reset (labels are now continuous).
title_df

Unnamed: 0,title
0,List of Supernatural and The Winchesters chara...
1,List of characters in mythology novels by Rick...
2,1943 Birthday Honours
3,List of people from Illinois
4,List of Pokémon anime characters
...,...
6593734,List of fictional alien species: U
6593735,List of fictional alien species: V
6593736,List of fictional alien species: W
6593737,List of fictional alien species: X


In [10]:
# Work on a copy: a plain assignment would only alias title_df, so adding the
# 'Embeddings' column to merged_df would silently modify title_df as well.
merged_df = title_df.copy()

In [None]:
# Every title owns a run of 48 consecutive rows of embedding_df['embedding']:
# title i uses rows [i * 48, (i + 1) * 48).
CODES_PER_TITLE = 48

flat_codes = embedding_df['embedding'].to_numpy()
codes_needed = len(merged_df) * CODES_PER_TITLE
if len(flat_codes) < codes_needed:
    # The row-by-row version silently produced short or empty lists here.
    raise ValueError(
        f"expected at least {codes_needed} codes for {len(merged_df)} titles, "
        f"found {len(flat_codes)}"
    )

# One reshape replaces the per-row .iloc/.at loop; any surplus trailing codes
# are ignored, exactly as the loop ignored them.
code_rows = flat_codes[:codes_needed].reshape(len(merged_df), CODES_PER_TITLE)

# Store each row as a plain Python list, matching what the loop wrote per cell.
# Building the Series on merged_df.index keeps the assignment positional.
merged_df['Embeddings'] = pd.Series(code_rows.tolist(), index=merged_df.index, dtype=object)

In [12]:
# Preview the titles together with their per-title code lists.
merged_df

Unnamed: 0,title,Embeddings
0,List of Supernatural and The Winchesters chara...,"[19, 109, 105, 38, 61, 9, 103, 74, 19, 35, 33,..."
1,List of characters in mythology novels by Rick...,"[0, 42, 57, 38, 65, 70, 103, 80, 5, 35, 58, 50..."
2,1943 Birthday Honours,"[12, 8, 54, 75, 5, 58, 38, 58, 52, 123, 93, 11..."
3,List of people from Illinois,"[117, 124, 65, 34, 51, 116, 1, 56, 109, 63, 95..."
4,List of Pokémon anime characters,"[56, 5, 14, 75, 63, 119, 46, 69, 55, 27, 14, 1..."
...,...,...
6593734,List of fictional alien species: U,"[125, 84, 29, 93, 10, 102, 24, 16, 111, 38, 62..."
6593735,List of fictional alien species: V,"[101, 58, 86, 103, 60, 17, 66, 20, 104, 121, 6..."
6593736,List of fictional alien species: W,"[28, 101, 45, 27, 10, 85, 29, 56, 125, 38, 123..."
6593737,List of fictional alien species: X,"[46, 84, 118, 54, 24, 2, 78, 66, 57, 35, 84, 7..."


In [127]:
# Stack the per-title code lists into one 2-D array (one row per title).
# NOTE(review): the integer dtype is inferred from the Python ints; a narrower
# dtype may be enough for these codes -- confirm the value range before changing.
embeddings = np.asarray(merged_df['Embeddings'].to_list())

In [14]:
# Preview the code matrix built from merged_df['Embeddings'].
embeddings

array([[ 19, 109, 105, ...,  98,  87,  38],
       [  0,  42,  57, ...,  38,  35,   9],
       [ 12,   8,  54, ...,  22,  40,  38],
       ...,
       [ 28, 101,  45, ...,  56,  24, 107],
       [ 46,  84, 118, ...,  28, 104,  47],
       [ 64,  61,  20, ..., 101,  75,  47]])

In [None]:
# Persist the code matrix as an Arrow IPC file so later sessions can reload it
# without rebuilding it from the source files.
output_path = 'embeddings.arrow'
table = pa.Table.from_pandas(pd.DataFrame(data=embeddings))

with pa.OSFile(output_path, 'wb') as sink, pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)


In [128]:
# Title strings as a NumPy array so they can be indexed with arrays of row positions.
titles = np.asarray(merged_df['title'].to_list())

In [16]:
# Preview the title array.
titles

array(['List of Supernatural and The Winchesters characters',
       'List of characters in mythology novels by Rick Riordan',
       '1943 Birthday Honours', ..., 'List of fictional alien species: W',
       'List of fictional alien species: X',
       'List of fictional alien species: Y'], dtype='<U251')

In [None]:
# Persist the title array as an Arrow IPC file so later sessions can reload it
# without rebuilding it from the source files.
output_path = 'titles.arrow'
table = pa.Table.from_pandas(pd.DataFrame(data=titles))

with pa.OSFile(output_path, 'wb') as sink, pa.RecordBatchFileWriter(sink, table.schema) as writer:
    writer.write_table(table)

In [17]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model used below to turn query text into a vector via encode().
minilml6v2 = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

  from tqdm.autonotebook import tqdm, trange


In [64]:
# Embed a sample query string with the sentence-embedding model.
illuminati_query = minilml6v2.encode("illuminati")

In [92]:
# Check the dimensionality of the query embedding.
illuminati_query.shape

(384,)

In [19]:
import nanopq

In [20]:
# Load the product-quantization codewords from JSON as a float32 array.
with Path("codewords.json").open() as codewords_file:
    codewords = np.array(json.load(codewords_file), dtype=np.float32)

In [21]:
# The codeword array is three-dimensional: (subspaces, codewords per subspace,
# dimensions per subspace). Unpack it and report each axis.
cwM, cwKs, cwDs = codewords.shape
print(
    f"Number of subspaces (cwM): {cwM}",
    f"Number of codewords per subspace (cwKs): {cwKs}",
    f"Dimensionality of each subspace (cwDs): {cwDs}",
    sep="\n",
)

Number of subspaces (cwM): 48
Number of codewords per subspace (cwKs): 128
Dimensionality of each subspace (cwDs): 8


In [22]:
# Size the quantizer from the loaded codebook (cwM subspaces, cwKs codewords per
# subspace) instead of repeating the literals 48 and 128, so the two cannot
# drift apart if codewords.json changes.
pq48x7 = nanopq.PQ(M=cwM, Ks=cwKs)
# Ds is set by hand here to the per-subspace dimensionality of the loaded codewords.
pq48x7.Ds = cwDs

M: 48, Ks: 128, metric : <class 'numpy.uint8'>, code_dtype: l2


In [23]:
# Attach the codewords loaded from codewords.json to the quantizer.
pq48x7.codewords = codewords

In [65]:
# Distance from the "illuminati" query to every encoded title, then the ten
# titles with the smallest distances.
illuminati_table = pq48x7.dtable(illuminati_query)
illuminati_dists = illuminati_table.adist(embeddings)
ten_nearest = np.argsort(illuminati_dists)[:10]
print(titles[ten_nearest])

['Illuminati (disambiguation)' 'Illuminati in popular culture'
 'Illuminati (game)' 'Shadow government (conspiracy theory)'
 'GURPS Illuminati' 'Illuminatus of Arce'
 'The New World Order (Robertson book)'
 'New World Order (conspiracy theory)' 'Illuminati (comics)' 'Illuminati']


In [70]:
def recommend_top10(article_name):
    """Return the ten titles closest to ``article_name``.

    The text is embedded with ``minilml6v2``, a distance table is built from it
    with ``pq48x7``, and that table is evaluated against every row of
    ``embeddings``. The ten smallest distances are selected, nearest first.

    Args:
        article_name: Text to embed and use as the query.

    Returns:
        A ``(titles, distances)`` pair, both indexed by the same ten positions
        and ordered from smallest to largest distance.
    """
    distance_table = pq48x7.dtable(minilml6v2.encode(article_name))
    all_distances = distance_table.adist(embeddings)

    # Full ascending sort; the first ten positions are the nearest entries.
    nearest = np.argsort(all_distances)[:10]
    return titles[nearest], all_distances[nearest]


# Example: list the ten nearest articles for a sample query, numbered from 1.
get_rec = "Pink Floyd"
top10_articles, top10_distances = recommend_top10(get_rec)

print(f"Most Similar Articles to {get_rec}")
ranked_results = zip(top10_articles, top10_distances)
for i, (article, distance) in enumerate(ranked_results, start=1):
    print(f"{i}. {article} (Distance: {distance:.4f})")

Most Similar Articles to Pink Floyd
1. Zabriskie Point (album) (Distance: 0.6241)
2. Obscured by Clouds (Distance: 0.6269)
3. Pink Floyd (Distance: 0.6321)
4. The Endless River (Distance: 0.6544)
5. Julia Dream (Distance: 0.6570)
6. A Collection of Great Dance Songs (Distance: 0.6615)
7. Grantchester Meadows (song) (Distance: 0.6624)
8. Yet Another Movie (Distance: 0.6631)
9. A Momentary Lapse of Reason (Distance: 0.6649)
10. On the Turning Away (Distance: 0.6665)


In [76]:
def notrecommend_top10(article_name):
    """Return the ten titles farthest from ``article_name``.

    The text is embedded with ``minilml6v2``, a distance table is built from it
    with ``pq48x7``, and that table is evaluated against every row of
    ``embeddings``. The ten largest distances are selected, farthest first.

    Args:
        article_name: Text to embed and use as the query.

    Returns:
        A ``(titles, distances)`` pair, both indexed by the same ten positions
        and ordered from largest to smallest distance.
    """
    distance_table = pq48x7.dtable(minilml6v2.encode(article_name))
    all_distances = distance_table.adist(embeddings)

    # Sort ascending, flip to descending, then keep the first ten positions.
    farthest = np.argsort(all_distances)[::-1][:10]
    return titles[farthest], all_distances[farthest]


# Example: list the ten articles farthest from a sample query, most distant first.
get_rec = "Mission Impossible"
top10_articles, top10_distances = notrecommend_top10(get_rec)

# Heading spelling corrected: "Dissimilar".
print(f"Most Dissimilar Articles to {get_rec}")
for i, (article, distance) in enumerate(zip(top10_articles, top10_distances), 1):
    print(f"{i}. {article} (Distance: {distance:.4f})")

Most Disimilar Articles to Mission Impossible
1. Gmina Krajenka (Distance: 1.9882)
2. Mrzygłody Lubyckie, Podkarpackie Voivodeship (Distance: 1.9602)
3. Zaryte (Distance: 1.9484)
4. Gmina Zabrodzie (Distance: 1.9471)
5. Krzemów (Distance: 1.9459)
6. Zakrzewo, Płock County (Distance: 1.9449)
7. Gmina Horyniec-Zdrój (Distance: 1.9442)
8. Gmina Krościenko Wyżne (Distance: 1.9406)
9. Szreńsk (Distance: 1.9398)
10. Bieżyń (Distance: 1.9393)


In [116]:
def aggrecommend_top10(article_list):
    """Recommend up to ten titles close to the average of several article names.

    Each name is lower-cased and embedded with ``minilml6v2``; the embeddings are
    averaged into a single query, which is compared against every row of
    ``embeddings`` through ``pq48x7``. Titles that match one of the input names
    (case-insensitively) are skipped.

    Candidates are walked in order of increasing distance until ten survivors
    are collected. A fixed window of the 15 nearest candidates could return
    fewer than ten results whenever more than five of them were filtered out.

    Args:
        article_list: Non-empty sequence of article names.

    Returns:
        A ``(titles, distances)`` pair of lists, nearest first, each holding at
        most ten entries (fewer only if not enough other titles exist).

    Raises:
        ValueError: If ``article_list`` is empty, since no query can be formed.
    """
    if not article_list:
        raise ValueError("article_list must contain at least one article name")

    article_list_lower = [article.lower() for article in article_list]
    # Set membership keeps the per-candidate exclusion check constant-time.
    excluded = set(article_list_lower)

    # NOTE(review): names are embedded lower-cased here, unlike the single-article
    # helpers, which embed the text as given -- kept as-is to preserve results.
    encoded_articles = [minilml6v2.encode(article) for article in article_list_lower]
    query = np.mean(encoded_articles, axis=0)

    query_dists = pq48x7.dtable(query).adist(embeddings)

    filtered_top10_titles = []
    filtered_top10_distances = []
    for idx in np.argsort(query_dists):
        title = titles[idx]
        if title.lower() in excluded:
            continue
        filtered_top10_titles.append(title)
        filtered_top10_distances.append(query_dists[idx])
        if len(filtered_top10_titles) == 10:
            break

    return filtered_top10_titles, filtered_top10_distances

# Example: recommend articles for a set of common chemical compounds.
article_list = ["Water", "Carbon Dioxide", "Oxygen", "Hydrogen Peroxide", "Sodium Chloride", "Sulfuric Acid", "Acetic Acid", "Ammonia", "Ethanol", "Methane"]

top10_articles, top10_distances = aggrecommend_top10(article_list)

# Plain string: the heading has no placeholders, so the f-prefix was dropped.
print("Most Similar Articles to Given Articles")
for i, (article, distance) in enumerate(zip(top10_articles, top10_distances), 1):
    print(f"{i}. {article} (Distance: {distance:.4f})")


Most Similar Articles to Given Articles
1. Hydrogen (Distance: 0.3384)
2. Carbon monoxide (Distance: 0.3620)
3. Kipp's apparatus (Distance: 0.3667)
4. Formic acid (Distance: 0.3760)
5. Oxygen compounds (Distance: 0.3786)
6. Carbonated water (Distance: 0.3803)
7. Amateur chemistry (Distance: 0.3820)
8. Scripps Energy & Materials Center (Distance: 0.3834)
9. Sodium hypochlorite (Distance: 0.3841)
10. Urea nitrate (Distance: 0.3860)
