In [1]:
import pandas as pd

In [2]:
# Read one shard of the Wikipedia science dump and preview its first rows
# (columns shown in the output: text, url, title).
file_path = "../data/wikipedia_pages2/50000_to_75000.parquet"
df = pd.read_parquet(file_path)

df.head(n=5)

Unnamed: 0,text,url,title
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault
1,Tripodal ligands are tri- and tetradentate lig...,https://en.wikipedia.org/wiki/Tripodal%20ligand,Tripodal ligand
2,WASP-14b is an extrasolar planet discovered in...,https://en.wikipedia.org/wiki/WASP-14b,WASP-14b
3,Long-acting reversible contraceptives (LARC) a...,https://en.wikipedia.org/wiki/Long-acting%20re...,Long-acting reversible contraception
4,"AIDS (""acquired immune deficiency syndrome"") i...",https://en.wikipedia.org/wiki/HIV%20integration,HIV integration


# Note

The parquet files in `../data/wikipedia_pages2/` were downloaded from the Kaggle dataset at https://www.kaggle.com/datasets/nbroad/wiki-20220301-en-sci

In [3]:
from sentence_transformers import SentenceTransformer

# Sanity check: encode the same pair of identical sentences twice and compare.
# With normalized embeddings the dot product is the cosine similarity, so every
# entry of the similarity matrix should be ~1.
sentences = ["haha"] * 2

model_name = "BAAI/bge-large-en"
model = SentenceTransformer(model_name)

embeddings_1 = model.encode(sentences, normalize_embeddings=True)
embeddings_2 = model.encode(sentences, normalize_embeddings=True)

similarity = embeddings_1 @ embeddings_2.T
print(similarity, embeddings_1)


[[1.        1.       ]
 [0.9999999 0.9999999]] [[-0.00087222 -0.02147177 -0.01763643 ...  0.01505643  0.00848369
   0.00374709]
 [-0.00087222 -0.02147177 -0.01763643 ...  0.01505643  0.00848369
   0.00374709]]


In [4]:
# Time a single encode call on the short `sentences` list, with normalized output.
%timeit embeddings_1 = model.encode(sentences, normalize_embeddings=True)

6.13 ms ± 508 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
# Single-document check: embed the text of the first row and inspect the
# shape of the resulting vector.

text = df['text'].iloc[0]
embeddings = model.encode(text, normalize_embeddings=True)
embeddings.shape

(1024,)

In [6]:
# Small-batch check: embed the first 16 texts in a single encode call.
model.encode(df['text'].iloc[:16].values, normalize_embeddings=True)

array([[-0.01012542,  0.04541331, -0.00550739, ..., -0.0371272 ,
        -0.04358213, -0.00750957],
       [-0.00839777, -0.0079006 ,  0.02812083, ...,  0.00433183,
        -0.04053844, -0.03013914],
       [ 0.02113054,  0.01953551,  0.02801393, ..., -0.00511671,
        -0.01368569, -0.00981659],
       ...,
       [-0.00575373,  0.01555173, -0.0076666 , ..., -0.00153228,
        -0.05121766, -0.03535201],
       [-0.01893044, -0.00404281,  0.00475774, ..., -0.0286705 ,
        -0.0268669 , -0.00825804],
       [-0.02951775, -0.03077783, -0.01642974, ...,  0.00054414,
         0.00587249, -0.04442121]], dtype=float32)

In [7]:

import glob

# Load all files in "../data/wikipedia_pages2/"
parquet_files = glob.glob("../data/wikipedia_pages2/*.parquet")

# Load all parquets into a single dataframe
df = pd.concat([pd.read_parquet(f) for f in parquet_files])

print(f"N rows: {len(df)}")

%time df['embd'] = df['text'].apply(lambda x: model.encode(x, normalize_embeddings=True))



N rows: 131049
CPU times: user 31min 12s, sys: 3.15 s, total: 31min 15s
Wall time: 31min 11s


In [8]:
# Preview the first rows to confirm the new 'embd' column is present.
df.head(n=5)

Unnamed: 0,text,url,title,embd
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault,"[-0.010125404, 0.045413326, -0.0055074026, 0.0..."
1,Tripodal ligands are tri- and tetradentate lig...,https://en.wikipedia.org/wiki/Tripodal%20ligand,Tripodal ligand,"[-0.008397752, -0.007900611, 0.028120818, 0.02..."
2,WASP-14b is an extrasolar planet discovered in...,https://en.wikipedia.org/wiki/WASP-14b,WASP-14b,"[0.021130525, 0.019535448, 0.028013904, -0.006..."
3,Long-acting reversible contraceptives (LARC) a...,https://en.wikipedia.org/wiki/Long-acting%20re...,Long-acting reversible contraception,"[-0.014279559, -0.005928753, -0.02585963, 0.01..."
4,"AIDS (""acquired immune deficiency syndrome"") i...",https://en.wikipedia.org/wiki/HIV%20integration,HIV integration,"[-0.02654824, -0.008117266, -0.0044153593, 0.0..."


In [9]:
# save df to "../data/wikipedia_pages2_w_embd"
import os

# to_parquet does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs("../data/wikipedia_pages2_w_embd", exist_ok=True)
df.to_parquet("../data/wikipedia_pages2_w_embd/wiki_sci_embd.parquet")

In [10]:
# Read the saved file back from "../data/wikipedia_pages2_w_embd/wiki_sci_embd.parquet"
# into wiki_df and preview it to confirm the round trip.
wiki_df = pd.read_parquet("../data/wikipedia_pages2_w_embd/wiki_sci_embd.parquet")

wiki_df.head(n=5)

Unnamed: 0,text,url,title,embd
0,The Ulakhan Fault is a left-lateral moving tra...,https://en.wikipedia.org/wiki/Ulakhan%20Fault,Ulakhan Fault,"[-0.010125404, 0.045413326, -0.0055074026, 0.0..."
1,Tripodal ligands are tri- and tetradentate lig...,https://en.wikipedia.org/wiki/Tripodal%20ligand,Tripodal ligand,"[-0.008397752, -0.007900611, 0.028120818, 0.02..."
2,WASP-14b is an extrasolar planet discovered in...,https://en.wikipedia.org/wiki/WASP-14b,WASP-14b,"[0.021130525, 0.019535448, 0.028013904, -0.006..."
3,Long-acting reversible contraceptives (LARC) a...,https://en.wikipedia.org/wiki/Long-acting%20re...,Long-acting reversible contraception,"[-0.014279559, -0.005928753, -0.02585963, 0.01..."
4,"AIDS (""acquired immune deficiency syndrome"") i...",https://en.wikipedia.org/wiki/HIV%20integration,HIV integration,"[-0.02654824, -0.008117266, -0.0044153593, 0.0..."
