In [1]:
from datasets import Dataset

bible_data = Dataset.from_csv("berean_study_bible.csv")
bible_data = bible_data.rename_columns({
    "Verse": "verse",
    "Berean Standard Bible": "text"
    })
bible_data = bible_data.filter(lambda x: isinstance(x["text"], str))

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
from transformers import AutoTokenizer, TFAutoModel

model_cpkt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_cpkt)
model = TFAutoModel.from_pretrained(model_cpkt, from_pt=True)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFMPNetModel: ['embeddings.position_ids']
- This IS expected if you are initializing TFMPNetModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFMPNetModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFMPNetModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMPNetModel for predictions without further training.


In [5]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="tf"
    )
    encoded_input = dict(encoded_input.items())
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [4]:
embedding = get_embeddings(bible_data["text"][0])
embedding.shape

TensorShape([1, 768])

In [5]:
embeddings_dataset = bible_data.map(
    lambda x: {"embeddings": get_embeddings(x["text"]).numpy()[0]}
)

Map: 100%|██████████| 31086/31086 [55:44<00:00,  9.29 examples/s] 


In [9]:
embeddings_dataset.save_to_disk("embedding_dataset")

Saving the dataset (1/1 shards): 100%|██████████| 31086/31086 [00:00<00:00, 128270.62 examples/s]


In [2]:
embeddings_dataset = Dataset.load_from_disk("embedding_dataset")

In [3]:
embeddings_dataset.add_faiss_index(column="embeddings")

100%|██████████| 32/32 [00:00<00:00, 159.92it/s]


Dataset({
    features: ['verse', 'text', 'embeddings'],
    num_rows: 31086
})

In [14]:
question = "Who should we love?"
question_embedding = get_embeddings([question]).numpy()
question_embedding.shape

(1, 768)

In [15]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embeddings", question_embedding, k=5
)

In [16]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=True, inplace=True)


In [17]:
for _, row in samples_df.iterrows():
    print(f"VERSE: {row.verse}")
    print(f"SCORE: {row.scores}")
    print(f"TEXT: {row.text}")
    print("=" * 50)
    print()

VERSE: 1 John 4:7
SCORE: 23.098581314086914
TEXT: Beloved, let us love one another, because love comes from God. Everyone who loves has been born of God and knows God.

VERSE: 1 John 4:11
SCORE: 24.75037384033203
TEXT: Beloved, if God so loved us, we also ought to love one another.

VERSE: John 15:17
SCORE: 25.112953186035156
TEXT: This is My command to you: Love one another.

VERSE: 1 John 4:19
SCORE: 26.2373046875
TEXT: We love because He first loved us.

VERSE: 1 Thessalonians 1:4
SCORE: 28.035762786865234
TEXT: Brothers who are beloved by God, we know that He has chosen you,

