In [2]:
import pandas as pd
import numpy as np
from joblib import load
import tiktoken

from sentence_transformers import SentenceTransformer, util
from sentence_transformers.cross_encoder import CrossEncoder

## Execution

In [3]:
# ================== #
# Tokenize Text
# ================== #

# TODO: swap tiktoken for the tokenizer that matches the embedding model, or decide to keep tiktoken

def get_tokens(text_2_encode: str, tokenizer=None):
    """
    Encode a string into tokens.

    Args:
        text_2_encode: The text to tokenize.
        tokenizer: Object exposing ``encode(text=...)``.  When omitted, the
            tiktoken encoding for "text-davinci-003" is used.

    Returns:
        Whatever ``tokenizer.encode`` produces for the text.
    """
    encoder = (
        tokenizer
        if tokenizer is not None
        else tiktoken.encoding_for_model("text-davinci-003")
    )
    return encoder.encode(text=text_2_encode)


def get_num_tokens(text_2_encode: str, **kwargs):
    """
    Return how many tokens ``get_tokens`` produces for a string.

    Extra keyword arguments (e.g. ``tokenizer``) are forwarded to ``get_tokens``.
    """
    tokens = get_tokens(text_2_encode=text_2_encode, **kwargs)
    return len(tokens)


# ================== #
#  Get Embeddings
# ================== #

def get_embeddings(text=None, model=None):
    """
    Generate embeddings for a piece of text.

    Args:
        text: Input handed to ``model.encode``.
        model: Object exposing ``encode``.  When omitted, the model pickled at
            './model/SentBERTmodel.pkl' is loaded with joblib on every call, so
            pass a preloaded model when embedding repeatedly.

    Returns:
        The result of ``model.encode(text)``.
    """
    # Identity check: `== None` would dispatch to the model's own __eq__, which
    # may be overridden and return a wrong or non-boolean answer.
    if model is None:
        # NOTE(security): joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from a trusted source.
        model = load('./model/SentBERTmodel.pkl')

    return model.encode(text)


# ================== #
#  Calculate Vector Similarity
# ================== #

def vector_similarity(x: "list[float]", y: "list[float]") -> float:
    """
    Compute the dot product of two vectors.

    For unit-length vectors the dot product equals the cosine similarity, so no
    explicit normalisation is performed here.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)


# ================== #
#  Order Chunks by Similarity
# ================== #

def measure_embedding_similarity(
    query: str,
    embeddings,
    model=None
):
    """
    Score every pre-calculated document embedding against the query.

    The query is embedded with ``get_embeddings`` and compared with each
    document embedding via ``vector_similarity``.

    Args:
        query: The search text.
        embeddings: Iterable of document embedding vectors.
        model: Optional embedding model forwarded to ``get_embeddings``; pass a
            preloaded model to avoid reloading the default one on every call.

    Returns:
        A list of similarity scores, one per embedding, in the same order as
        ``embeddings``.  The scores are NOT sorted; ranking is left to the caller.
    """
    query_embedding = get_embeddings(query, model=model)

    return [vector_similarity(query_embedding, embedding) for embedding in embeddings]


# ================== #
#  Get Similar Texts
# ================== #

def get_similar_texts(df, k, column):
    """
    Return the k highest-scoring rows of a dataframe, best first.

    Rows tied with the k-th score are all kept, so more than k rows may be returned.
    """
    return (
        df.nlargest(k, columns=[column], keep='all')
          .sort_values(by=column, ascending=False)
    )


# ================== #
#  Run Cross Encoding to re-rank
# ================== #

def cross_encode(data, query, cross_encoder=None):
    """
    Re-rank candidate texts against the query with a cross-encoder.

    Args:
        data: DataFrame with a 'text' column holding the candidate passages.
        query: The search text paired with every passage.
        cross_encoder: Optional object exposing ``predict(pairs)``.  When omitted,
            ``CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')`` is created;
            pass a preloaded instance to avoid re-creating it on every call.

    Returns:
        A new DataFrame with a 'cross_encoder_score' column added, sorted by that
        score in descending order.  The input frame is left unmodified.
    """
    if cross_encoder is None:
        cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

    # One [query, passage] pair per candidate row.
    corpus = data['text'].tolist()
    sentence_combinations = [[query, corpus_sentence] for corpus_sentence in corpus]
    scores = cross_encoder.predict(sentence_combinations)

    # assign() builds a new frame, so a caller passing a slice of a larger
    # DataFrame is neither mutated nor hit by SettingWithCopyWarning.
    return (
        data.assign(cross_encoder_score=scores)
            .sort_values(by='cross_encoder_score', ascending=False)
    )

In [4]:
# ================== #
#  Run (score.py)
# ================== #

def run(
    question: str,
    run_cross_encoder="Yes",
    data_path=None,
    k=50,
    top_n=5,
) -> pd.DataFrame:
    """
    Retrieve the chapters most similar to a question, optionally re-ranked.

    Args:
        question: The search text.
        run_cross_encoder: Re-rank with the cross-encoder only when this is
            exactly the string "Yes".
        data_path: JSON file to load.  It must provide the columns 'book',
            'chapter', 'text' and 'embeddings'.  Defaults to the original
            hard-coded location for backward compatibility; pass an explicit
            path on any other machine.
        k: Number of rows retrieved by embedding similarity.
        top_n: Number of rows kept after cross-encoder re-ranking.

    Returns:
        DataFrame with columns 'book', 'chapter', 'text', 'token count' and
        'similarity score' (plus 'cross_encoder_score' when re-ranked), sorted
        best first.

    Raises:
        FileNotFoundError: If the JSON file does not exist.
    """
    if data_path is None:
        data_path = "C:\\Users\\t_zim\\Desktop\\Data\\Bible\\" + "KJV_chapter_search.json"

    df = pd.read_json(data_path)

    # Score every row against the question, then keep the k best.
    df['similarity score'] = measure_embedding_similarity(question, df['embeddings'])
    response = get_similar_texts(df, k, 'similarity score')

    # Tokenize only the retrieved rows; counting tokens for the whole corpus is
    # wasted work because every other row is discarded.
    response['token count'] = response['text'].apply(get_num_tokens)

    # Drop the embeddings column.  Copy so later stages work on an independent
    # frame rather than a slice.
    keep_columns = ['book', "chapter", 'text', 'token count', 'similarity score']
    response = response[keep_columns].copy()

    if run_cross_encoder == "Yes":
        print("Cross Encoding...")
        response = cross_encode(response, question)

        # Keep only the best rows after re-ranking.
        response = get_similar_texts(response, top_n, 'cross_encoder_score')

    return response

In [5]:
query = "what is the greatest commandment?"

# Retrieve candidate chapters and re-rank them with the cross-encoder.
# Call response.reset_index(drop=True) afterwards if a fresh 0..n-1 index is wanted.
response = run(query, run_cross_encoder="Yes")



FileNotFoundError: File C:\Users\t_zim\Desktop\Data\Bible\KJV_chapter_search.json does not exist

In [8]:
# Show the result frame assigned from run() in the preceding cell.
# NOTE(review): that cell's saved output is a FileNotFoundError and the execution
# counts jump from In [5] to In [8], so this output presumably comes from an
# earlier kernel state — re-run from a fresh kernel to confirm.
response

Unnamed: 0,book,chapter,text,token count,similarity score,cross_encoder_score
1196,catechism,8,What is the greatest commandment?,7,1.0,6.373341
289,Deuteronomy,8,All the commandments which I command thee this...,777,0.471799,-3.618346
113,1 Timothy,1,"Paul, an apostle of Jesus Christ by the comman...",618,0.455262,-4.011883
46,1 John,2,"My little children, these things write I unto ...",970,0.462779,-4.320076
323,Ecclesiastes,8,Who <i>is</i> as the wise <i>man</i>? and who ...,753,0.480661,-4.851909
