# Semantic Search on a Corpus

In [1]:
# Import Packages
import json
import numpy as np
import pandas as pd
import tiktoken
from joblib import load

## Import Data

In [2]:
# Import Data
# Directory and file name of the JSON corpus to search.
filepath = "C:\\Users\\t_zim\\Desktop\\Data\\Bible\\"
filename = "KJV_chapter_search.json"

# Read the corpus into a DataFrame, then show how many rows were loaded.
df = pd.read_json(f"{filepath}{filename}")
len(df)

1198

In [3]:
# ================== #
# Tokenize Text
# ================== #

# TODO: replace tiktoken with the real tokenizer, or commit to using tiktoken.

def get_tokens(text_2_encode: str, tokenizer=None):
    """
    Encode a string into tokens.

    When no tokenizer is supplied, the tiktoken encoding registered for
    "text-davinci-003" is looked up and used.  A supplied tokenizer must
    accept ``encode(text=...)``; its return value is passed back unchanged.
    """
    encoder = (
        tokenizer
        if tokenizer is not None
        else tiktoken.encoding_for_model("text-davinci-003")
    )
    tokens = encoder.encode(text=text_2_encode)
    return tokens


def get_num_tokens(text_2_encode: str, **kwargs):
    """
    Return how many tokens ``get_tokens`` produces for a string.

    Any extra keyword arguments are forwarded to ``get_tokens`` untouched.
    """
    tokens = get_tokens(text_2_encode=text_2_encode, **kwargs)
    return len(tokens)


# ================== #
#  Get Embeddings
# ================== #

def get_embeddings(text=None, model=None):
    """
    Generate embeddings on a string of text.

    text: input handed unchanged to ``model.encode``.
    model: object exposing ``encode(text)``.  When omitted (None), the
        pickled model at './model/SentBERTmodel.pkl' is loaded with joblib.

    Return whatever ``model.encode(text)`` returns.
    """
    # Identity check on purpose: ``model == None`` would dispatch to the
    # model's own __eq__, which an arbitrary model object may override.
    if model is None:
        # NOTE(review): joblib.load unpickles the file, and unpickling can run
        # arbitrary code -- only load model files from a trusted source.
        # The file is also re-read on every call that omits ``model``; pass a
        # preloaded model when embedding many texts.
        model = load('./model/SentBERTmodel.pkl')

    return model.encode(text)


# ================== #
#  Calculate Vector Similarity
# ================== #

def vector_similarity(x: "list[float]", y: "list[float]") -> float:
    """
    Compute the dot product of two vectors.

    For embeddings normalized to length 1 the dot product equals the cosine
    similarity.  NOTE: nothing in this function normalizes or checks the
    inputs, so that equivalence holds only if the caller supplies unit vectors.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)


# ================== #
#  Order Chunks by Similarity
# ================== #

def measure_embedding_similarity(
    query: str,
    embeddings,
    model=None
):
    """
    Score a query against every pre-calculated document embedding.

    The query is embedded with ``get_embeddings`` and compared to each entry
    of ``embeddings`` with ``vector_similarity``.

    query: text to embed.
    embeddings: iterable of document embedding vectors.
    model: optional embedding model forwarded to ``get_embeddings``.  Leave
        as None to let ``get_embeddings`` fall back to its default model;
        pass an already-loaded model to avoid reloading it for every query.

    Return a list of similarity scores, one per embedding, in the same order
    as ``embeddings``.  The scores are NOT sorted; ranking is left to the
    caller.
    """
    query_embedding = get_embeddings(query, model=model)

    return [vector_similarity(query_embedding, embedding) for embedding in embeddings]


# ================== #
#  Get Similar Texts
# ================== #

def get_similar_texts(df, k):
    """
    Select the k highest-scoring rows of a dataframe.

    Rows are ranked on the 'similarity score' column and returned in
    descending score order.  Rows tied with the k-th score are all kept, so
    the result can hold more than k rows.
    """
    score_column = 'similarity score'
    top_rows = df.nlargest(n=k, columns=[score_column], keep='all')
    return top_rows.sort_values(by=score_column, ascending=False)

In [4]:
# ================== #
#  Run (score.py)
# ================== #

def run(question: str, k: int, embeddings, df) -> pd.DataFrame:
    """
    Score every row of ``df`` against a question and return the top matches.

    question: query text to compare against the stored embeddings.
    k: number of top-scoring rows requested (handed to ``get_similar_texts``).
    embeddings: pre-calculated embeddings; the resulting scores are assigned
        as a column of ``df``, so they are expected to line up row-for-row.
    df: DataFrame with at least 'book', 'chapter' and 'text' columns.

    Side effect: the 'similarity score' and 'token count' columns are written
    onto ``df`` in place (added, or overwritten if already present).

    Return a DataFrame holding the 'book', 'chapter', 'text', 'token count'
    and 'similarity score' columns of the selected rows.
    """

    # Retrieve Top K Most Similar Results
    df['similarity score'] = measure_embedding_similarity(question, embeddings)

    # Count number of tokens in each article
    df['token count'] = df['text'].apply(get_num_tokens)

    # Return Chunks With Highest Similarity (Text)
    response = get_similar_texts(df, k)

    # Keep only the display columns (this drops the embeddings column)
    keep_columns = ['book', "chapter", 'text', 'token count', 'similarity score']
    response = response[keep_columns]

    return response

## Ask Questions

In [5]:
# Query the corpus, requesting the top 3 matches.
response = run(
    question="who is my father?",
    k=3,
    embeddings=df['embeddings'],
    df=df,
)

# Display the result with the 'text' column styled to a width of 900px.
response.style.set_properties(subset=['text'], width='900px')

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,book,chapter,text,token count,similarity score
711,John,14,"Let not your heart be troubled: ye believe in God, believe also in me. In my Father's house are many mansions: if it were not so, I would have told you. I go to prepare a place for you. And if I go and prepare a place for you, I will come again, and receive you unto myself; that where I am, there ye may be also. And whither I go ye know, and the way ye know. Thomas saith unto him, Lord, we know not whither thou goest; and how can we know the way? Jesus saith unto him, I am the way, the truth, and the life: no man cometh unto the Father, but by me. If ye had known me, ye should have known my Father also: and from henceforth ye know him, and have seen him. Philip saith unto him, Lord, shew us the Father, and it sufficeth us. Jesus saith unto him, Have I been so long time with you, and yet hast thou not known me, Philip? he that hath seen me hath seen the Father; and how sayest thou then, Shew us the Father? Believest thou not that I am in the Father, and the Father in me? the words that I speak unto you I speak not of myself: but the Father that dwelleth in me, he doeth the works. Believe me that I am in the Father, and the Father in me: or else believe me for the very works' sake. Verily, verily, I say unto you, He that believeth on me, the works that I do shall he do also; and greater works than these shall he do; because I go unto my Father. And whatsoever ye shall ask in my name, that will I do, that the Father may be glorified in the Son. If ye shall ask any thing in my name, I will do it. If ye love me, keep my commandments. And I will pray the Father, and he shall give you another Comforter, that he may abide with you for ever; Even the Spirit of truth; whom the world cannot receive, because it seeth him not, neither knoweth him: but ye know him; for he dwelleth with you, and shall be in you. I will not leave you comfortless: I will come to you. Yet a little while, and the world seeth me no more; but ye see me: because I live, ye shall live also. 
At that day ye shall know that I am in my Father, and ye in me, and I in you. He that hath my commandments, and keepeth them, he it is that loveth me: and he that loveth me shall be loved of my Father, and I will love him, and will manifest myself to him. Judas saith unto him, not Iscariot, Lord, how is it that thou wilt manifest thyself unto us, and not unto the world? Jesus answered and said unto him, If a man love me, he will keep my words: and my Father will love him, and we will come unto him, and make our abode with him. He that loveth me not keepeth not my sayings: and the word which ye hear is not mine, but the Father's which sent me. These things have I spoken unto you, being yet present with you. But the Comforter, which is the Holy Ghost, whom the Father will send in my name, he shall teach you all things, and bring all things to your remembrance, whatsoever I have said unto you. Peace I leave with you, my peace I give unto you: not as the world giveth, give I unto you. Let not your heart be troubled, neither let it be afraid. Ye have heard how I said unto you, I go away, and come again unto you. If ye loved me, ye would rejoice, because I said, I go unto the Father: for my Father is greater than I. And now I have told you before it come to pass, that, when it is come to pass, ye might believe. Hereafter I will not talk much with you: for the prince of this world cometh, and hath nothing in me. But that the world may know that I love the Father; and as the Father gave me commandment, even so I do. Arise, let us go hence.",991,0.400252
1163,Song of Solomon,3,"By night on my bed I sought him whom my soul loveth: I sought him, but I found him not. I will rise now, and go about the city in the streets, and in the broad ways I will seek him whom my soul loveth: I sought him, but I found him not. The watchmen that go about the city found me: to whom I said, Saw ye him whom my soul loveth? It was but a little that I passed from them, but I found him whom my soul loveth: I held him, and would not let him go, until I had brought him into my mother's house, and into the chamber of her that conceived me. I charge you, O ye daughters of Jerusalem, by the roes, and by the hinds of the field, that ye stir not up, nor awake my love, till he please. Who is this that cometh out of the wilderness like pillars of smoke, perfumed with myrrh and frankincense, with all powders of the merchant? Behold his bed, which is Solomon's; threescore valiant men are about it, of the valiant of Israel. They all hold swords, being expert in war: every man hath his sword upon his thigh because of fear in the night. King Solomon made himself a chariot of the wood of Lebanon. He made the pillars thereof of silver, the bottom thereof of gold, the covering of it of purple, the midst thereof being paved with love, for the daughters of Jerusalem. Go forth, O ye daughters of Zion, and behold king Solomon with the crown wherewith his mother crowned him in the day of his espousals, and in the day of the gladness of his heart.",428,0.398384
238,Acts,10,"There was a certain man in Caesarea called Cornelius, a centurion of the band called the Italian band, A devout man, and one that feared God with all his house, which gave much alms to the people, and prayed to God alway. He saw in a vision evidently about the ninth hour of the day an angel of God coming in to him, and saying unto him, Cornelius. And when he looked on him, he was afraid, and said, What is it, Lord? And he said unto him, Thy prayers and thine alms are come up for a memorial before God. And now send men to Joppa, and call for one Simon, whose surname is Peter: He lodgeth with one Simon a tanner, whose house is by the sea side: he shall tell thee what thou oughtest to do. And when the angel which spake unto Cornelius was departed, he called two of his household servants, and a devout soldier of them that waited on him continually; And when he had declared all these things unto them, he sent them to Joppa. On the morrow, as they went on their journey, and drew nigh unto the city, Peter went up upon the housetop to pray about the sixth hour: And he became very hungry, and would have eaten: but while they made ready, he fell into a trance, And saw heaven opened, and a certain vessel descending unto him, as it had been a great sheet knit at the four corners, and let down to the earth: Wherein were all manner of fourfooted beasts of the earth, and wild beasts, and creeping things, and fowls of the air. And there came a voice to him, Rise, Peter; kill, and eat. But Peter said, Not so, Lord; for I have never eaten any thing that is common or unclean. And the voice spake unto him again the second time, What God hath cleansed, that call not thou common. This was done thrice: and the vessel was received up again into heaven. 
Now while Peter doubted in himself what this vision which he had seen should mean, behold, the men which were sent from Cornelius had made inquiry for Simon's house, and stood before the gate, And called, and asked whether Simon, which was surnamed Peter, were lodged there. While Peter thought on the vision, the Spirit said unto him, Behold, three men seek thee. Arise therefore, and get thee down, and go with them, doubting nothing: for I have sent them. Then Peter went down to the men which were sent unto him from Cornelius; and said, Behold, I am he whom ye seek: what is the cause wherefore ye are come? And they said, Cornelius the centurion, a just man, and one that feareth God, and of good report among all the nation of the Jews, was warned from God by an holy angel to send for thee into his house, and to hear words of thee. Then called he them in, and lodged them. And on the morrow Peter went away with them, and certain brethren from Joppa accompanied him. And the morrow after they entered into Caesarea. And Cornelius waited for them, and had called together his kinsmen and near friends. And as Peter was coming in, Cornelius met him, and fell down at his feet, and worshipped him. But Peter took him up, saying, Stand up; I myself also am a man. And as he talked with him, he went in, and found many that were come together. And he said unto them, Ye know how that it is an unlawful thing for a man that is a Jew to keep company, or come unto one of another nation; but God hath shewed me that I should not call any man common or unclean. Therefore came I unto you without gainsaying, as soon as I was sent for: I ask therefore for what intent ye have sent for me? And Cornelius said, Four days ago I was fasting until this hour; and at the ninth hour I prayed in my house, and, behold, a man stood before me in bright clothing, And said, Cornelius, thy prayer is heard, and thine alms are had in remembrance in the sight of God. 
Send therefore to Joppa, and call hither Simon, whose surname is Peter; he is lodged in the house of one Simon a tanner by the sea side: who, when he cometh, shall speak unto thee. Immediately therefore I sent to thee; and thou hast well done that thou art come. Now therefore are we all here present before God, to hear all things that are commanded thee of God. Then Peter opened his mouth, and said, Of a truth I perceive that God is no respecter of persons: But in every nation he that feareth him, and worketh righteousness, is accepted with him. The word which God sent unto the children of Israel, preaching peace by Jesus Christ: (he is Lord of all:) That word, I say, ye know, which was published throughout all Judaea, and began from Galilee, after the baptism which John preached; How God anointed Jesus of Nazareth with the Holy Ghost and with power: who went about doing good, and healing all that were oppressed of the devil; for God was with him. And we are witnesses of all things which he did both in the land of the Jews, and in Jerusalem; whom they slew and hanged on a tree: Him God raised up the third day, and shewed him openly; Not to all the people, but unto witnesses chosen before of God, even to us, who did eat and drink with him after he rose from the dead. And he commanded us to preach unto the people, and to testify that it is he which was ordained of God to be the Judge of quick and dead. To him give all the prophets witness, that through his name whosoever believeth in him shall receive remission of sins. While Peter yet spake these words, the Holy Ghost fell on all them which heard the word. And they of the circumcision which believed were astonished, as many as came with Peter, because that on the Gentiles also was poured out the gift of the Holy Ghost. For they heard them speak with tongues, and magnify God. Then answered Peter, Can any man forbid water, that these should not be baptized, which have received the Holy Ghost as well as we? 
And he commanded them to be baptized in the name of the Lord. Then prayed they him to tarry certain days.",1459,0.36135


## Output Data (if needed)

In [6]:
"""

del df['similarity score']

OUTPUT_filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\"
OUTPUT_file = "CompanyProfileNewsData_withEmbeddings.json"

df.to_json(str(OUTPUT_filepath)+str(OUTPUT_file), orient='records')

"""

'\n\ndel df[\'similarity score\']\n\nOUTPUT_filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\"\nOUTPUT_file = "CompanyProfileNewsData_withEmbeddings.json"\n\ndf.to_json(str(OUTPUT_filepath)+str(OUTPUT_file), orient=\'records\')\n\n'