# Semantic Search on a Corpus

In [None]:
# Import Packages
import json
import numpy as np
import pandas as pd
import tiktoken
from joblib import load

## Import Data

In [None]:
# Load the source data set
# Directory (trailing separator included) and file name of the JSON export.
filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\"
filename = "CompanyProfileNewsData.json"

# lines=True -> the file is parsed as line-delimited JSON (one record per line).
df = pd.read_json(f"{filepath}{filename}", lines=True)

# Number of rows loaded (displayed as the cell output).
len(df)

In [None]:
# ================== #
# Tokenize Text
# ================== #

# TODO: either replace tiktoken with the tokenizer that matches the model actually in use, or commit to keeping tiktoken.

def get_tokens(text_2_encode: str, tokenizer=None):
    """
    Encode a string into tokens.

    If no tokenizer is supplied, the tiktoken encoding registered for
    "text-davinci-003" is looked up and used.  The value returned by
    ``tokenizer.encode(text=...)`` is handed back unchanged.
    """
    encoder = (
        tiktoken.encoding_for_model("text-davinci-003")
        if tokenizer is None
        else tokenizer
    )
    tokens = encoder.encode(text=text_2_encode)
    return tokens


def get_num_tokens(text_2_encode: str, **kwargs):
    """
    Return how many tokens `text_2_encode` encodes to.

    Any extra keyword arguments (for example ``tokenizer``) are forwarded
    to get_tokens untouched.
    """
    tokens = get_tokens(text_2_encode=text_2_encode, **kwargs)
    return len(tokens)


# ================== #
#  Get Embeddings
# ================== #

def get_embeddings(text=None, model=None):
    """
    Generate embeddings for `text`.

    Parameters
    ----------
    text
        Input handed straight to ``model.encode``.
    model
        Object exposing ``encode(text)``.  When omitted, the pickled model at
        './model/SentBERTmodel.pkl' is loaded with joblib the first time it is
        needed and then reused by later calls that also omit `model`.

    Returns
    -------
    Whatever ``model.encode(text)`` returns.
    """
    # Identity test on purpose: `model == None` would be routed through the
    # model object's own __eq__, which may not return a plain bool.
    if model is None:
        # Re-reading the pickle for every query is wasted work, so the default
        # model is kept on the function object after the first load.
        model = getattr(get_embeddings, "_default_model", None)
        if model is None:
            # NOTE(review): joblib.load unpickles the file, which can execute
            # arbitrary code -- only point this at a trusted model file.
            model = load('./model/SentBERTmodel.pkl')
            get_embeddings._default_model = model

    return model.encode(text)


# ================== #
#  Calculate Vector Similarity
# ================== #

def vector_similarity(x: "list[float]", y: "list[float]") -> float:
    """
    Score two vectors against each other using their dot product.

    For vectors of length 1 the dot product equals the cosine similarity.
    NOTE(review): this function does not normalize its inputs; it relies on
    the embeddings already being unit length -- confirm for the model in use.
    """
    lhs = np.array(x)
    rhs = np.array(y)
    return lhs.dot(rhs)


# ================== #
#  Order Chunks by Similarity
# ================== #

def measure_embedding_similarity(
    query: str,
    embeddings,
    model=None
):
    """
    Score a query against every pre-calculated document embedding.

    The query is embedded with get_embeddings and compared with each entry of
    `embeddings` through vector_similarity.

    Parameters
    ----------
    query
        Text to embed and compare.
    embeddings
        Iterable of pre-calculated document embeddings.
    model
        Optional embedding model forwarded to get_embeddings; when left as
        None, get_embeddings falls back to its own default model.

    Returns
    -------
    A list holding one similarity score per entry of `embeddings`, in the same
    order as `embeddings`.  The scores are NOT sorted here; ranking is left to
    the caller.
    """
    query_embedding = get_embeddings(query, model=model)

    return [vector_similarity(query_embedding, embedding) for embedding in embeddings]


# ================== #
#  Get Similar Texts
# ================== #

def get_similar_texts(df, k):
    """
    Return the k best-scoring rows of `df`, highest 'similarity score' first.

    Rows tied with the k-th score are all kept (keep='all'), so the result
    can contain more than k rows.
    """
    score_column = 'similarity score'
    top_rows = df.nlargest(k, columns=[score_column], keep='all')
    return top_rows.sort_values(by=score_column, ascending=False)

In [None]:
# ================== #
#  Run (score.py)
# ================== #

def run(question: str, k: int, embeddings, df) -> pd.DataFrame:
    """
    Rank the rows of `df` against a question and return the best matches.

    Parameters
    ----------
    question
        Query text to compare against the document embeddings.
    k
        Number of top-scoring rows requested (ties at the cut-off are kept by
        get_similar_texts, so more rows may be returned).
    embeddings
        Pre-calculated document embeddings, one per row of `df` and in the
        same row order.
    df
        DataFrame describing the documents.  NOTE: a 'similarity score'
        column is written onto this frame in place, replacing any existing
        column with that name.

    Returns
    -------
    The DataFrame produced by get_similar_texts: the top rows of `df`, sorted
    by 'similarity score' in descending order.
    """

    # Score Every Row Against The Question (stored on df itself)
    df['similarity score'] = measure_embedding_similarity(question, embeddings)

    # Return Chunks With Highest Similarity (Text)
    response = get_similar_texts(df, k)

    return response

In [None]:
# Produce Data

# Generate embeddings
# Hand the encoder a plain list of strings rather than the pandas Series:
# integer indexing on a Series is label-based, so an encoder that indexes its
# input positionally would fail on a frame whose index is not 0..n-1.
embeddings = get_embeddings(df['n_summary'].tolist())
df['embeddings'] = embeddings.tolist()

# Count number of tokens in each article
df['token count'] = df['n_summary'].apply(get_num_tokens)

## Ask Questions

In [None]:
# Question: advanced plastic recycling -- ask for the 5 best-matching articles.
response = run(
    question="is ExxonMobil investing in advanced recycling of plastic",
    k=5,
    embeddings=embeddings,
    df=df,
)

# Render the result with a fixed width on the summary column.
response.style.set_properties(subset=['n_summary'], **{'width': '600px'})

In [None]:
# Question: decarbonization goals -- ask for the 5 best-matching articles.
response = run(
    question="What are Exxon's decarbonization goals?",
    k=5,
    embeddings=embeddings,
    df=df,
)

# Render the result with a fixed width on the summary column.
response.style.set_properties(subset=['n_summary'], **{'width': '600px'})

In [None]:
# Question: planned CCUS investment -- ask for the 5 best-matching articles.
response = run(
    question="How much is Exxon planning to invest in Carbon Capture, Utilization, and Storage?",
    k=5,
    embeddings=embeddings,
    df=df,
)

# Render the result with a fixed width on the summary column.
response.style.set_properties(subset=['n_summary'], **{'width': '600px'})

In [None]:
# Question: planned hydrogen investment -- ask for the 5 best-matching articles.
response = run(
    question="How much is Exxon planning to invest in Hydrogen?",
    k=5,
    embeddings=embeddings,
    df=df,
)

# Render the result with a fixed width on the summary column.
response.style.set_properties(subset=['n_summary'], **{'width': '600px'})

## Output Data (if needed)

In [None]:
# Destination folder (trailing separator included) and file name for the export.
OUTPUT_filepath = "C:\\Users\\hlmq\\OneDrive - Chevron\\Desktop\\Projects\\Company Relationships\\Data\\Exxon\\Export Results\\"
OUTPUT_file = "query_hydrogen.csv"

# Write the current `response` frame to CSV, leaving out the index column.
response.to_csv(f"{OUTPUT_filepath}{OUTPUT_file}", index=False)