In [77]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken
import toml


# OpenAI model names used by the rest of the notebook: COMPLETIONS_MODEL goes into the
# completion request parameters, EMBEDDING_MODEL is the default model of get_embedding().
COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

# The API key is read from the Streamlit secrets file rather than being hard-coded here.
secrets = toml.load('.streamlit/secrets.toml')

openai_api_key = secrets['openai_api_key']
openai.api_key = openai_api_key

In [78]:
# Load the document sections and key every row by its (section, subsection) pair.
df = (
    pd.read_csv('docs/govtext/govtext_content.csv')
    .set_index(["section", "subsection"])
)
print(f"{len(df)} rows in the data.")
df.sample(5)

35 rows in the data.


Unnamed: 0_level_0,Unnamed: 1_level_0,content,tokens
section,subsection,Unnamed: 2_level_1,Unnamed: 3_level_1
topic modelling,check status and results - overview,## CHECKING STATUS & VIEWING RESULTS\nClick on...,659
summarisation,check status and results,## CHECKING PROCESSING STATUS\n\nClick on the ...,488
FAQ,14,3. What’s the maximum document length that cou...,155
overview,key features,Key Features\nThe GovText Web Portal service c...,91
FAQ,4,1. Can GovText automatically extract only the ...,155


In [79]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL):
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    Returns a 2-tuple `(embedding, prompt_tokens)`: the "embedding" field of the first
    item in the response's "data" list, and the "prompt_tokens" value of its "usage" field.
    """
    response = openai.Embedding.create(model=model, input=text)
    embedding = response["data"][0]["embedding"]
    prompt_tokens = response["usage"]["prompt_tokens"]
    return embedding, prompt_tokens

def compute_doc_embeddings(df: pd.DataFrame):
    """
    Embed the `content` of every row in `df` with get_embedding (one call per row).

    Returns a 2-tuple of dictionaries, both keyed by the row's index label:
      * the first maps each label to that row's embedding,
      * the second maps each label to the token count reported for that row.
    """
    embeddings_by_index, tokens_by_index = {}, {}

    for row_label, row in df.iterrows():
        # get_embedding returns (embedding, token_count); store each half under the same key.
        embeddings_by_index[row_label], tokens_by_index[row_label] = get_embedding(row.content)

    return embeddings_by_index, tokens_by_index

In [111]:
# Ad-hoc request made directly against the Embeddings API (bypassing get_embedding)
# so the complete response object can be inspected.
test_response = openai.Embedding.create(
      model=EMBEDDING_MODEL,
      input="Sulaiman: Software Engineer"
    )

In [112]:
# Show the raw response object returned by the request above.
test_response

<OpenAIObject list at 0x7f577342e4a0> JSON: {
  "data": [
    {
      "embedding": [
        0.005708738695830107,
        -0.0123036103323102,
        0.006819813046604395,
        -0.026938430964946747,
        -0.022616829723119736,
        0.030810149386525154,
        -0.03569069504737854,
        -0.0008166905026882887,
        -0.023939212784171104,
        -0.019617613404989243,
        0.0354725681245327,
        0.03086468018591404,
        0.019249526783823967,
        -0.0038410439155995846,
        0.0034439885057508945,
        -0.019208628684282303,
        0.01826796494424343,
        -0.0006096421857364476,
        -0.0007084800745360553,
        -0.01415085606276989,
        -0.014777964912354946,
        -0.014246285893023014,
        0.016032183542847633,
        -0.004444296006113291,
        -0.007484412286430597,
        -0.0008810202707536519,
        -0.010592692531645298,
        -0.02368018962442875,
        0.005671248771250248,
        -0.022834954783320427

In [82]:
# def load_embeddings(fname: str):
#     """
#     Read the document embeddings and their keys from a CSV.
    
#     fname is the path to a CSV with exactly these named columns:
#         "section", "subsection", "0", "1", ... up to the length of the embedding vectors.
#     """
    
#     df = pd.read_csv(fname, header=0)
#     max_dim = max([int(c) for c in df.columns if c != "section" and c != "subsection"])
#     return {
#            (r.section, r.subsection): [r[str(i)] for i in range(max_dim + 1)] for _, r in df.iterrows()
#     }

import pickle
def load_embeddings(fname: str):
    """
    Read back an object that was written with pickle (see save_embeddings).

    fname is the path of the pickle file. The unpickled object is returned as-is.

    NOTE: unpickling can execute arbitrary code, so only load files you produced yourself.
    """
    with open(fname, 'rb') as pickle_file:
        return pickle.load(pickle_file)

def save_embeddings(embeddings, fname: str):
    """
    Pickle `embeddings` into the file at `fname`.

    The file is opened in 'wb' mode, so existing content at that path is replaced.
    """
    with open(fname, 'wb') as pickle_file:
        pickle.dump(embeddings, pickle_file)
    

In [83]:
import numpy as np
def vector_similarity(x, y):
    """
    Compute the dot product of two vectors.

    The notebook relies on OpenAI embeddings being normalized to length 1; under that
    assumption the dot product is equal to the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

# contexts: dict[(str, str), np.array]
# returns list[(float, (str, str))]
def order_document_sections_by_query_similarity(query: str, contexts):
    """
    Rank the pre-calculated document embeddings by their similarity to a query.

    query is embedded with get_embedding and compared against every embedding in
    `contexts` using vector_similarity.

    contexts maps a document key to that document's embedding.

    Returns a list of (similarity, document key) tuples sorted by similarity in
    descending order. Entries with equal similarity keep the iteration order of `contexts`.
    """
    query_embedding, _ = get_embedding(query)

    document_similarities = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    # Sort on the score only. Sorting the bare tuples would compare the document keys
    # whenever two scores tie, which raises TypeError for keys that cannot be ordered
    # against each other (for example a mix of int and str labels).
    document_similarities.sort(key=lambda scored: scored[0], reverse=True)

    return document_similarities

In [84]:
# Embed every row of df; compute_doc_embeddings issues one Embeddings API request per row.
document_embeddings, token_length_dicts = compute_doc_embeddings(df)

In [85]:
# df.reset_index().head()

In [86]:
# NOTE(review): this is the same call as in the previously executed cell, so every row is
# sent to the Embeddings API a second time. Looks like a leftover; consider removing it.
document_embeddings, token_length_dicts = compute_doc_embeddings(df)

In [87]:
# df = df.reset_index()
# df['tokens'] = df.apply(lambda x: token_length_dicts[(x['section'], x['subsection'])],axis=1)
# df.head()

In [88]:
# NOTE(review): this writes to the same path the notebook reads df from, so the input
# CSV is overwritten in place. index=True writes the (section, subsection) index as well.
df.to_csv('docs/govtext/govtext_content.csv', index=True)

In [89]:
# Persist both dictionaries with pickle so they can be reloaded with load_embeddings.
save_embeddings(document_embeddings, 'docs/govtext/govtext_content_embeddings.pkl')
save_embeddings(token_length_dicts, 'docs/govtext/token_lengths.pkl')

In [90]:
# Sanity check: show the five highest-scoring sections for a sample question.
order_document_sections_by_query_similarity("How many data scientists are there in the team?", document_embeddings)[:5]

[(0.7818629008279487, ('overview', 'team')),
 (0.7561511788817536, ('summarisation', 'submit analysis')),
 (0.7543579670877631, ('datasets', 'edit')),
 (0.7533630418059329, ('datasets', 'overview')),
 (0.7494890658931403, ('overview', 'future'))]

In [105]:
# Upper bound that construct_prompt applies to the running total of section tokens plus
# separator tokens when it assembles the context.
MAX_SECTION_LEN = 1500
# Prefix placed in front of every context section in the prompt.
SEPARATOR = "\n* "
ENCODING = "cl100k_base"  # encoding for text-embedding-ada-002
# NOTE(review): the assembled prompt is sent to COMPLETIONS_MODEL, not to the embedding
# model; confirm this encoding matches that model's tokenizer, otherwise the token budget
# is only approximate.

encoding = tiktoken.get_encoding(ENCODING)
# Number of tokens the separator itself contributes for each section.
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [106]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build a question-answering prompt from the document sections most similar to `question`.

    Sections are taken in descending similarity order (as returned by
    order_document_sections_by_query_similarity). Each section adds its `tokens` value plus
    `separator_len` to a running total; selection stops at the first section that pushes
    the total above MAX_SECTION_LEN, and that section is not included.

    question is the user question appended after the context.
    context_embeddings maps a row label of `df` to that row's embedding.
    df must provide `content` and `tokens` for every row label in `context_embeddings`.

    Returns the full prompt string: instruction header, selected sections, then the question.
    Progress and the selected row labels are printed as a side effect.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    selected_texts, selected_labels = [], []
    tokens_used = 0

    for _, row_label in ranked_sections:
        row = df.loc[row_label]
        print(row)

        # Count the candidate first; the check below decides whether it still fits.
        tokens_used += row.tokens + separator_len
        print(tokens_used)
        if tokens_used > MAX_SECTION_LEN:
            break

        # Newlines inside a section are flattened so each section stays on one bullet.
        selected_texts.append(SEPARATOR + row.content.replace("\n", " "))
        selected_labels.append(str(row_label))

    # Useful diagnostic information
    print(f"Selected {len(selected_texts)} document sections:")
    print("\n".join(selected_labels))

    instructions = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    context_block = "".join(selected_texts)

    return instructions + context_block + "\n\n Q: " + question + "\n A:"

In [96]:
# Build the prompt for a sample question and print it after a "===" marker so it can be
# told apart from the diagnostics that construct_prompt prints while it runs.
prompt = construct_prompt(
    "How many data scientists are there in the team?",
    document_embeddings,
    df
)

print("===\n", prompt)

content    GovText is the NLP team in the Artificial Inte...
tokens                                                   111
Name: (overview, team), dtype: object
114
content    ## SUBMIT AN ANALYSIS\nClick on the **Predicti...
tokens                                                   608
Name: (summarisation, submit analysis), dtype: object
725
content    ### UPLOAD DATASET\nClicking on the **Upload**...
tokens                                                   323
Name: (datasets, edit), dtype: object
1051
Selected 2 document sections:
('overview', 'team')
('summarisation', 'submit analysis')
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* GovText is the NLP team in the Artificial Intelligence Platforms team headed by Director Alvina Goh.  It is part of the Data Science and Artificial Intelligence Division (DSAID) of the Government Technology Agency of Singapore (GovTe

In [104]:
# Keyword arguments that answer_query_with_context forwards to openai.Completion.create.
COMPLETIONS_API_PARAMS = dict(
    # We use temperature of 0.0 because it gives the most predictable, factual answer.
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [99]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings,
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the completions model, using context selected from `df`.

    The prompt is assembled by construct_prompt from `query`, `document_embeddings` and
    `df`, then sent to openai.Completion.create together with COMPLETIONS_API_PARAMS.

    show_prompt prints the assembled prompt before the request is made.

    Returns the text of the first choice with leading/trailing spaces and newlines removed.
    """
    full_prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(full_prompt)

    completion = openai.Completion.create(prompt=full_prompt, **COMPLETIONS_API_PARAMS)
    first_choice_text = completion["choices"][0]["text"]

    return first_choice_text.strip(" \n")

In [110]:
# End-to-end example: ask a question and print the prompt that is sent along with it.
govtext_query = "Which preprocessing methods does govtext use?"
answer_query_with_context(govtext_query, df, document_embeddings, show_prompt=True)

content    6. How does GovText handle spaces and punctuat...
tokens                                                   155
Name: (FAQ, 9), dtype: object
158
content    4. What are the pre-processing steps performed...
tokens                                                   155
Name: (FAQ, 7), dtype: object
316
content    1. Can GovText automatically extract only the ...
tokens                                                   155
Name: (FAQ, 4), dtype: object
474
content    8. Does GovText include N-grams and Part-of-sp...
tokens                                                   155
Name: (FAQ, 11), dtype: object
632
content    Does GovText use lemmatization or stemming for...
tokens                                                   155
Name: (FAQ, 6), dtype: object
790
content    GovText is a comprehensive Natural Language Pr...
tokens                                                   184
Name: (overview, intro), dtype: object
977
content    Key Features\nThe GovText Web Portal servic

'GovText uses sentence segmentation, part-of-speech (POS) tagging and tokenization, removal of stopwords, lemmatization, and formation of n-grams as part of its default preprocessing steps for topic modelling.'