In [47]:
# !pip install tiktoken
# !pip install openai
# !pip install transformers

In [5]:
import numpy as np
import openai
import pandas as pd
import pickle
import tiktoken

# Model name passed as `model=` to every openai.Completion.create call in this notebook.
COMPLETIONS_MODEL = "text-davinci-003"
# Default model name used by get_embedding() when calling openai.Embedding.create.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [49]:
from transformers import GPT2TokenizerFast

In [50]:
# Load the pretrained "gpt2" tokenizer; count_tokens() uses it to measure text length in tokens.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
import os

# Baseline: ask the model the question with no supporting context at all.
prompt = "What are Sogeti's practices?"

# Credentials must never be stored in the notebook. Read the key from the
# environment so it is not committed or shared with the file; a missing
# variable fails fast with a KeyError instead of an opaque API error later.
# NOTE: the key that used to be hard-coded in this cell has been exposed and
# should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

# temperature=0 keeps the answer as deterministic as the API allows; the cell's
# value is the text of the first choice with surrounding spaces/newlines removed.
openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=300,
    model=COMPLETIONS_MODEL
)["choices"][0]["text"].strip(" \n")

'Sogeti is a global IT services provider that specializes in digital transformation, application services, and testing and quality assurance. The company’s core practices include:\n\n1. Digital Transformation: Sogeti helps organizations to develop and implement digital strategies, leveraging the latest technologies and trends.\n\n2. Application Services: Sogeti provides a range of services to help organizations develop, maintain, and modernize their applications.\n\n3. Testing and Quality Assurance: Sogeti helps organizations to ensure the quality of their applications and systems through testing and quality assurance services.\n\n4. Cloud Services: Sogeti helps organizations to migrate their applications and systems to the cloud, as well as manage and optimize their cloud environments.\n\n5. Security Services: Sogeti provides a range of security services to help organizations protect their data and systems.'

In [8]:
# Same question as before, but the instruction now tells the model to admit
# when it is unsure instead of guessing.
prompt = """Answer the question as truthfully as possible, and if you're unsure of the answer, say "Sorry, I don't know".

Q: What are Sogeti's practices?
A:"""

# The cell's value is the first choice's text, trimmed of spaces and newlines.
openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=300,
    temperature=0,
)["choices"][0]["text"].strip(" \n")

'Sogeti is a global IT services and consulting company that specializes in providing customized solutions to its clients. They offer a range of services, including application development, cloud computing, digital transformation, and more.'

In [13]:
# Third variant: a hand-picked context passage is embedded in the prompt and
# the model is told to answer only from that passage.
prompt = """Answer the question as truthfully as possible using the provided text, and if the answer is not contained within the text below, say "I don't know"

Context:
OUR SOGETI PRACTICES\n\nto help our clients achieve real and meaningful business outcomes.

Q: What are Sogeti's practices?
A:"""

# Sampling parameters are spelled out explicitly; the cell's value is the
# first choice's text, trimmed of spaces and newlines.
openai.Completion.create(
    model=COMPLETIONS_MODEL,
    prompt=prompt,
    max_tokens=100,
    temperature=0,
    top_p=1,
    presence_penalty=0,
    frequency_penalty=0,
)["choices"][0]["text"].strip(" \n")

"Sogeti's practices are to help their clients achieve real and meaningful business outcomes."

In [54]:
def count_tokens(text):
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``."""
    token_ids = tokenizer.encode(text)
    return len(token_ids)

def get_ntokens(x):
    """
    Compute a token count for each (heading, content) pair.

    ``x[0]`` is iterated as headings and ``x[1]`` as the matching contents.
    For each pair the result is the content's token count, plus a fixed 3,
    plus the token count of the heading with its first and last
    space-separated words removed, minus 1 when the content is empty.

    Returns a list of counts, one per pair, in input order.
    """
    token_counts = []
    for heading, content in zip(x[0], x[1]):
        content_tokens = count_tokens(content)
        # Only the middle words of the heading are counted.
        heading_core = " ".join(heading.split(" ")[1:-1])
        heading_tokens = count_tokens(heading_core)
        # presumably the constant 3 accounts for separator overhead - TODO confirm
        empty_adjustment = 1 if len(content) == 0 else 0
        token_counts.append(content_tokens + 3 + heading_tokens - empty_adjustment)
    return token_counts


In [61]:
# Read the source sections, attach a per-row token count of the "Content"
# column, and index the frame by "Heading" so sections can be looked up by
# their heading later on.
df = (
    pd.read_csv('sample_data/Gpt3_data.csv', encoding='latin')
    .assign(Tokens=lambda frame: frame['Content'].apply(count_tokens))
    .set_index(["Heading"])
)
print(f"{len(df)} rows in the data.")
# Show a handful of random rows as a quick sanity check.
df.sample(5)

44 rows in the data.


Unnamed: 0_level_0,Content,Tokens
Heading,Unnamed: 1_level_1,Unnamed: 2_level_1
Hackathon Conducted,hackathon was conducted to create reusable ass...,12
Capgemini SAP Business outcomes,We have been a global SAP Partner since 1993 ....,27
edge solution,edge solution utilizing AWS Local Zones ...,38
Featured ABL's Application Development and Managed Services (ADMS),ADM is focused on helping clients reduce the c...,146
Capgemini Sustainibility business impact,Capgemini helped Mercedes-Benz reduce their CO...,54


In [23]:
def get_embedding(text, model=EMBEDDING_MODEL):
    """
    Request an embedding for ``text`` from the OpenAI Embeddings API.

    ``model`` selects the embedding model and defaults to ``EMBEDDING_MODEL``.
    Returns the "embedding" field of the first item in the response's "data".
    """
    response = openai.Embedding.create(input=text, model=model)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df):
    """
    Create an embedding for each row of ``df`` using the OpenAI Embeddings API.

    Each row's ``Content`` value is embedded with ``get_embedding``.

    Returns a dictionary that maps each row's index label to the embedding
    computed for that row.
    """
    embeddings_by_index = {}
    for row_label, row in df.iterrows():
        embeddings_by_index[row_label] = get_embedding(row.Content)
    return embeddings_by_index

In [62]:
# Embed every row of df; dff maps each index label (heading) to its embedding.
dff = compute_doc_embeddings(df)

In [26]:
# Inspect which index labels received an embedding.
dff.keys()

dict_keys(['OUR SOGETI PRACTICES', 'APPLICATIONS AND  CLOUD TECHNOLOGIES', 'CUSTOMER FIRST', 'INSIGHTS  AND DATA', 'INTELLIGENT INDUSTRY', 'QUALITY ENGINEERING', 'SUSTAINABILITY (SPECIAL FOCUS AREA)', 'ABL and GBL (GROUP ALIGNMENT)', 'Applications  &  Cloud Technologies  Capturing Cost Savings', 'Hackathon Conducted', 'edge solution', 'APPLICATIONS  CLOUD TECHNOLOGIES  APPLICATIONS AND CLOUD TECHNOLOGIES KEY ACCOLADES', 'Microsoft Accolades', 'AWS Accolades', 'CUSTOMER FIRST  BUSINESS OUTCOME STATEMENT', 'CUSTOMER FIRST  solutions help clients', 'CUSTOMER FIRST   ROI growth', 'Insights and Data Top Client', 'Insights and Data Success Stories', 'Insights and Data offering to Unilever ', 'Insights and Data offering to  Airbus', 'Intelligent Industry top clients ', 'Intelligent Industry helped P&G ', 'Intelligent Industry Intel', 'Capgemin Kroger ', 'Intelligent Industry Atricure ', 'Quality Engineer Top clients', 'Quality Engineer Success Stories', 'Quality Engineer Johnson and Johnson '

In [28]:
def load_embeddings(fname):
    """
    Load precomputed document embeddings from a CSV file.

    ``fname`` is the path to a CSV whose columns are exactly "title",
    "heading", and "0", "1", ... up to the last embedding dimension.

    Returns a dictionary keyed by ``(title, heading)`` whose values are the
    embedding components as a list, ordered by dimension.
    """
    table = pd.read_csv(fname, header=0)

    # Every column other than the two key columns is named after a dimension.
    key_columns = ("title", "heading")
    highest_dim = max(int(name) for name in table.columns if name not in key_columns)
    component_columns = [str(dim) for dim in range(highest_dim + 1)]

    embeddings = {}
    for _, row in table.iterrows():
        embeddings[(row.title, row.heading)] = [row[column] for column in component_columns]
    return embeddings


In [36]:
def vector_similarity(x, y):
    """
    Return the dot product of ``x`` and ``y`` as their similarity score.

    Assuming the inputs are unit-length vectors (as OpenAI embeddings are
    stated to be), the dot product equals the cosine similarity.
    """
    left = np.array(x)
    right = np.array(y)
    return left.dot(right)

def order_document_sections_by_query_similarity(query, contexts):
    """
    Rank the pre-computed document embeddings by similarity to ``query``.

    The query is embedded with ``get_embedding`` and compared against every
    embedding in ``contexts`` (a mapping of document index -> embedding)
    using ``vector_similarity``.

    Returns a list of ``(similarity, document_index)`` tuples sorted in
    descending order, so the most relevant section comes first.
    """
    query_embedding = get_embedding(query)

    scored_sections = []
    for doc_index, doc_embedding in contexts.items():
        score = vector_similarity(query_embedding, doc_embedding)
        scored_sections.append((score, doc_index))

    # Tuples sort by similarity first; highest similarity goes to the front.
    scored_sections.sort(reverse=True)
    return scored_sections


In [39]:
# Show the five highest-scoring (similarity, heading) pairs for a sample question.
order_document_sections_by_query_similarity("What are some of the benefits of I&D services?", dff)[:5]


[(0.8249029115568557, 'INSIGHTS  AND DATA'),
 (0.7855496742632355, 'CUSTOMER FIRST  solutions help clients'),
 (0.7843566711260354,
  "Featured ABL's Application Development and Managed Services (ADMS)"),
 (0.7619811714770115, 'Quality Engineering Banner Health '),
 (0.7579119554538427, 'Intelligent Industry helped P&G ')]

In [40]:
# Token budget for the context sections that construct_prompt() adds to a prompt.
MAX_SECTION_LEN = 500
# Text inserted before each context section; it renders every section as a bullet.
SEPARATOR = "\n* "
# NOTE(review): tiktoken's model table appears to map text-davinci-003 to
# "p50k_base" rather than "gpt2" - confirm which encoding is intended here.
ENCODING = "gpt2"  # encoding for text-davinci-003

encoding = tiktoken.get_encoding(ENCODING)
# Token cost of the separator, charged once per section by construct_prompt().
separator_len = len(encoding.encode(SEPARATOR))

f"Context separator contains {separator_len} tokens"

'Context separator contains 3 tokens'

In [63]:
def construct_prompt(question, context_embeddings, df):
    """
    Build a question-answering prompt containing the most relevant sections.

    Sections are ranked with ``order_document_sections_by_query_similarity``
    and added in that order. Each section costs its ``Tokens`` value plus
    ``separator_len``; selection stops at the first section that would push
    the running total above ``MAX_SECTION_LEN``.

    ``context_embeddings`` maps section index -> embedding, and ``df`` is
    looked up by that same index and read through its ``Tokens`` and
    ``Content`` fields.

    Prints the number and labels of the selected sections, then returns the
    prompt: instruction header, selected sections, question, and answer cue.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    context_chunks = []
    selected_labels = []
    tokens_used = 0

    for _, label in ranked_sections:
        row = df.loc[label]

        # Charge the section before deciding, so the budget is never exceeded.
        tokens_used += row.Tokens + separator_len
        if tokens_used > MAX_SECTION_LEN:
            break

        # Newlines are flattened so each section stays on one bullet line.
        flattened = row.Content.replace("\n", " ")
        context_chunks.append(SEPARATOR + flattened)
        selected_labels.append(str(label))

    # Useful diagnostic information
    print(f"Selected {len(context_chunks)} document sections:")
    print("\n".join(selected_labels))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    context_block = "".join(context_chunks)

    return header + context_block + "\n\n Q: " + question + "\n A:"

In [64]:
# Build a prompt for the sample question from the embedded sections and show it.
prompt = construct_prompt(
    question="What are Sogeti's practices?",
    context_embeddings=dff,
    df=df,
)

print("===\n", prompt)

Selected 2 document sections:
About Sogeti
Quality Engineering Medica 
===
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* Part    of      the     Capgemini       Group,  Sogeti  makes   business        value   through technology      for     organizations   that    need    to      implement       innovation      at      speed   and     want    a       local   partner with    global  scale.  With    a       hands-on        culture and     close   proximity       to      its     clients,        Sogeti  implements      solutions       that    will    help    organizations   work    faster, better, and     smarter.        By      combining       its     agility and     speed   of      implementation  through a       DevOps  approach,       Sogeti  delivers        innovative      solutions       in      quality engineering,    cloud   and     application     development,   

In [65]:
# Shared keyword arguments for openai.Completion.create.
# Temperature 0.0 is used because it gives the most predictable, factual answer.
COMPLETIONS_API_PARAMS = dict(
    temperature=0.0,
    max_tokens=300,
    model=COMPLETIONS_MODEL,
)

In [74]:
def answer_query_with_context(
    query,
    df,
    document_embeddings,
    show_prompt=False):
    """
    Answer ``query`` using context retrieved from the embedded sections.

    The prompt is built by ``construct_prompt`` from ``document_embeddings``
    and ``df``; when ``show_prompt`` is true it is printed before the request.
    The completion is requested with ``COMPLETIONS_API_PARAMS``.

    Returns the text of the first choice with surrounding spaces and
    newlines removed.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print(prompt)

    completion = openai.Completion.create(prompt=prompt, **COMPLETIONS_API_PARAMS)
    answer_text = completion["choices"][0]["text"]
    return answer_text.strip(" \n")

In [76]:
# End-to-end check: retrieve context for a sample question and ask the completion model.
test_response = answer_query_with_context("What is a data incubator?", df, dff)

Selected 6 document sections:
Insights and Data offering to Unilever 
INSIGHTS  AND DATA
Insights and Data offering to  Airbus
Insights and Data Success Stories
Hackathon Conducted
Intelligent Industry Intel


In [77]:
# Display the answer returned for the sample question.
test_response

'A data incubator is a platform that helps organizations better connect with their customers by leveraging the latest data solutions, including artificial intelligence, hyper-automation, machine learning, robotics process automation, and more.'