## A simple understaning of chat + Prompt + custom context
with wikipedia data & openai api

for openai api authentication: 
https://platform.openai.com/docs/api-reference/authentication

In [1]:
import sys
import os
sys.path.append('/home/vino/api_keys')
from api_key import OPENAI_API_KEY
from openai import OpenAI
client = OpenAI(api_key=OPENAI_API_KEY)

In [2]:
prompt = """
context: "my name is Vinoth"
question: what is my name?
"""

response = client.chat.completions.create(
    model="gpt-4o-mini",
    
    max_tokens=50,
    temperature=0.7,
    n=3,
    frequency_penalty=0.5,
    presence_penalty=0.5 ,   
    
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": prompt
        }
    ]
)

print(response.choices[0].message.content)

Your name is Vinoth.


In [3]:
prompt = """
Question: "When did sweden join NATO?"
Answer:
"""

response = client.chat.completions.create(
    model="gpt-4o-mini",
    
    max_tokens=1000,
    temperature=0.7,
    n=3,
    frequency_penalty=0.5,
    presence_penalty=0.5 ,   
    
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {
            "role": "user",
            "content": prompt
        }
    ]
)

print(response.choices[0].message.content)

As of October 2023, Sweden has not yet officially joined NATO, but it applied for membership in May 2022 alongside Finland. The accession process requires the approval of all current NATO member states, and Sweden's membership has been pending due to certain political considerations. Please verify with up-to-date sources for the latest developments regarding Sweden's NATO membership status.


## Get text from wiki... 

In [7]:
from dateutil.parser import parse
import pandas as pd
import requests

# Get the Wikipedia page for "2024"
#https://en.wikipedia.org/wiki/2024
resp = requests.get("https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exlimit=1&titles=2024&explaintext=1&formatversion=2&format=json")

# Load the page text into a DataFrame
df = pd.DataFrame()
df["text"] = resp.json()["query"]["pages"][0]["extract"].split("\n")

# Remove empty lines and headings from the text
df = df[df["text"].str.strip().ne("") & (~df["text"].str.startswith("=="))]

# Adjust text samples to include dates as prefixes where applicable
prefix = ""
for i, row in df.iterrows():
    if " – " not in row["text"]:  # If the text doesn't already have a date prefix
        try:
            # If the row's text is a valid date, set it as the new prefix
            parse(row["text"])
            prefix = row["text"]
        except ValueError:
            # If it's not a date, prepend the existing prefix to the text
            df.at[i, "text"] = f"{prefix} – {row['text']}"

# Keep only rows that now contain a " – " separator (date + text)
df = df[df["text"].str.contains(" – ")]
df

Unnamed: 0,text
0,"– 2024 (MMXXIV) is the current year, and is a..."
1,"– So far, this year has seen the continuation..."
2,– The ongoing Israel–Hamas war has led to spi...
3,"– Approximately 79 countries, representing ar..."
11,"January 1 – Egypt, Ethiopia, Iran and the Unit..."
...,...
240,Economics – TBD
241,Literature – TBD
242,Peace – TBD
243,Physics – John J. Hopfield and Geoffrey E. Hin...


## Get vector embeddings for the text
https://platform.openai.com/docs/guides/embeddings/embeddings

In [11]:
#By default, the length of the embedding vector will be 1536 for text-embedding-3-small 
def get_embedding(text, model="text-embedding-3-small"):
   text = text.replace("\n", " ")
   return client.embeddings.create(input = [text], model=model).data[0].embedding

df['embedding'] = df['text'].apply(lambda x: get_embedding(x, model='text-embedding-3-small'))
df.to_csv('output/embedded.csv', index=False)

In [14]:
df

Unnamed: 0,text,embedding
0,"– 2024 (MMXXIV) is the current year, and is a...","[-0.003991527017205954, -0.015976931899785995,..."
1,"– So far, this year has seen the continuation...","[-0.024011870846152306, 0.0021908641792833805,..."
2,– The ongoing Israel–Hamas war has led to spi...,"[-0.0022356603294610977, -0.03140779957175255,..."
3,"– Approximately 79 countries, representing ar...","[0.01753215864300728, 0.026159770786762238, 0...."
11,"January 1 – Egypt, Ethiopia, Iran and the Unit...","[-0.05981452390551567, -0.021807799115777016, ..."
...,...,...
240,Economics – TBD,"[-0.0586511567234993, -0.015092739835381508, 0..."
241,Literature – TBD,"[-0.02117740735411644, 0.0011748162796720862, ..."
242,Peace – TBD,"[-0.004014094825834036, -0.02729666978120804, ..."
243,Physics – John J. Hopfield and Geoffrey E. Hin...,"[-0.008249836042523384, -0.051322367042303085,..."


In [19]:
print('embedding size:',len(df['embedding'][0]))

embedding size: 1536


### (optional):tokenising with tiktoken to undersand 
##### prefix_to_encoding:"gpt-4o-": "o200k_base
##### (model to encoding: "text-embedding-3-small": "cl100k_base"?)
https://github.com/openai/tiktoken/blob/main/tiktoken/model.py

In [22]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o-mini")

In [23]:
encoding.name

'o200k_base'

In [30]:
encoded=encoding.encode(df['text'][0])
encoded

[1127,
 220,
 1323,
 19,
 350,
 12365,
 7730,
 4478,
 8,
 382,
 290,
 2208,
 1284,
 11,
 326,
 382,
 261,
 58256,
 1284,
 8601,
 402,
 10715,
 328,
 290,
 144926,
 18712,
 11,
 290,
 220,
 1323,
 19,
 404,
 1284,
 328,
 290,
 14782,
 40126,
 350,
 4585,
 8,
 326,
 1689,
 1750,
 18987,
 2363,
 350,
 2416,
 8,
 2364,
 929,
 11,
 290,
 220,
 1494,
 404,
 220,
 1284,
 328,
 290,
 220,
 18,
 9290,
 167837,
 326,
 290,
 220,
 2040,
 302,
 14015,
 11,
 326,
 290,
 220,
 220,
 20,
 404,
 256,
 1284,
 328,
 290,
 220,
 1323,
 15,
 82,
 23270,
 13,
 256]

In [31]:
decoded=encoding.decode(encoded)
assert decoded==df['text'][0]
decoded

' – 2024 (MMXXIV) is the current year, and is a leap year starting on Monday of the Gregorian calendar, the 2024th year of the Common Era (CE) and Anno Domini (AD) designations, the 24th  year of the 3rd millennium and the 21st century, and the  5th   year of the 2020s decade.  '

In [34]:
#see how the sentence is actually tokenised
[encoding.decode_single_token_bytes(token) for token in encoded]

[b' \xe2\x80\x93',
 b' ',
 b'202',
 b'4',
 b' (',
 b'MM',
 b'XX',
 b'IV',
 b')',
 b' is',
 b' the',
 b' current',
 b' year',
 b',',
 b' and',
 b' is',
 b' a',
 b' leap',
 b' year',
 b' starting',
 b' on',
 b' Monday',
 b' of',
 b' the',
 b' Gregorian',
 b' calendar',
 b',',
 b' the',
 b' ',
 b'202',
 b'4',
 b'th',
 b' year',
 b' of',
 b' the',
 b' Common',
 b' Era',
 b' (',
 b'CE',
 b')',
 b' and',
 b' An',
 b'no',
 b' Dom',
 b'ini',
 b' (',
 b'AD',
 b')',
 b' design',
 b'ations',
 b',',
 b' the',
 b' ',
 b'24',
 b'th',
 b' ',
 b' year',
 b' of',
 b' the',
 b' ',
 b'3',
 b'rd',
 b' millennium',
 b' and',
 b' the',
 b' ',
 b'21',
 b'st',
 b' century',
 b',',
 b' and',
 b' the',
 b' ',
 b' ',
 b'5',
 b'th',
 b'  ',
 b' year',
 b' of',
 b' the',
 b' ',
 b'202',
 b'0',
 b's',
 b' decade',
 b'.',
 b'  ']

In [86]:
#get cosine similarty or any other metric to find the relavance of embedded texts to question
from scipy.spatial.distance import cosine

question_prompt="""
"When did sweden join NATO?
"""

def get_sorted_by_relavance(df, question_prompt, n=10, pprint=True):
   question_embedding = get_embedding(question_prompt, model='text-embedding-3-small')
   df['similarities'] = df.embedding.apply(lambda x: cosine(x, question_embedding))

   #lower cosine similarty value, they are closer 
   res = df.sort_values('similarities', ascending=True).head(n)
   return res

res = get_sorted_by_relavance(df, question_prompt, n=10)

In [56]:
res

Unnamed: 0,text,embedding,similarities
52,March 7 – As the final Nordic country to join ...,"[-0.037684716284275055, 0.02795541100203991, 0...",0.302108
92,May 7–11 – The Eurovision Song Contest 2024 is...,"[-0.025903914123773575, -0.01191788911819458, ...",0.666418
63,March 31 – Bulgaria and Romania become members...,"[-0.03385886177420616, 0.028884245082736015, 0...",0.666564
147,July 9–11 – The 33rd NATO summit is held in Wa...,"[-0.014703717082738876, 0.017862677574157715, ...",0.705111
211,October 20 – 2024 Moldovan European Union memb...,"[0.0026293995324522257, 0.03370118886232376, 0...",0.711542
11,"January 1 – Egypt, Ethiopia, Iran and the Unit...","[-0.05981452390551567, -0.021807799115777016, ...",0.743055
118,June 1 – The 2024 Icelandic presidential elect...,"[0.013015824370086193, -0.03595449775457382, 0...",0.749218
23,January 14 – Margrethe II formally abdicates a...,"[0.02902938239276409, 0.04483066871762276, 0.0...",0.762981
14,January 1 – Ethiopia announces an agreement wi...,"[0.016344791278243065, -0.05994442477822304, 0...",0.770028
129,June 14 – July 14 – UEFA Euro 2024 is held in ...,"[-0.02209368348121643, -0.024109605699777603, ...",0.78514


In [70]:
def create_prompt(df, question, max_token_count):
    """
    Generates a formatted prompt string for a Completion model based on a question and relevant context.

    Parameters:
    - question (str): The question to be answered.
    - df (DataFrame): A DataFrame containing relevant text data sorted by relevance.
    - max_token_count (int): The maximum allowed number of tokens in the prompt.

    Returns:
    - str: A formatted prompt string that includes the context and the question.
    """
    # Initialize the tokenizer aligned with the embeddings
    tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")
    
    # Define the prompt template
    prompt_template = """
    Answer the question based on the context below. If the question cannot 
    be answered based on the context, please say "I don't know."
    
    Context:
    
    {}
    
    ---
    
    Question: {}
    Answer:"""

    # Calculate the initial token count for the template and question
    current_token_count = len(tokenizer.encode(prompt_template)) + \
                          len(tokenizer.encode(question))
    
    context = []
    
    # Retrieve relevant rows from the DataFrame sorted by relevance
    for text in get_sorted_by_relavance(df,question)["text"].values:
        # Count tokens for the current text
        text_token_count = len(tokenizer.encode(text))
        
        # Check if adding this text exceeds the max token count
        if current_token_count + text_token_count <= max_token_count:
            context.append(text)
            current_token_count += text_token_count  # Update the total token count
        else:
            break  # Stop if max token count is reached

    # Format the context and return the final prompt
    return prompt_template.format("\n- ".join(context), question)  # Using bullet points for clarity


In [72]:
print(create_prompt(df,"When did sweden join NATO?",200))


Answer the question based on the context below. If the question cannot 
be answered based on the context, please say "I don't know."

Context:

March 7 – As the final Nordic country to join the alliance, Sweden officially joins NATO, becoming its 32nd member after Finland a year earlier.
- March 31 – Bulgaria and Romania become members of the Schengen Area through sea and air routes.
- May 7–11 – The Eurovision Song Contest 2024 is held in Malmö, Sweden. Swiss contestant Nemo wins with the song "The Code".

---

Question: When did sweden join NATO?
Answer:


In [84]:
def answer_question(df, question, max_prompt_tokens=1800, max_answer_tokens=150):
    
    """
    If the model produces an error, return an empty string
    """
    prompt = create_prompt(df, question, max_prompt_tokens)
    print(prompt)
    try:
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            
            max_tokens=max_answer_tokens,
            temperature=0.7,
            n=3,
            frequency_penalty=0.5,
            presence_penalty=0.5 ,   
            
            messages=[             
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
        return response.choices[0].message.content
    except Exception as e:
        print(e)
        return ""

In [87]:
answer_question(df,"When did sweden join NATO?")


Answer the question based on the context below. If the question cannot 
be answered based on the context, please say "I don't know."

Context:

March 7 – As the final Nordic country to join the alliance, Sweden officially joins NATO, becoming its 32nd member after Finland a year earlier.
- March 31 – Bulgaria and Romania become members of the Schengen Area through sea and air routes.
- May 7–11 – The Eurovision Song Contest 2024 is held in Malmö, Sweden. Swiss contestant Nemo wins with the song "The Code".
- July 9–11 – The 33rd NATO summit is held in Washington, D.C.
- October 20 – 2024 Moldovan European Union membership referendum.
- January 1 – Egypt, Ethiopia, Iran and the United Arab Emirates become BRICS members.
- June 1 – The 2024 Icelandic presidential election is held, with Halla Tómasdóttir elected president of Iceland.
- January 1 – Ethiopia announces an agreement with Somaliland to use the port of Berbera. Ethiopia also says that it will eventually recognize Somaliland's 

'Sweden officially joined NATO on March 7, 2024.'