#### Encyclopedia P.O.C

This notebook explores using generative AI (GenAI) to produce encyclopedic facts from user search terms.

In [44]:
# IMPORT LIBRARIES

import openai
import pandas as pd
import numpy as np
from dotenv import find_dotenv, load_dotenv
import re
import os
import emoji
import json
import spacy
from spacy import displacy
# Load variables from the .env file located by find_dotenv() into the process
# environment, then read the OpenAI API key from there rather than hardcoding
# it. os.getenv returns None when OPENAI_API_KEY is not set.
load_dotenv(find_dotenv())
openai.api_key = os.getenv("OPENAI_API_KEY")


In [45]:

# Load spaCy's "en_core_web_sm" English pipeline once and keep it in NER.
NER = spacy.load("en_core_web_sm")

In [46]:
# # CONSTANTS

# Prompt template sent to the chat model. The {search_query} placeholder is
# filled with str.format in main_generate_fact. The output format requested
# here ("generated_fact=...; source=...; fact_type=...") is the format that
# GENERATED_FACT_PATTERN, SOURCE_PATTERN and FACT_TYPE_PATTERN parse.
ENCYCLOPEDIA_PROMPT = """
A user searches the following query on the MoneyLion app
search_query : {search_query}
Instructions:
Generate an educative fun fact about {search_query} without mentioning any specific company, like Starbucks.
This fact can be a piece of advice, a fun fact or a statistic.
Provide the source of the fact such as the name of the website or the name of the book.

Strict constraints:
    If the search query is a name of a company or brand, generated_fact must return "Company or brand name, no fact generated".
    Do not produce a fact that can negatively impact MoneyLion's brand or reputation.
    Do not provide alternatives to MoneyLion's products or services.
    The output must not contain any company names or brand names.

Output criteria:
    - return final answer using 3 variables "generated_fact", "source" and "fact_type"
    - compute "generated_fact" as: generated fact
    - compute "source" as: source of the fact
    - compute "fact_type" as: type of fact (fun fact, statistic, advice)
    - output final answer in the following format:
        generated_fact=(input value of generated_fact here); source=(input value of source here); fact_type=(input value of fact_type here)

Performance Evaluation:
1. Never, never return search query or the generated_fact that will contains any specific company names or brand names (e.g. Starbucks, Chime, etc.)
2. As mentioned, in the case above, just return "Company or brand name, no fact generated"
"""

# Restricted keyword lists. check_keywords concatenates all three lists and
# reports whether a text mentions any entry.

# Product names.
PRODUCT_KEYWORDS = [
    'Roar Money',
    'Instacash',
    'Credit Builder Plus'
]

# Product feature / action names.
PRODUCT_ACTION_KEYWORDS = [
    'Peer Boost',
    'Shake n Bank',
    'Cash Advance',
]

# Competitor names.
# NOTE(review): "Monso" and "Acorn" may be misspellings of the brand names
# "Monzo" and "Acorns" - confirm the intended spelling with the list owner.
COMPETITOR_KEYWORDS = [
    "SoFi",
    "Cleo",
    "Mission Lane",
    "Propel",
    "Dave",
    "Braviant Holdings",
    "EarnIn",
    "Brigit",
    "Affirm",
    "Avant",
    "Varo",
    "Revolut",
    "Monso",
    "Acorn",
    "Betterment",
    "Chime",
]

# Regular expressions that pull individual fields out of a response written as
# "generated_fact=...; source=...; fact_type=...".
# The non-greedy "(.*?);" groups stop at the FIRST semicolon, so a value that
# itself contains ';' is captured only up to that semicolon. "." does not match
# newlines because extract_generation_output passes no flags to re.search.
GENERATED_FACT_PATTERN = r"generated_fact=(.*?);"
# Only referenced from commented-out code in this notebook.
CATEGORY_PATTERN = r"category=(.*?);"
SOURCE_PATTERN = r"source=(.*?);"
# fact_type is the last field, so it captures up to the end of the line.
FACT_TYPE_PATTERN = r"fact_type=(.*)"

In [47]:
# CHECKING FUNCTION USED FOR BOTH PRE AND POST PROCESSING
def check_keywords(query, keywords=None):
    """
    Check whether the text mentions any restricted keyword.
    Intended for both pre and post processing steps.

    Matching is case-insensitive and done on whole words, so ordinary words
    that merely contain a keyword are not flagged (e.g. "earning" does not
    match "EarnIn"). A trailing plural or possessive "s" is tolerated
    ("acorns", "dave's"), and the words of a multi-word keyword may be written
    with or without whitespace between them ("Roar Money" / "roarmoney").

    params:
        query: the text to be checked (a search query or a generated fact)
        keywords: optional iterable of keywords to check against; defaults to
            PRODUCT_KEYWORDS + PRODUCT_ACTION_KEYWORDS + COMPETITOR_KEYWORDS

    returns:
        True if the text mentions any of the keywords
        False otherwise (including when the text is empty or None)
    """
    if keywords is None:
        keywords = PRODUCT_KEYWORDS + PRODUCT_ACTION_KEYWORDS + COMPETITOR_KEYWORDS
    if not query:
        return False

    # Lower-case the text once instead of once per keyword.
    text = query.lower()
    for keyword in keywords:
        parts = keyword.lower().split()
        if not parts:
            # An empty keyword would match every text; ignore it.
            continue
        # Optional whitespace between the words covers both the spaced and
        # the collapsed spelling of a multi-word keyword.
        body = r"\s*".join(re.escape(part) for part in parts)
        # (?<!\w) / (?!\w) act as word boundaries; the optional group accepts
        # a plural or possessive ending so "acorns" or "dave's" still match.
        pattern = r"(?<!\w)" + body + r"(?:['’]?s)?(?!\w)"
        if re.search(pattern, text):
            return True
    return False

In [48]:
# GENERATION FUNCTIONS
def get_gpt_response(prompt, temperature=1.0, model = "gpt-3.5-turbo"):
    """
    Send a single user message to the OpenAI chat completion API.
    GPT-3.5-turbo is the default model for generating content;
    for offline testing, GPT 4 is used to generate the evaluation score.

    params:
        prompt: the prompt to be used for generating content
        temperature: the sampling temperature passed to the API
        model: the name of the chat model passed to the API

    returns:
        response: the full response returned by the API
        response_content: the message content of the first choice in the response
    """
    # The prompt is sent as the only message, with the "user" role.
    response = openai.ChatCompletion.create(
        messages=[{"role": "user", "content": prompt}],
        temperature=temperature,
        model=model,
    )

    first_choice = response["choices"][0]
    return response, first_choice["message"]["content"]


def main_generate_fact (query):
    """
    Generate the raw fact response for a search query.

    Fills ENCYCLOPEDIA_PROMPT with the query, sends it through
    get_gpt_response with that function's default model and temperature, and
    returns the message content unchanged. No keyword check and no
    post-processing happen here.

    NOTE(review): this function has no code path that explicitly returns None
    and does not call check_keywords. Confirm whether a keyword pre-check on
    the query was intended before generation.

    params:
        query: the search query to generate a fact for

    returns:
        fact: the raw response content from GPT
    """
    # Only the message content is needed; the full API response is discarded.
    _, fact = get_gpt_response(ENCYCLOPEDIA_PROMPT.format(search_query=query))
    return fact

In [49]:
# POST-PROCESSING FUNCTIONS

def remove_emojis(text):
    """
    Remove all emojis from the text.

    Uses emoji.replace_emoji so that emoji made of several code points
    (flags, ZWJ sequences, skin-tone and variation-selector sequences) are
    removed as whole units. Looking up one character at a time in
    emoji.EMOJI_DATA leaves parts of such sequences behind in the text.

    params:
        text: the text to be cleaned

    returns:
        cleaned_text: the text with every emoji removed
    """
    return emoji.replace_emoji(text, replace='')

def remove_unnecessary_char(text):
    """
    Remove leading and trailing spaces and newline characters, then remove every
    exclamation mark (!) and quotation mark (", “, ”) from the text.

    Any whitespace directly before or after a removed character is removed
    with it, so 'Wow ! nice' becomes 'Wownice'.
    NOTE(review): this joins the neighbouring words together; confirm that is
    the intended result for punctuation in the middle of a sentence.

    params:
        text: the text to be cleaned

    returns:
        cleaned_text: the cleaned text
    """
    # Only spaces and newlines are stripped from the ends (tabs are kept).
    stripped = text.strip(" \n")
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on current Python versions. One
    # character class replaces the four separate substitutions.
    return re.sub(r'\s*[!"“”]\s*', "", stripped)

def extract_generation_output(text):
    """
    Extract the generated fact, source and type of fact from the response content

    params:
        text: the response content, expected in the form
            generated_fact=...; source=...; fact_type=...

    returns:
        (generated_fact, source, fact_type) with surrounding whitespace
        stripped, when all three fields are present and non-empty
        None otherwise; a message naming the problem is printed
    """
    field_patterns = (
        ("generated_fact", GENERATED_FACT_PATTERN),
        ("source", SOURCE_PATTERN),
        ("fact_type", FACT_TYPE_PATTERN),
    )

    if not isinstance(text, str):
        print("Error in extracting generation output")
        print(f"expected a string response, got {type(text).__name__}")
        return None

    values = []
    for field, pattern in field_patterns:
        match = re.search(pattern, text)
        value = match.group(1).strip() if match else ""
        if not value:
            # Report which field failed rather than a generic attribute
            # error, and make the None result explicit for callers.
            print("Error in extracting generation output")
            print(f"missing or empty field: {field}")
            return None
        values.append(value)

    return tuple(values)

def check_entity_words (text):
    """
    Check whether spaCy's named-entity recognition finds an organisation in the text.

    params:
        text: the text to be checked

    returns:
        True if at least one entity labelled ORG is found
        False otherwise
    """
    # Reuse the pipeline already loaded into NER at notebook start instead of
    # calling spacy.load on every invocation.
    doc = NER(text)
    # Compare with ==: `in ('ORG')` is a substring test against the string
    # 'ORG' (the parentheses do not make a tuple), not a membership test.
    return any(ent.label_ == "ORG" for ent in doc.ents)

def post_process_fact(text):
    """
    Clean the raw response, extract its fields and compute the hide flag.

    params:
        text: the raw response text to be processed

    returns:
        generated_fact: the extracted fact (None if extraction failed)
        source: the extracted source (None if extraction failed)
        type_of_fact: the extracted fact type (None if extraction failed)
        flag: True if the fact should be hidden, i.e. it contains an ORG
            entity or a restricted keyword, or no fact could be extracted
    """
    cleaned = remove_emojis(remove_unnecessary_char(text))

    extracted = extract_generation_output(cleaned)
    if extracted is None:
        # extract_generation_output returns None when the response is not in
        # the expected format; unpacking that would raise TypeError. There is
        # nothing usable to show, so report the result as hidden.
        return None, None, None, True

    generated_fact, source, type_of_fact = extracted
    entity_flag = check_entity_words(generated_fact)
    keywords_flag = check_keywords(generated_fact)

    return generated_fact, source, type_of_fact, entity_flag or keywords_flag


In [50]:
# EVALUATION FUNCTIONS - NOT TO BE DEPLOYED FOR PRODUCTION

def extract_evaluation_results(evaluation_response):
    """
    Extract the evaluation metrics and their results from the evaluation response

    params:
        evaluation_response: the response text, expected to contain
            result_rating=<digits>, result_relevance=<0 or 1>,
            source_credibility=<0 or 1> and rating_reason=<text>

    returns:
        r1: result_rating (string of digits)
        r2: result_relevance ("0" or "1")
        r3: source_credibility ("0" or "1")
        r4: rating_reason (text up to the end of the line; may be empty)

    raises:
        ValueError: if any of the four fields is missing from the response
    """
    field_patterns = (
        ("result_rating", r"result_rating=([0-9]+)"),
        ("result_relevance", r"result_relevance=([0-1])"),
        ("source_credibility", r"source_credibility=([0-1])"),
        ("rating_reason", r"rating_reason=(.*)"),
    )

    # Parse once. Retrying in a loop cannot help because the input never
    # changes; it only spun forever when rating_reason was empty.
    results = []
    for field, pattern in field_patterns:
        match = re.search(pattern, evaluation_response)
        if match is None:
            raise ValueError(f"Evaluation response is missing '{field}'")
        results.append(match.group(1))
    return tuple(results)


def main_evaluate_fact(query, fact):
    """
    Evaluate the generated fact using GPT-4 with the following criteria:
        - Is the fact generated related to the query?
        - Is the source of the fact credible?

    NOTE(review): EVALUATION_PROMPT is not assigned in any active cell visible
    in this notebook (it appears only inside commented-out archived code), so
    it must be defined before this function is called.

    params:
        query: the query to be checked
        fact: the generated fact

    returns:
        result_rating: the rating of the fact generated
        result_relevance: the relevance of the fact generated
        source_credibility: the credibility of the source of the fact generated
        rating_reason: the reasoning of the rating
        All four are None when the model returns an empty response.
    """
    prompt = EVALUATION_PROMPT.format(search_query=query, fact=fact)
    _, evaluation_response_content = get_gpt_response(prompt, model="gpt-4")
    if not evaluation_response_content:
        # With an empty response there is nothing to parse; return explicit
        # placeholders instead of reaching the return with unbound names.
        return None, None, None, None
    return extract_evaluation_results(evaluation_response_content)

### Single input test

In [54]:
# Ask for a query, print the raw model response, then print the post-processed
# result as a JSON string.
query = input("Enter query: ")
fact = main_generate_fact(query)
print(fact)
if fact is not None:
    generated_fact, source, type_of_fact, flag = post_process_fact(fact)
    # hide_flag carries the flag returned by post_process_fact.
    json_response = dict(
        query=query,
        fact=generated_fact,
        source=source,
        fact_type=type_of_fact,
        hide_flag=flag,
    )
    output = json.dumps(json_response)
    print(output)


generated_fact=To apply for a loan, it is important to have a good credit score. Lenders often use credit scores to determine loan eligibility and interest rates.; source=www.investopedia.com; fact_type=advice
{"query": "how to apply loan", "fact": "To apply for a loan, it is important to have a good credit score. Lenders often use credit scores to determine loan eligibility and interest rates.", "source": "www.investopedia.com", "fact_type": "advice", "hide_flag": false}


### Scope Test - Run separately

#### Test Dataset used : Google's Top Financial and Economy Search terms in the Past 5 years

In [None]:
# df_dataset = pd.read_csv('../../input/search_sample_queries.csv')
# #only get random 50 queries
# df_dataset = df_dataset.sample(n=50)
# df_dataset.head()

In [None]:
# for index, row in df_dataset.iterrows():
#     query = row['Top 100 Queries & Suggestions']
#     category = 'Universal Search Sample'
#     if query in df_dataset['Query'].values:
#         continue
#     fact,flag = main_generate_fact(query)
#     if fact is not None:
#         # Evaluation
#             # result_rating, result_relevance, source_credibility, rating_reason = main_evaluate_fact(query, fact)
#         if flag == 0:
#             generated_fact = re.findall(GENERATED_FACT_PATTERN, fact)[0]
#             source = re.findall(SOURCE_PATTERN, fact)[0]
#             type_of_fact = re.findall(TYPE_OF_FACT_PATTERN, fact)[0]
#         else:
#             generated_fact = "Query contains restricted keywords"
#             source = None
#             type_of_fact = None
#         json_response = {
#             "query": query,
#             "fact": generated_fact, 
#             "source": source,
#             "fact_type": type_of_fact,
#             "hide_flag": flag
#         } 
#         output = json.dumps(json_response)
#         print(output)


In [None]:
# # calculate the length of query
# df_dataset['Query_Length'] = df_dataset['Query'].str.len()
# df_dataset

In [None]:
# # df.to_csv('../../output/financial_economy_query_dataset_output.csv', index=False)
# df_dataset.to_csv('../../output/moneylion_search_queries_output.csv', index=False)

In [None]:
# ARCHIVED CONSTANTS

# ENCYCLOPEDIA_PROMPT_2 = """
# A user searches the following query on the MoneyLion app
# search_query : {search_query}
# Instructions:
# Share a fun, one liner informative financial fact relating to {search_query} in under 50 words.
# This fact can be a piece of advice, a fun fact or a statistic.

# Rules:
# If the query is a name of a company or brand, generate a financial related fact that is based on the general category of the company or brand.
#  Example: If a user searches for "Starbucks", generate a financial related fact about coffee.
# Do not include the name of the company or brand in the fact.
# Provide the source of the fact such as the name of the website or the name of the book.
# Do not generate a fact that can negatively impact MoneyLion's brand or reputation.
# Do not provide alternatives to MoneyLion's products or services.

# Output criteria:
#     - return final answer using 3 variables "generated_fact", "source" and "fact_type"
#     - compute "generated_fact" as: generated fact relating to {search_query}
#     - compute "source" as: source of the fact
#     - compute "fact_type" as: type of fact (fun fact, statistic, advice)
#     - output final answer in the following format:
#         generated_fact=(input value of generated_fact here); source=(input value of source here); fact_type=(input value of fact_type here)
# let's think step by step
# """

# EVALUATION_PROMPT = """
# Here's is a generated fact about {search_query}:
# {fact}  

# Please evaluate the following sentence for the following criterias:
# -Is the fact generated a financial and related to {search_query}?
# -Is the source of the fact credible?


# Output criteria:
#     - return final answer using 4 variables "result_rating", "result_relevance", "source_credibility" and "rating_reason"
#     - compute "result_rating" as: Rate relevance of fact on the scale of 0 to 10, with 0 being not financial fact, and 10 being a useful financial fact
#     - compute "result_relevance" as: If the fact is relevant to {search_query}, return 1. Else return 0
#     - compute "source_credibility" as: If the source of the fact is a credible source, return 1. Else return 0
#     - compute "rating_reason" as: Reasonings of the given score for "result_rating", "result_relevance" and "source_credibility" in detail (in 1-2 sentences)
#     - output final answer in the following format: 
#         result_rating=(input value of result_rating here); result_relevance=(input value of result_relevance here); source_credibility=(input value of source_credibility here); rating_reason=(input value of rating_reason here)

# let's think step by step
# """

# ENCYCLOPEDIA_PROMPT = """
# A user searches the following query on the MoneyLion app
# search_query : {search_query}
# Instructions:
# Generate a financial related fact relating to the search query in under 50 words.
# This fact can be a piece of advice, a fun fact or a statistic.
# Provide the source of the fact such as the name of the website or the name of the book.

# The output must adhere to the following rules:
#     Do not produce a fact that can negatively impact MoneyLion's brand or reputation.
#     Do not provide alternatives to MoneyLion's products or services.
#     The output must not contain any company names or brand names.

# Output criteria:
#     - return final answer using 4 variables "generated_fact", "category", "source" and "fact_type"
#     - compute "generated_fact" as: generated fact relating to category of {search_query}
#     - compute "category" as: category of {search_query}
#     - compute "source" as: source of the fact
#     - compute "fact_type" as: type of fact (fun fact, statistic, advice)
#     - output final answer in the following format:
#         generated_fact=(input value of generated_fact here); category=(input value of category here); source=(input value of source here); fact_type=(input value of fact_type here)

# Use the examples below as a reference to generate the output.
# Examples:
# search_query : "Pizza Hut"
# output : generated_fact=The global fast food industry is estimated to reach a value of $931 billion by 2027, driven by factors like convenience, affordability, and changing consumer preferences.; category= "fast food"; source=MarketResearch.com - "Global Fast Food Market Size, Share & Trends Analysis Report By Product Type (Burger/Sandwich, Pizza/Pasta, Chicken, Asian/Latin American Food, Sea-Food, Others), And Segment Forecasts, 2020 - 2027"; fact_type=Statistic

# search_query : "Starbucks"
# output : generated_fact=Coffee is the second most traded commodity in the world after oil, with around 2.25 billion cups of coffee consumed globally every day.; category = "coffee"; source=International Coffee Organization; fact_type=Statistic

# search_query : "Chime"
# output : generated_fact="On average, it takes only 3 minutes to open a bank account online."; category="banking"; source="ValuePenguin"; fact_type="statistic"
# """