In [157]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
import boto3
import os
from dotenv import load_dotenv
import pandas as pd
import random
from langchain_openai.chat_models import AzureChatOpenAI
from langchain_core.messages import AIMessage, BaseMessage, HumanMessage,SystemMessage
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_experimental.synthetic_data import create_data_generation_chain
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.pydantic_v1 import BaseModel, Field

from typing import List

from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_openai import ChatOpenAI
from langchain.output_parsers import PydanticOutputParser



In [2]:
# Load variables from a local .env file so the os.environ lookups below resolve.
load_dotenv()

# S3 client built with an explicit key pair; a missing variable raises KeyError
# here instead of failing later at download time.
s3r = boto3.client(
    "s3",
    aws_access_key_id=os.environ["DEV_ACCESS_KEY"],
    aws_secret_access_key=os.environ["DEV_SECRET_ACCESS_KEY"],
)

### Download test data from dev s3

In [3]:
# Object key = fixed prefix + workbook file name; the object is written to a
# local file with the same name (relative path).
name = "df_test_v1.xlsx"
fullname = f"swapnil/form_selection_core/excel/{name}"
s3r.download_file(
    Bucket="datainsights-shared-coupadev-com",
    Key=fullname,
    Filename=name,
)

## Load test data

In [4]:
# Read the downloaded workbook into a DataFrame and preview the first two rows.
test_data_path = "df_test_v1.xlsx"
test_data = pd.read_excel(test_data_path)
test_data.head(2)

Unnamed: 0,instance,easy_form_widget_response_id,updated_at,easy_form_id,easy_form_widget_id,easy_form_response_id,user_submitted_description,easy_form_widget_response_type,backing_attribute,field_name,...,subject_type,status,easy_form_name,easy_form_model,easy_form_status,easy_form_description,form_type,requistion_line_id,header_id,channel
0,cbre,,2023-11-01 11:09:43,,,,"""Warning Trip Hazard"" Sign 210mm x 148mm",,,,...,,,,,,,freeform,5889876.0,3013667.0,freeform
1,monash,,2023-09-11 00:20:35,,,,DEBDEN DAYPLANNER refil,,,,...,,,,,,,freeform,1091278.0,479534.0,freeform


### Generate synthetic data

In [5]:
def initialize_llm(**kwargs):
    """Build an ``AzureChatOpenAI`` client configured from environment variables.

    The deployment name, API key, endpoint and API version are read from
    ``AZURE_OPENAI_DEPLOYMENT``, ``AZURE_OPENAI_API_KEY``,
    ``AZURE_OPENAI_ENDPOINT`` and ``AZURE_OPENAI_API_VERSION``.

    Args:
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``AzureChatOpenAI`` (this notebook passes ``temperature`` and
            ``streaming``).

    Returns:
        The constructed ``AzureChatOpenAI`` instance.

    Raises:
        KeyError: If any of the four environment variables is not set.
    """
    azure_settings = {
        "deployment_name": os.environ["AZURE_OPENAI_DEPLOYMENT"],
        "api_key": os.environ["AZURE_OPENAI_API_KEY"],
        "azure_endpoint": os.environ["AZURE_OPENAI_ENDPOINT"],
        "api_version": os.environ["AZURE_OPENAI_API_VERSION"],
    }
    return AzureChatOpenAI(**azure_settings, **kwargs)


In [6]:
# Model used for generating the synthetic requests (temperature 0.5).
llm = initialize_llm(temperature=0.5)

In [7]:
# Writing-style instruction for each persona; the text is inserted into the
# generation prompt as the persona description.
personas_attributes = {
    "verbose": "You write longer complete sentences that are friendlier and generally include punctuation.",
    "concise": "You write direct queries with minimal non-essential text. You usually omit capitalization and filler phrases",
    "step-by-step": "You summarize the goal of each step before explaining the detailed instructions.",
    "casual": "You use informal language that may not directly reference all the details",
    "formal": "You like to provide detailed information and formal language.",
    "spelling-mistake": "You often make spelling mistakes",
}

# Persona names in declaration order (dicts preserve insertion order).
all_personas = list(personas_attributes)

In [118]:
# Generation prompt: the model plays an employee writing a one-line procurement
# request for an AI assistant. Template variables:
#   {description}          - item/service description to build the request from
#   {persona_description}  - writing-style instruction (see personas_attributes)
prompt = PromptTemplate.from_template("""You work for a corporate business and as an employee, you want to buy/procure some goods or services on behalf of your organisation.
You will be given a description of item/service from which you need to construct your request. You will put the request into an AI Assitant.
Extract the basic information such as amount, quantity, business justification etc if present, from the given fields in order to create a simple one line request using them.
Use only minimal information as you can provide the details later.

Make the request concise and simple. Use given description judiciously and include important information. Feel free to add more information.
If any additional preferences are given, use them during sentence construction as well.
Instructions:
- You are a person having a persona that matches the given persona description. Act accordingly.
item_or_service_description:
{description}
Persona description:
{persona_description}
Request:""")

In [144]:
# Combine the generator LLM and the prompt into a data-generation chain
# (helper from langchain_experimental.synthetic_data).
generate = create_data_generation_chain(llm, prompt)


In [145]:
# Select the inputs in this cell so it runs on a fresh kernel; otherwise these
# names are only assigned in the "Archive" cell at the end of the notebook and
# the loop fails with NameError on Restart & Run All.
item_description = test_data['user_submitted_description'][0]
persona = random.choice(all_personas)
persona_description = personas_attributes[persona]

# Stream the chain output. After the loop, `chunk` still holds the last item
# yielded, which the reflection step later takes as its input.
for chunk in generate.stream({
    "description": item_description,
    "persona_description": persona_description,
}):
    print(chunk, end="")



In [146]:
# Inspect the last item yielded by the streaming loop.
chunk

 'persona_description': 'You often make spelling mistakes',

### Reflection

In [147]:
# Separate model instance for the reflection (validation) step: temperature 0,
# streaming enabled.
llm_reflection = initialize_llm(temperature=0, streaming=True)

In [193]:
# Reflection prompt: asks the model to assess a generated query against the
# persona description. Template variables: {description}, {persona_description},
# {text} (the generated query) and {format_instructions} (supplied by the
# output parser when the PromptTemplate is built).
# NOTE(review): the double quote that opens the first line of the template is
# never closed, and the template does not state the validation task explicitly.
# The recorded run returned isQueryValid=True together with "No spelling
# mistakes found" for the spelling-mistake persona, so the verdict looks
# unreliable; consider tightening the wording.
reflection_template = """
"You are expert at analysing synthetic queries created to train a chatbot on procurement requests. Following is the instance of generated data. 
for example if the Persona description says that you make spelling mistakes then there should be atleast one spelling mistake in the generated query otherwise the generated query would be invalid.
original item or service description:
{description}
Persona description(person making the request):
{persona_description}
Generated query: {text}.
\n{format_instructions}\n
"""

In [194]:
# Re-check the record that is passed to the reflection chain.
chunk

 'persona_description': 'You often make spelling mistakes',

In [195]:
# Structured verdict produced by the reflection step.
# (Kept as `#` comments rather than a class docstring: pydantic would copy a
# docstring into the JSON schema and thereby change the format instructions.)
class IsQueryValid(BaseModel):
    # True when the generated query follows the persona description.
    isQueryValid: bool = Field(description="Whether the generated query for item/service description accurately follows persona description")
    # Free-text suggestion on how to make the query match the persona.
    observation: str = Field(description="Suggestion to improve the generated query take to make the query accurately follow persona description")


# Set up a parser + inject instructions into the prompt template.
reflection_parser = PydanticOutputParser(pydantic_object=IsQueryValid)

reflection_prompt = PromptTemplate(
    template=reflection_template,
    input_variables=['description', 'persona_description', 'text'],
    # Take the format instructions from `reflection_parser`; a bare `parser`
    # name is not defined anywhere in this notebook and raises NameError on a
    # fresh kernel.
    partial_variables={"format_instructions": reflection_parser.get_format_instructions()},
)

# prompt -> reflection LLM -> parse the reply into an IsQueryValid instance.
reflection_chain = reflection_prompt | llm_reflection | reflection_parser

In [196]:
# Run the reflection chain on the generated record; `chunk` supplies the
# description, persona_description and text inputs.
reflection_result = reflection_chain.invoke(chunk)

In [198]:
# Display the parsed verdict.
reflection_result

IsQueryValid(isQueryValid=True, observation='No spelling mistakes found in the generated query.')

### Archive

In [None]:
# Archived non-streaming variant: take the first description, pick a random
# persona, and call the chain once.
item_description = test_data['user_submitted_description'][0]
persona = random.choice(all_personas)
persona_description = personas_attributes[persona]
# Reference `item_description` here; the name `description` is never assigned
# in this notebook and would raise NameError.
item_description, persona, persona_description

# `persona` is not a template variable of `prompt`; it is passed along as an
# extra input next to the two variables the prompt consumes.
ret = generate.invoke({
    "description": item_description,
    "persona": persona,
    "persona_description": persona_description,
})