### Build a Data Model

In [2]:
from pydantic.v1 import BaseModel

class SalesTransaction(BaseModel):
    transaction_id: int
    customer_id: int
    customer_name: str
    product_id: int
    product_name: str
    product_category: str 
    quantity: int 
    unit_price: float 
    toatl_price: float 
    transaction_date: str


### Build sample data fro synthetic generation in Python

In [9]:
sample_data = [
    {
        "example":
        '''Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'''
    },
    {
        "example":
        '''Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'''
    },
    {
        "example":
        '''Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'''
    }
]

In [None]:
print(sample_data)

[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}]


### Build a prompt template for data generation LLM

In [13]:
from langchain_core.prompts import PromptTemplate 

OPENAI_TEMPLATE = PromptTemplate(imput_variables=["Sample_data"],
                                 template="{sample_data}"
                                 )

print(OPENAI_TEMPLATE)


input_variables=['sample_data'] input_types={} partial_variables={} template='{sample_data}'


In [14]:
!pip install langchain_experimental

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.4-py3-none-any.whl.metadata (1.7 kB)
Downloading langchain_experimental-0.3.4-py3-none-any.whl (209 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.2/209.2 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: langchain_experimental
Successfully installed langchain_experimental-0.3.4


In [17]:
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)

from langchain_core.prompts.few_shot import FewShotPromptTemplate

prompt_template = FewShotPromptTemplate(
    prefix = SYNTHETIC_FEW_SHOT_PREFIX,
    examples = sample_data,
    suffix = SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subect", "extra"],
    example_prompt = OPENAI_TEMPLATE,
)

In [19]:
print(prompt_template)

input_variables=['extra', 'subject'] input_types={} partial_variables={} examples=[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}] example_prompt=PromptTemplate(input_variables=['sample_data'], input_types={}, partial_variables={}, template='{sample_data}') suffix='Now you generate synthetic data about {subject}. Make sure to {extra}:' prefix='Thi

### Build a data generator with LangChain OpenAI Python SDK

In [22]:
from langchain_experimental.tabular_synthetic_data.openai import (
    create_openai_data_generator,
)
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")
generator = create_openai_data_generator(
    output_schema=SalesTransaction,
    llm = ChatOpenAI(
        openai_api_key=openai_api_key,  
        model="gpt-4o-mini-2024-07-18"
    ),
    prompt = prompt_template,
)
print(generator)

  warn(


PydanticInvalidForJsonSchema: Cannot generate a JsonSchema for core_schema.PlainValidatorFunctionSchema ({'type': 'with-info', 'function': <bound method BaseModel.validate of <class '__main__.SalesTransaction'>>})

For further information visit https://errors.pydantic.dev/2.11/u/invalid-for-json-schema