### Build a Data Model

In [26]:
# from pydantic.v1 import BaseModel

# class SalesTransaction(BaseModel):
#     transaction_id: int
#     customer_id: int
#     customer_name: str
#     product_id: int
#     product_name: str
#     product_category: str 
#     quantity: int 
#     unit_price: float 
#     toatl_price: float 
#     transaction_date: str

from pydantic import BaseModel, Field, ConfigDict
from datetime import date

class SalesTransaction(BaseModel):
    transaction_id: int = Field(..., description="Unique transaction identifier")
    customer_id:    int = Field(..., description="ID of the customer")
    customer_name:  str = Field(..., description="Full name of the customer")
    product_id:     int = Field(..., description="ID of the product sold")
    product_name:   str = Field(..., description="Name of the product sold")
    product_category: str = Field(..., description="Category of the product")
    quantity:       int   = Field(..., gt=0, description="Number of units sold")
    unit_price:     float = Field(..., gt=0, description="Price per single unit")
    total_price:    float = Field(..., gt=0, description="quantity × unit_price")
    transaction_date: date = Field(..., description="Date of the transaction")

    # Optional V2 config: forbid extra keys, etc.
    model_config = ConfigDict(
        extra="forbid",        # error if unexpected fields are passed
        populate_by_name=True, # allow aliases (if you add any Field(alias=…))
    )


### Build sample data fro synthetic generation in Python

In [38]:
sample_data = [
    {
        "example":
        '''Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'''
    },
    {
        "example":
        '''Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'''
    },
    {
        "example":
        '''Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'''
    }
]

In [39]:
print(sample_data)

[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}]


### Build a prompt template for data generation LLM

In [29]:
from langchain_core.prompts import PromptTemplate 

OPENAI_TEMPLATE = PromptTemplate(imput_variables=["Sample_data"],
                                 template="{sample_data}"
                                 )

print(OPENAI_TEMPLATE)


input_variables=['sample_data'] input_types={} partial_variables={} template='{sample_data}'


In [30]:
!pip install langchain_experimental



In [31]:
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)

from langchain_core.prompts.few_shot import FewShotPromptTemplate

prompt_template = FewShotPromptTemplate(
    prefix = SYNTHETIC_FEW_SHOT_PREFIX,
    examples = sample_data,
    suffix = SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subect", "extra"],
    example_prompt = OPENAI_TEMPLATE,
)

In [32]:
print(prompt_template)

input_variables=['extra', 'subject'] input_types={} partial_variables={} examples=[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}] example_prompt=PromptTemplate(input_variables=['sample_data'], input_types={}, partial_variables={}, template='{sample_data}') suffix='Now you generate synthetic data about {subject}. Make sure to {extra}:' prefix='Thi

### Build a data generator with LangChain OpenAI Python SDK

In [34]:
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv

load_dotenv()

generator = create_openai_data_generator(
    output_schema=SalesTransaction,        # ← your pure Pydantic V2 model
    llm=ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model="gpt-4o-mini-2024-07-18",
    ),
    prompt=prompt_template,
)

print(generator)


template=FewShotPromptTemplate(input_variables=['extra', 'subject'], input_types={}, partial_variables={}, examples=[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}], example_prompt=PromptTemplate(input_variables=['sample_data'], input_types={}, partial_variables={}, template='{sample_data}'), suffix='Now you generate synthetic data about {subject}

In [36]:
print(generator)

template=FewShotPromptTemplate(input_variables=['extra', 'subject'], input_types={}, partial_variables={}, examples=[{'example': 'Transaction ID: 1001, Customer ID: 501, Customer Name: Alice J, Product ID: 201, Product Name: Hello Coding, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1002, Customer ID: 502, Customer Name: Bob T, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149,Total Price = $298, Transaction Date: 2030-01-02'}, {'example': 'Transaction ID: 1003, Customer ID: 503, Customer Name: S.T. J, Product ID: 202, Product Name: Learn to Code, Product Category: eLearning, Quantity: 2, Unit Price: $149, Total Price = $298, Transaction Date: 2030-01-02'}], example_prompt=PromptTemplate(input_variables=['sample_data'], input_types={}, partial_variables={}, template='{sample_data}'), suffix='Now you generate synthetic data about {subject}

### Generate synthetic data with LangChain OpenAI

In [47]:
from langchain.prompts import FewShotPromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI
import os
from dotenv import load_dotenv

load_dotenv()

# 1) Prepare one or more text examples for formatting
examples = [
    {
        "example": """\
Transaction ID: 1
Customer ID: 42
Customer Name: Alice Smith
Product ID: 101
Product Name: Blue Widget
Product Category: Widgets
Quantity: 2
Unit Price: 19.99
Total Price: 39.98
Transaction Date: 2025-05-08"""
    }
]

# 2) Build a FewShotPromptTemplate
prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,  # just “{example}”
)

# 3) Create the generator
generator = create_openai_data_generator(
    output_schema=SalesTransaction,
    llm=ChatOpenAI(
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        model="gpt-4o-mini-2024-07-18",
    ),
    prompt=prompt_template,  # ← a BasePromptTemplate instance
)
print(generator)


template=FewShotPromptTemplate(input_variables=['extra', 'subject'], input_types={}, partial_variables={}, examples=[{'example': 'Transaction ID: 1\nCustomer ID: 42\nCustomer Name: Alice Smith\nProduct ID: 101\nProduct Name: Blue Widget\nProduct Category: Widgets\nQuantity: 2\nUnit Price: 19.99\nTotal Price: 39.98\nTransaction Date: 2025-05-08'}], example_prompt=PromptTemplate(input_variables=['example'], input_types={}, partial_variables={}, template='{example}'), suffix='Now you generate synthetic data about {subject}. Make sure to {extra}:', prefix='This is a test about generating synthetic data about {subject}. Examples below:') llm=None results=[] llm_chain=LLMChain(verbose=False, prompt=FewShotPromptTemplate(input_variables=['extra', 'subject'], input_types={}, partial_variables={}, examples=[{'example': 'Transaction ID: 1\nCustomer ID: 42\nCustomer Name: Alice Smith\nProduct ID: 101\nProduct Name: Blue Widget\nProduct Category: Widgets\nQuantity: 2\nUnit Price: 19.99\nTotal Pric

In [48]:
results = generator.generate(
    subject="transactions",
    sample_data=[],  # or your real examples list
    extra="Randomize everything!",
    runs=20,
)

In [49]:
print(results)

[SalesTransaction(transaction_id=107, customer_id=58, customer_name='John Doe', product_id=203, product_name='Green Gadget', product_category='Gadgets', quantity=4, unit_price=15.99, total_price=63.96, transaction_date=datetime.date(2025, 5, 9)), SalesTransaction(transaction_id=768, customer_id=43, customer_name='Jane Smith', product_id=127, product_name='Blue Widget', product_category='Widgets', quantity=2, unit_price=22.5, total_price=45.0, transaction_date=datetime.date(2025, 4, 15)), SalesTransaction(transaction_id=543, customer_id=57, customer_name='Alice Johnson', product_id=102, product_name='Red Gadget', product_category='Gadgets', quantity=3, unit_price=18.75, total_price=56.25, transaction_date=datetime.date(2025, 5, 2)), SalesTransaction(transaction_id=732, customer_id=45, customer_name='Mark Stevens', product_id=258, product_name='Blue Widget', product_category='Widgets', quantity=4, unit_price=22.5, total_price=90.0, transaction_date=datetime.date(2025, 8, 15)), SalesTrans