In [106]:
import os
%pip install python-dotenv
%pip install -U langchain langchain_experimental langchain-openai


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [107]:
from dotenv import load_dotenv
# Load the .env file into the process environment, then read the OpenAI
# credential from it; api_key is None when OPENAI_KEY is not set.
load_dotenv()
api_key = os.environ.get("OPENAI_KEY")

In [108]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import ChatOpenAI
from pydantic import BaseModel
from typing import List, Dict

In [109]:
# Schema for one synthetic spending record: one user's amounts per category,
# plus savings, for a single month. The generation prompt in this notebook
# asks for dollar values.
# NOTE(review): the few-shot examples in this notebook also carry a
# "Groceries" value that has no matching field here -- confirm whether that
# omission is intended.
# No class docstring is added on purpose: it would presumably be emitted as
# the schema description handed to the model -- TODO confirm.
class SpendingSchema(BaseModel):
    user_id: str  # identifier, e.g. "U001" in the few-shot examples
    month: str  # month and year as text, e.g. "January 2023"
    housing: int  # housing spend
    bills_utilities: int  # "Bills & Utilities" in the few-shot examples
    dining: int  # dining spend
    transport: int  # transport spend
    fitness: int  # fitness spend
    travel: int  # travel spend
    entertainment: int  # entertainment spend
    savings: int  # amount saved in the month


In [110]:

# Few-shot seed rows: each entry is a dict with a single "example" key whose
# value is one comma-separated spending record.
examples = [
    {"example": seed_row}
    for seed_row in (
        "User ID: U001, Month: January 2023, Housing: 800, Bills & Utilities: 210, Groceries: 400, Dining: 400, Transport: 100, Fitness: 60, Travel: 400, Entertainment: 300, Savings: 1500",
        "User ID: U002, Month: January 2023, Housing: 1300, Bills & Utilities: 150, Groceries: 600, Dining: 200, Transport: 250, Fitness: 20, Travel: 100, Entertainment: 700, Savings: 600",
    )
]

In [111]:

# Template that renders each few-shot entry verbatim through its "example" key.
# Note: this rebinds the OPENAI_TEMPLATE name imported earlier in the notebook.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt: library-provided prefix, the seed examples, then the
# library-provided suffix; callers supply `subject` and `extra`.
prompt_template = FewShotPromptTemplate(
    example_prompt=OPENAI_TEMPLATE,
    examples=examples,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)


In [112]:
# Combine the few-shot prompt, the output schema and the chat model
# (temperature 1) into a synthetic data generator.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    output_schema=SpendingSchema,
    llm=ChatOpenAI(api_key=api_key, temperature=1),
)

In [113]:
# Request 10 synthetic spending records. `extra` carries the free-text
# guidance for the model: realism, the date cut-off, outliers, and the
# allowed range for each category.
# The two string pieces are joined by implicit concatenation; the first ends
# with a space so the sentences do not run together ("areas.You") in the
# prompt that is sent to the model.
synthetic_results = synthetic_data_generator.generate(
    subject="spending_data",
    extra=(
        "Generate samples with dollar values for the transactions given that represent what people would spend and save. Make sure the year and month is before September 2024 to make it realistic. You can also add outliers in certain categories to represent people who go extreme in some areas. "
        "You can make housing between 600 and 10000, bills and utilities between 50 and 1000, dining between 50 and 1000, transport between 20 and 500, fitness between 0 and 400, travel between 0 and 4000, entertainment between 10 and 3000, and savings between 0 and 3000. Make it as realistic as possible"
    ),
    runs=10,
)

In [114]:
# Count the generated records; the generate call in this notebook passes runs=10.
len(synthetic_results)

10

In [115]:
# Display the generated SpendingSchema records for a visual check of the values.
synthetic_results

[SpendingSchema(user_id='U003', month='August 2023', housing=5000, bills_utilities=800, dining=600, transport=400, fitness=200, travel=2000, entertainment=2500, savings=2500),
 SpendingSchema(user_id='U004', month='March 2024', housing=2500, bills_utilities=300, dining=800, transport=350, fitness=50, travel=1500, entertainment=1200, savings=1500),
 SpendingSchema(user_id='U005', month='June 2024', housing=7200, bills_utilities=600, dining=900, transport=300, fitness=250, travel=3500, entertainment=2800, savings=2700),
 SpendingSchema(user_id='U006', month='August 2023', housing=4000, bills_utilities=200, dining=500, transport=150, fitness=75, travel=3000, entertainment=2000, savings=2000),
 SpendingSchema(user_id='U007', month='July 2022', housing=7800, bills_utilities=450, dining=800, transport=200, fitness=150, travel=3700, entertainment=2500, savings=2800),
 SpendingSchema(user_id='U008', month='September 2023', housing=7500, bills_utilities=300, dining=700, transport=180, fitness=1