<a href="https://colab.research.google.com/github/zackives/upenn-cis-2450/blob/main/lab4_part2_SyntheticDataGeneration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install langchain openai chromadb tiktoken langchain_experimental langchain-openai

Collecting langchain_experimental
  Downloading langchain_experimental-0.3.0-py3-none-any.whl.metadata (1.7 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.2.0-py3-none-any.whl.metadata (2.6 kB)
Collecting langchain-community<0.4.0,>=0.3.0 (from langchain_experimental)
  Downloading langchain_community-0.3.0-py3-none-any.whl.metadata (2.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading pydantic_settings-2.5.2-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community<0.4.0,>=0.3.0->langchain_experimental)
  Downloading marshmallow-3.22.0-py3-none-any.whl.metadata (7.2 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchai

In [None]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain_openai import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from typing import List
import os
import json

In [None]:
# Provide the OpenAI API key without writing it into the notebook or its saved output.
# A key that is already present in the environment is kept; otherwise it is read
# from a hidden prompt, so nothing secret is echoed or committed with the file.
import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("OpenAI API key: ")

### Synthetic Data Generation

#### Step 1: Define your data schema

In [None]:
# Define the schema for our synthetic data using Pydantic.
# This model is the single source of truth for a generated record: it is passed to
# PydanticOutputParser and to create_openai_data_generator elsewhere in this notebook.
# NOTE(review): the field descriptions are presumably surfaced to the LLM through the
# parser's format instructions, so rewording them changes the prompt — confirm before editing.
class Person(BaseModel):
    # Free-text full name of the fictional person.
    name: str = Field(description="A fictional person's full name")
    # Integer age; the ge/le constraints restrict valid values to 18..80 inclusive.
    age: int = Field(description="The person's age (between 18 and 80)", ge=18, le=80)
    # Free-text job title or profession.
    occupation: str = Field(description="The person's job or profession")
    # Free-text hobby or interest.
    hobby: str = Field(description="A hobby or interest of the person")


In [None]:
# Create the output parser for the Person schema.
# It is used later in the notebook in two ways: get_format_instructions() feeds the
# prompt templates, and parse() turns the model's extracted text into a Person.
output_parser = PydanticOutputParser(pydantic_object=Person)


#### Step 2: Create data samples

In [None]:
# Seed records for few-shot prompting and dataset generation.
# Every entry is a dict with a single "example" key whose value is a four-line
# "Label: value" profile (Name, Age, Occupation, Hobby) with no trailing newline.
examples = [
    {
        "example": (
            "Name: Alex Johnson\n"
            "Age: 32\n"
            "Occupation: Data Scientist\n"
            "Hobby: Mountain biking"
        )
    },
    {
        "example": (
            "Name: Samantha Lee\n"
            "Age: 45\n"
            "Occupation: Marketing Director\n"
            "Hobby: Oil painting"
        )
    },
    {
        "example": (
            "Name: Carlos Rodriguez\n"
            "Age: 28\n"
            "Occupation: Freelance Writer\n"
            "Hobby: Salsa dancing"
        )
    },
]

#### Step 3: Provide a prompt template

In [None]:
# Zero-shot prompt: the model sees no worked examples, only the parser's
# format instructions followed by a "Person:" cue.
# The template text starts and ends with a newline; keep both when editing.
template = (
    "\n"
    "Generate synthetic data for a fictional person with the following attributes:\n"
    "{format_instructions}\n"
    "\n"
    "Person:\n"
)

# format_instructions is bound up front as a partial variable, which leaves the
# template with no runtime inputs (hence the empty input_variables list).
prompt = PromptTemplate(
    input_variables=[],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
    template=template,
)

In [None]:
# Create a few-shot prompt template.
# Each entry of `examples` is rendered verbatim through its single "example" field.
# The per-example template is given its own name so that the OPENAI_TEMPLATE imported
# from langchain_experimental in the imports cell is not silently rebound by this cell.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")

# `subject` and `extra` are the inputs filled in at generation time; the prefix and
# suffix are the stock few-shot strings imported from langchain_experimental.
few_shot_prompt = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=example_prompt,
)

#### Step 4: Create data generator

In [None]:
# Chat model shared by every generation and extraction cell in this notebook.
# You can customize your LLM with a different model and parameters; no model name is
# passed here, so whichever default the library selects is used.
llm = ChatOpenAI(temperature=0.8)

In [None]:
# Generate synthetic data with zero-shot prompt
def generate_synthetic_data(num_samples: int = 5) -> List[Person]:
    """Generate up to ``num_samples`` Person records with the zero-shot prompt.

    The notebook-level ``llm`` is invoked once per requested sample with the
    rendered ``prompt``. Each reply is decoded as JSON, its ``age`` is coerced
    to ``int``, and the result is used to build a ``Person``. A reply that
    cannot be decoded or turned into a ``Person`` is reported with ``print``
    and skipped, so one malformed reply never discards the samples that were
    already collected.

    Args:
        num_samples: Number of LLM calls to make. Zero or a negative value
            makes no calls.

    Returns:
        The successfully built ``Person`` objects in generation order; the
        list may be shorter than ``num_samples``.
    """
    synthetic_data = []
    # The prompt takes no runtime inputs, so it is rendered once for all calls.
    rendered_prompt = prompt.format()
    for _ in range(num_samples):
        result = llm.invoke(rendered_prompt)
        # Extract the content from the AIMessage; fall back to str() for other reply types
        result_content = result.content if hasattr(result, 'content') else str(result)
        try:
            # Parse the result as JSON
            parsed_json = json.loads(result_content)
            # Ensure age is an integer
            parsed_json['age'] = int(parsed_json['age'])
            # Create a Person object
            person = Person(**parsed_json)
        except json.JSONDecodeError:
            # Must come before the ValueError handler: JSONDecodeError subclasses ValueError.
            print(f"Failed to parse result: {result_content}")
        except (KeyError, TypeError, ValueError) as e:
            # KeyError: the reply has no "age" key.
            # TypeError: the reply is not a JSON object, or age is null / not convertible.
            # ValueError: non-numeric age text, or Person rejected the values
            # (validation errors were already expected here by the original handler).
            print(f"Failed to create Person object: {e}")
        else:
            synthetic_data.append(person)
    return synthetic_data

In [None]:
# Generate two zero-shot samples and print each one as indented JSON.
# Replies that could not be parsed are skipped by generate_synthetic_data, so fewer
# than two records may be printed.
synthetic_data = generate_synthetic_data(2)
for person in synthetic_data:
    print(person.model_dump_json(indent=2))

{
  "name": "Alice Jones",
  "age": 30,
  "occupation": "Software Engineer",
  "hobby": "Photography"
}
{
  "name": "Alice Smith",
  "age": 30,
  "occupation": "Software Engineer",
  "hobby": "Photography"
}


In [None]:
# Alternatively, you can create the data generator with the few-shot prompt.
# The generator is built from the same Person schema and the same llm as the
# zero-shot path; only the prompt differs (few_shot_prompt instead of prompt).
synthetic_data_generator = create_openai_data_generator(
    output_schema=Person,
    llm=llm,
    prompt=few_shot_prompt,
)

In [None]:
# Ask the few-shot generator for person profiles. `subject` and `extra` are the two
# input variables declared on few_shot_prompt; `runs` is the requested number of runs.
# NOTE(review): the saved output of the next cell lists 10 records although runs=5 —
# presumably the generator keeps results from earlier calls, so re-running this cell
# appends to them. Confirm, and re-create the generator first if exactly 5 are wanted.
synthetic_results = synthetic_data_generator.generate(
    subject="person profile",
    extra="all the information must be chosen at random.",
    runs=5,
)

In [None]:
# Display the few-shot results. Compared with the zero-shot samples printed earlier,
# the records produced by the few-shot generator show more diversity.
synthetic_results

[Person(name='Emily White', age=29, occupation='Graphic Designer', hobby='Photography'),
 Person(name='Alice Smith', age=32, occupation='Software Engineer', hobby='Hiking'),
 Person(name='Sophia Johnson', age=25, occupation='Marketing Manager', hobby='Painting'),
 Person(name='Ethan Brown', age=35, occupation='Teacher', hobby='Cooking'),
 Person(name='Olivia Davis', age=28, occupation='Graphic Designer', hobby='Photography'),
 Person(name='Isaac Martinez', age=30, occupation='Software Engineer', hobby='Hiking'),
 Person(name='Ava Johnson', age=42, occupation='Marketing Manager', hobby='Traveling'),
 Person(name='Sophia Smith', age=35, occupation='Teacher', hobby='Yoga'),
 Person(name='Emily Davis', age=25, occupation='Data Analyst', hobby='Painting'),
 Person(name='Oliver Brown', age=28, occupation='Software Engineer', hobby='Playing guitar')]

### Dataset Generation

In [None]:
from langchain_experimental.synthetic_data import (
    DatasetGenerator,
)

In [None]:
# Create a dataset generator that rewrites each structured example as free-form text.
# The second argument is a dict of sentence preferences; the saved output shows it
# echoed back under "preferences" in every record, next to "fields" and "text".
# NOTE(review): the saved texts are far shorter than 500 characters, so
# "minimal length" is presumably only a soft hint to the model — confirm.
generator = DatasetGenerator(llm, {"style": "informal", "minimal length": 500})
dataset = generator(examples)
dataset


[{'fields': {'example': 'Name: Alex Johnson\nAge: 32\nOccupation: Data Scientist\nHobby: Mountain biking'},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': "Hey, have you met Alex Johnson? He's a 32-year-old data scientist who spends his free time mountain biking."},
 {'fields': {'example': 'Name: Samantha Lee\nAge: 45\nOccupation: Marketing Director\nHobby: Oil painting'},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Samantha Lee, a 45-year-old Marketing Director, spends her free time indulging in the artistic hobby of oil painting, where she creates vivid and expressive works of art that bring a sense of color and life to her everyday routine.'},
 {'fields': {'example': 'Name: Carlos Rodriguez\nAge: 28\nOccupation: Freelance Writer\nHobby: Salsa dancing'},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Meet Carlos Rodriguez, a 28-year-old freelance writer who spends his free time salsa dancing. His passi

#### Extraction from generated data

In [None]:
# Create an extraction prompt template.
# `text` is the only runtime input; the parser's format instructions are bound once
# here as a partial variable, mirroring how the zero-shot prompt is built.
extract_prompt = PromptTemplate(
    template="Extract fields from a given text.\n{format_instructions}\n{text}\n",
    input_variables=["text"],
    partial_variables={"format_instructions": output_parser.get_format_instructions()},
)

In [None]:
# Use the pydantic schema parser to turn the first generated text back into a Person.
_input = extract_prompt.format_prompt(text=dataset[0]["text"])
# Call the chat model through invoke(), the same entry point generate_synthetic_data
# uses, instead of the legacy direct-call form llm(...).
output = llm.invoke(_input.to_string())
print(output)
# The reply is a message object; the parser needs only its text content.
output_content = output.content if hasattr(output, 'content') else str(output)
parsed = output_parser.parse(output_content)
parsed

content='{\n  "name": "Alex Johnson",\n  "age": 32,\n  "occupation": "data scientist",\n  "hobby": "mountain biking and exploring the great outdoors"\n}' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 40, 'prompt_tokens': 301, 'total_tokens': 341, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-3.5-turbo-0125', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None} id='run-da30fd62-6787-4912-b2cb-d05d6ecc954d-0' usage_metadata={'input_tokens': 301, 'output_tokens': 40, 'total_tokens': 341}


Person(name='Alex Johnson', age=32, occupation='data scientist', hobby='mountain biking and exploring the great outdoors')