# Synthetic Data Generator

In [3]:
# imports

import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr

In [4]:
import anthropic

In [13]:
#!pip install google-generativeai

In [5]:
# Load environment variables in a file called .env
# Print the key prefixes to help with any debugging

load_dotenv(override=True)
openai_api_key = os.getenv('OPENAI_API_KEY')
anthropic_api_key = os.getenv('ANTHROPIC_API_KEY')
google_api_key = os.getenv('GOOGLE_API_KEY')

if openai_api_key:
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")
else:
    print("OpenAI API Key not set")
    
if anthropic_api_key:
    print(f"Anthropic API Key exists and begins {anthropic_api_key[:7]}")
else:
    print("Anthropic API Key not set")

if google_api_key:
    print(f"Google API Key exists and begins {google_api_key[:8]}")
else:
    print("Google API Key not set")

OpenAI API Key exists and begins sk-proj-
Anthropic API Key exists and begins sk-ant-
Google API Key exists and begins AIzaSyCU


In [6]:
# Connect to OpenAI, Anthropic

openai = OpenAI()

claude = anthropic.Anthropic()

In [14]:
# connect to Gemini
import google.generativeai as genai
genai.configure(api_key=google_api_key)

# Generate Synthetic Dataset

In [9]:
def generate_dataset(description):
    """
    Generate a synthetic dataset based on the user's description.
    """
    # Create a prompt for the LLM to generate the dataset
    prompt = f"""
    Based on this description: "{description}"
    
    Generate a synthetic dataset in CSV format. The dataset should be realistic and diverse.
    Include appropriate column headers and at least 50 rows of data.
    
    Return ONLY the CSV content, nothing else. The first line should be the header row.
    """
    
    # Get response from OpenAI
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )
    
    # Get the CSV content
    csv_content = response.choices[0].message.content
    
    # Save to a temporary file
    filename = "generated_dataset.csv"
    with open(filename, "w") as f:
        f.write(csv_content)
    
    return filename


In [10]:
# Create the Gradio interface
demo = gr.Interface(
    fn=generate_dataset,
    inputs=gr.Textbox(
        label="Describe the dataset you want to generate",
        placeholder="e.g., I want to build a synthetic dataset of an Ecommerce store with customer purchases"
    ),
    outputs=gr.File(label="Download your dataset"),
    title="Synthetic Dataset Generator",
    description="Describe the type of dataset you want to generate, and we'll create it for you!"
)

# Launch the interface
demo.launch()

* Running on local URL:  http://127.0.0.1:7861

To create a public link, set `share=True` in `launch()`.




# Updated Gradle UI with models selections

In [15]:
from ollama import Client  # For local model

In [16]:
# Initialize Ollama client for local model
ollama_client = Client(host='http://localhost:11434')

In [18]:
#!ollama pull mistral

In [23]:
def estimate_cost(model_choice, input_tokens, output_tokens):
    """
    Estimate the cost based on the model and token counts.
    Prices are per 1K tokens (as of April 2024).
    """
    # Updated prices
    prices = {
        "OpenAI (GPT-3.5-turbo)": {
            "input": 0.0010,
            "output": 0.0020
        },
        "Anthropic (Claude 3 Haiku)": {
            "input": 0.00025,
            "output": 0.00125
        },
        "Google (Gemini 2.0 Flash Lite)": {
            "input": 0.0001,  # Updated for Gemini 2.0 Flash Lite
            "output": 0.0002
        },
        "Local (Mistral 7B)": {
            "input": 0.0,
            "output": 0.0
        }
    }
    
    model_prices = prices[model_choice]
    input_cost = (input_tokens / 1000) * model_prices["input"]
    output_cost = (output_tokens / 1000) * model_prices["output"]
    total_cost = input_cost + output_cost
    
    return f"Estimated cost: ${total_cost:.4f} (Input: ${input_cost:.4f}, Output: ${output_cost:.4f})"

def generate_dataset(description, model_choice, num_rows):
    """
    Generate a synthetic dataset based on the user's description and selected model.
    """
    prompt = f"""
    Based on this description: "{description}"
    
    Generate a synthetic dataset in CSV format. The dataset should be realistic and diverse.
    Include appropriate column headers and exactly {num_rows} rows of data.
    
    Return ONLY the CSV content, nothing else. The first line should be the header row.
    """
    
    try:
        # Estimate input tokens (rough approximation)
        input_tokens = len(prompt.split()) * 1.3  # Approximate tokens per word
        
        if model_choice == "OpenAI (GPT-3.5-turbo)":
            try:
                response = openai.chat.completions.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "user", "content": prompt}
                    ],
                    temperature=0.7
                )
                csv_content = response.choices[0].message.content
                output_tokens = response.usage.completion_tokens
            except Exception as e:
                return [f"OpenAI Error: {str(e)}", "Cost estimation unavailable", "<p>Preview unavailable</p>"]
            
        elif model_choice == "Anthropic (Claude 3 Haiku)":
            try:
                response = claude.messages.create(
                    model="claude-3-haiku-20240307",
                    max_tokens=1000,
                    messages=[
                        {"role": "user", "content": prompt}
                    ]
                )
                csv_content = response.content[0].text
                output_tokens = len(csv_content.split()) * 1.3  # Approximate for Claude
            except Exception as e:
                return [f"Anthropic Error: {str(e)}", "Cost estimation unavailable", "<p>Preview unavailable</p>"]
            
        elif model_choice == "Google (Gemini 2.0 Flash Lite)":
            try:
                model = genai.GenerativeModel('gemini-2.0-flash-lite')
                response = model.generate_content(prompt)
                csv_content = response.text
                output_tokens = len(csv_content.split()) * 1.3  # Approximate for Gemini
            except Exception as e:
                return [f"Google Gemini Error: {str(e)}", "Cost estimation unavailable", "<p>Preview unavailable</p>"]
            
        elif model_choice == "Local (Mistral 7B)":
            try:
                response = ollama_client.generate(
                    model='mistral',
                    prompt=prompt,
                    stream=False
                )
                csv_content = response['response']
                output_tokens = len(csv_content.split()) * 1.3  # Approximate for Mistral
            except Exception as e:
                return [f"Local Model Error: {str(e)}", "Cost estimation unavailable", "<p>Preview unavailable</p>"]
        
        # Save to a temporary file
        filename = "generated_dataset.csv"
        with open(filename, "w") as f:
            f.write(csv_content)
        
        # Calculate cost estimate
        cost_estimate = estimate_cost(model_choice, input_tokens, output_tokens)
        
        # Convert CSV to DataFrame for preview
        import pandas as pd
        from io import StringIO
        try:
            df = pd.read_csv(StringIO(csv_content))
            # Verify number of rows matches request
            if len(df) != num_rows:
                preview_html = f"<p>Warning: Generated {len(df)} rows instead of requested {num_rows} rows.</p>" + df.to_html(index=False, classes='table table-striped')
            else:
                preview_html = df.to_html(index=False, classes='table table-striped')
        except Exception as e:
            preview_html = f"<p>Error creating preview: {str(e)}</p>"
        
        return [filename, cost_estimate, preview_html]
        
    except Exception as e:
        return [f"General Error: {str(e)}", "Cost estimation unavailable", "<p>Preview unavailable</p>"]

# Create the Gradio interface with model selection, cost estimation, preview, and row control
with gr.Blocks(theme=gr.themes.Soft(
    primary_hue="blue",
    secondary_hue="gray",
    neutral_hue="slate",
    font=["Inter", "sans-serif"]
)) as demo:
    gr.Markdown("""
    <style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600&display=swap');
    
    body {
        font-family: 'Inter', sans-serif !important;
    }
    
    .table {
        width: 100%;
        border-collapse: collapse;
        margin: 1em 0;
        font-family: 'Inter', sans-serif !important;
    }
    .table th, .table td {
        padding: 8px;
        border: 1px solid #e2e8f0;
        text-align: left;
        font-family: 'Inter', sans-serif !important;
    }
    .table th {
        background-color: #f8fafc;
        font-weight: 600;
    }
    .table-striped tr:nth-child(even) {
        background-color: #f8fafc;
    }
    
    /* Custom styles for Gradio components */
    .gradio-container {
        font-family: 'Inter', sans-serif !important;
    }
    
    .gradio-button {
        font-family: 'Inter', sans-serif !important;
        font-weight: 500;
    }
    
    .gradio-input {
        font-family: 'Inter', sans-serif !important;
    }
    
    .gradio-output {
        font-family: 'Inter', sans-serif !important;
    }
    
    /* Custom scrollbar */
    ::-webkit-scrollbar {
        width: 8px;
        height: 8px;
    }
    
    ::-webkit-scrollbar-track {
        background: #f1f5f9;
    }
    
    ::-webkit-scrollbar-thumb {
        background: #cbd5e1;
        border-radius: 4px;
    }
    
    ::-webkit-scrollbar-thumb:hover {
        background: #94a3b8;
    }
    </style>
    """)
    
    gr.Markdown("""
    <div style='text-align: center; margin-bottom: 20px;'>
        <h1 style='font-family: "Inter", sans-serif; font-weight: 600; color: #1e293b;'>Synthetic Dataset Generator</h1>
        <p style='font-family: "Inter", sans-serif; color: #64748b;'>Generate synthetic datasets using different LLMs. Choose your preferred model and describe the dataset you want to create!</p>
    </div>
    """)
    
    with gr.Row():
        with gr.Column():
            description = gr.Textbox(
                label="Describe the dataset you want to generate",
                placeholder="e.g., I want to build a synthetic dataset of an Ecommerce store with customer purchases",
                lines=3
            )
            model_choice = gr.Dropdown(
                choices=[
                    "OpenAI (GPT-3.5-turbo)",
                    "Anthropic (Claude 3 Haiku)",
                    "Google (Gemini 2.0 Flash Lite)",
                    "Local (Mistral 7B)"
                ],
                value="OpenAI (GPT-3.5-turbo)",
                label="Select Model"
            )
            num_rows = gr.Slider(
                minimum=1,
                maximum=100,
                value=10,
                step=1,
                label="Number of Rows",
                info="Choose how many rows of data to generate"
            )
            generate_btn = gr.Button("Generate Dataset", variant="primary")
            
        with gr.Column():
            cost_estimate = gr.Textbox(
                label="Cost Estimation",
                interactive=False,
                elem_classes=["cost-estimate"]
            )
            preview = gr.HTML(label="Data Preview")
            output_file = gr.File(label="Download your dataset")
    
    generate_btn.click(
        fn=generate_dataset,
        inputs=[description, model_choice, num_rows],
        outputs=[output_file, cost_estimate, preview]
    )

# Launch the interface
demo.launch()

* Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.


