In [1]:
!pip install -q transformers accelerate bitsandbytes gradio

[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m59.4/59.4 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import json
from dotenv import load_dotenv
from openai import OpenAI
import gradio as gr
from concurrent.futures import ThreadPoolExecutor, as_completed
import re, pandas as pd
import torch

In [3]:
# Sign in to HuggingFace Hub
from google.colab import userdata
from huggingface_hub import login

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [4]:
os.environ["OPENAI_API_KEY"] = userdata.get('OPENAI_API_KEY')

# initialize OpenAI client
openai = OpenAI()

In [5]:
SYSTEM_MESSAGE = """
You are a Dataset generator assistant across various fields.
Maintain a 70‚Äì30 positive/negative ratio.
No null values.
Return ONLY a valid JSON array (no markdown, no explanations, no extra text) of the records
Do not include code fences or text outside JSON.

example record format:
[
  {{ "id": 1, "....": "....", "....": "...", "...": "...... }}
]

the above is just example for the json format, so by referencing this generate the synthetic data as per user domain specific request
and last important thing Use exactly the same JSON schema (column names and order) across all chunks.
"""


In [6]:
MODEL_INFO = {
    "meta-llama/Llama-3.1-8B-Instruct": {"context": 8192, "tokens_per_row": 40},
    "meta-llama/Llama-3.2-1B-Instruct": {"context": 4096, "tokens_per_row": 35},
    "microsoft/Phi-4-mini-instruct":    {"context": 8192, "tokens_per_row": 40},
    "google/gemma-3-270m-it":           {"context": 2048, "tokens_per_row": 30},
    "Qwen/Qwen3-4B-Instruct-2507":      {"context": 16384, "tokens_per_row": 40},
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B": {"context": 4096, "tokens_per_row": 35},
    "gpt-4o-mini":                       {"context": 16384, "tokens_per_row": 40},
}


In [7]:
def get_model_limits(model_id: str, total_rows: int):
    """Compute chunk size and safe token budget for the chosen model."""
    info = MODEL_INFO.get(model_id, {"context": 4096, "tokens_per_row": 40})
    max_context = info["context"]
    tpr = info["tokens_per_row"]
    safe_output_tokens = int(max_context * 0.7)          # keep 30% headroom for prompt
    chunk_size = max(20, min(total_rows, safe_output_tokens // tpr))
    return chunk_size, safe_output_tokens

In [8]:
def extract_json_array(text):
    match = re.search(r"\[.*\]", text, re.DOTALL)
    if not match:
        raise ValueError("No JSON array found in output.")
    return json.loads(match.group(0))

In [9]:
def generate_chunk_openai(chunk_index, num_rows, user_request, model, max_tokens, schema=None, schema_example=None):
    """Generate one JSON chunk safely with dynamic token limit."""
    if schema_example:
        schema_text = (
            f"Follow exactly this JSON structure and types:\n"
            f"{json.dumps(schema_example[:2], indent=2)}\n"
            f"Do not change key names or value types."
        )
    elif schema:
        schema_text = f"Use the exact same columns as these: {schema}."
    else:
        schema_text = "Generate your own schema suitable for the dataset and keep it consistent for future chunks."

    chunk_prompt = (
        f"{user_request}\n"
        f"Generate exactly {num_rows} unique JSON records.\n"
        f"IDs should start from {chunk_index + 1}.\n"
        f"{schema_text}\n"
        f"Return only valid JSON array."
    )

    response = openai.chat.completions.create(
        model=model,
        temperature=0.2,
        max_tokens=max_tokens,
        messages=[
            {"role": "system", "content": SYSTEM_MESSAGE},
            {"role": "user", "content": chunk_prompt},
        ],
    )

    return extract_json_array(response.choices[0].message.content)
    # except ValueError:
    #     # Retry with half rows if model truncated output
    #     if num_rows > 30:
    #         print(f"‚ö†Ô∏è  Chunk {chunk_index}: retrying with smaller size ({num_rows//2})...")
    #         return generate_chunk_openai(chunk_index, num_rows // 2, user_request, model, max_tokens, schema, schema_example)
    #     raise

In [10]:
# ===========================================================
# üîπ Global model cache (shared across threads)
# ===========================================================
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import torch

MODEL_CACHE = {}

def load_model_once(model_id):
    """
    Load and quantize a model once, then reuse it for all threads.
    Safe for inference because model weights are read-only.
    """
    print(f"üöÄ Loading model into cache: {model_id}")

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=quant_config
    )

    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        temperature=0.2,
        do_sample=False
    )

    # Store everything needed in cache
    MODEL_CACHE[model_id] = {
        "model": model,
        "tokenizer": tokenizer,
        "generator": generator
    }
    return MODEL_CACHE[model_id]


def get_model(model_id):
    """
    Retrieve the model/tokenizer/generator from cache,
    loading it if not already present.
    """
    if model_id not in MODEL_CACHE:
        MODEL_CACHE[model_id] = load_model_once(model_id)
    return MODEL_CACHE[model_id]


In [11]:
import json, re

def generate_chunk_hf(chunk_index, num_rows, user_request, model_id, max_tokens, schema=None, schema_example=None):
    print(f"üß© Using cached {model_id} for chunk {chunk_index}...")

    # ‚úÖ Retrieve model + tokenizer + generator from cache
    cache = get_model(model_id)
    generator = cache["generator"]

    schema_text = (
        f"Follow exactly this JSON structure and types:\n{json.dumps(schema_example[:2], indent=2)}"
        if schema_example else
        "Generate your own schema suitable for the dataset and keep it consistent for future chunks."
    )

    prompt = (
        f"{user_request}\n"
        f"Generate exactly {num_rows} unique JSON records.\n"
        f"IDs should start from {chunk_index + 1}.\n"
        f"{schema_text}\n"
        f"Return only valid JSON array, no explanations."
        f"Output ONLY the JSON array ‚Äî no text before or after it."
    )

    out = generator(prompt, max_new_tokens=max_tokens)[0]["generated_text"]

    match = re.search(r"\[.*\]", out, re.DOTALL)
    if not match:
        print("‚ö†Ô∏è Raw output (first 500 chars):", out[:500])
        raise ValueError("No JSON array found in model output.")

    json_text = match.group(0)

    # ‚úÖ Trim anything after the final closing bracket
    last_bracket = json_text.rfind("]")
    if last_bracket != -1:
        json_text = json_text[:last_bracket + 1]

    # ‚úÖ Parse JSON safely
    try:
        return json.loads(json_text)
    except json.JSONDecodeError as e:
        print(f"‚ö†Ô∏è JSON decode error: {e}")
        # try minimal repair (remove trailing commas, line breaks)
        repaired = re.sub(r",\s*]", "]", json_text)
        repaired = repaired.replace("\n", "").replace("\r", "")
        try:
            return json.loads(repaired)
        except Exception as e2:
            print("‚ö†Ô∏è Still invalid JSON. Returning empty list.")
            print("First 500 chars of broken output:\n", json_text[:500])
            return []



In [12]:
def get_thread_count(total_rows):
    if total_rows <= 500:
        return 5
    elif total_rows <= 2000:
        return 6
    elif total_rows <= 5000:
        return 8
    else:
        return 10  # cap for huge datasets


In [13]:
def generate_dataset_threaded(user_request,total_rows, model):

    """Full threaded dataset generator with model-adaptive chunk/token logic."""
    chunk_func=generate_chunk_openai if model.startswith('gpt') else generate_chunk_hf

    # Determine per-model limits
    chunk_size, safe_tokens = get_model_limits(model, total_rows)
    max_workers = get_thread_count(total_rows)

    print(f"üöÄ Model: {model}")
    print(f"üìä Total rows: {total_rows}")
    print(f"üßÆ Chunk size: {chunk_size}, Token limit per chunk: {safe_tokens}")
    print(f"üßµ Threads: {max_workers}\n")

    # Compute per-thread splits
    num_workers = max_workers
    base_rows = total_rows // num_workers
    remainder = total_rows % num_workers
    rows_per_chunk = [base_rows + (1 if i < remainder else 0) for i in range(num_workers)]

    all_records = []

    # --- First chunk defines schema ---
    first_chunk_rows = rows_per_chunk[0]
    first_chunk = chunk_func(0, first_chunk_rows, user_request, model, safe_tokens)

    # ‚úÖ Fallback if first chunk failed
    if not first_chunk or len(first_chunk) == 0:
        print("‚ö†Ô∏è First chunk failed ‚Äî retrying once with simplified prompt...")
        first_chunk = chunk_func(0, first_chunk_rows, f"{user_request} (output only valid JSON)", model, safe_tokens)

    # ‚úÖ If still empty, fail gracefully
    if not first_chunk or len(first_chunk) == 0:
        raise ValueError("‚ùå Unable to generate a valid schema from the model output. Try smaller rows or different model.")

    all_records.extend(first_chunk)
    schema = list(first_chunk[0].keys())
    schema_example = first_chunk[:2]
    print(f"‚úÖ Schema captured: {schema}\n")

    # --- Parallel generation for remaining chunks ---
    with ThreadPoolExecutor(max_workers=num_workers) as executor:
        futures = []
        start_id = first_chunk_rows
        for i in range(1, num_workers):
            futures.append(
                executor.submit(
                    chunk_func, start_id, rows_per_chunk[i],
                    user_request, model, safe_tokens, schema, schema_example
                )
            )
            start_id += rows_per_chunk[i]

        for future in as_completed(futures):
            try:
                result = future.result()
                all_records.extend(result)
                print(f"‚úÖ Completed chunk ({len(result)} records)")
            except Exception as e:
              import traceback
              print(f"‚ùå Error in chunk {i}: {e}\n{traceback.format_exc()}")


    # --- Postprocess ---
    df = pd.DataFrame(all_records)
    df = df.reindex(columns=schema).fillna("N/A").sort_values(by="id", ignore_index=True)

    # --- Top-up if under-produced ---
    missing = total_rows - len(df)
    while missing > 0:
        print(f"‚ö†Ô∏è Missing {missing} rows, generating top-up...")
        extra = chunk_func(total_rows, missing, user_request, model, safe_tokens, schema, schema_example)
        df = pd.concat([df, pd.DataFrame(extra)], ignore_index=True)
        df = df.reindex(columns=schema).fillna("N/A").sort_values(by="id", ignore_index=True)
        missing = total_rows - len(df)

    print(f"\n‚úÖ Final dataset: {len(df)} rows √ó {len(schema)} columns.")
    return df

In [14]:
import torch, bitsandbytes as bnb
print("CUDA available:", torch.cuda.is_available())
print("Device:", torch.device("cuda" if torch.cuda.is_available() else "cpu"))


CUDA available: True
Device: cuda


In [15]:
def get_total_rows(user_request):
    q = user_request.lower()
    if "basic" in q:
        return 500
    elif "medium" in q:
        return 2000
    elif "large" in q:
        return 5000
    else:
        return 5


In [16]:
user_request = "Generate a dataset of exoplanets discovered between 2000 and till now with full details."
model_id = "gpt-4o-mini"
total_rows = get_total_rows(user_request)


df = generate_dataset_threaded(
    user_request=user_request,
    total_rows=total_rows,
    model=model_id
)

print(df.head())
df.to_csv("synthetic_dataset.csv", index=False)

üöÄ Model: gpt-4o-mini
üìä Total rows: 5
üßÆ Chunk size: 20, Token limit per chunk: 11468
üßµ Threads: 5

‚úÖ Schema captured: ['id', 'name', 'discovery_year', 'mass', 'radius', 'orbital_period', 'distance_from_earth', 'host_star', 'star_type', 'habitable_zone', 'discovery_method']

‚úÖ Completed chunk (1 records)
‚úÖ Completed chunk (1 records)
‚úÖ Completed chunk (1 records)
‚úÖ Completed chunk (1 records)

‚úÖ Final dataset: 5 rows √ó 11 columns.
   id                name  discovery_year  mass  radius  orbital_period  \
0   1         Kepler-186f            2014  0.55    1.10         130.600   
1   2  Proxima Centauri b            2016  1.27    1.07          11.200   
2   3  Proxima Centauri b            2016  1.17    1.07          11.200   
3   4  Proxima Centauri b            2016  1.17    1.07          11.200   
4   5         HD 209458 b            1999  0.69    1.35           3.524   

   distance_from_earth         host_star star_type  habitable_zone  \
0               500.0

In [17]:
import gradio as gr
import pandas as pd
import time
import contextlib, io, traceback

def generate_dataset(user_request, model_name, dataset_size, progress=gr.Progress()):
    # ‚úÖ Step 1: validate user input immediately
    if not user_request or user_request.strip() == "":
        raise gr.Error("‚ö†Ô∏è Please enter a dataset request before generating.")

    try:
        progress(0, desc="Initializing model and configuration...")
        time.sleep(0.8)

        size_map = {"Small": 500, "Medium": 2000, "Large": 5000}
        total_rows = size_map.get(dataset_size, 500)

        progress(0.3, desc=f"Generating {total_rows} rows using {model_name}...")
        time.sleep(1)

        # Silence prints from threaded generator
        with contextlib.redirect_stdout(io.StringIO()):
            df = generate_dataset_threaded(
                user_request=user_request,
                total_rows=total_rows,
                model=model_name
            )

        if not isinstance(df, pd.DataFrame):
            raise TypeError(f"Expected a DataFrame but got {type(df)}")

        # Save CSV
        output_file = "synthetic_dataset.csv"
        df.to_csv(output_file, index=False)

        progress(0.9, desc="Finalizing dataset...")
        time.sleep(0.5)
        progress(1, desc="‚úÖ Done!")

        details = (
            f"**Model:** {model_name}\n"
            f"**Dataset Size:** {dataset_size} ({total_rows} rows)\n"
            f"**User Request:** {user_request}"
        )

        # ‚úÖ Return clean results
        return details, df, output_file

    except Exception as e:
        # Print traceback but DO NOT update output boxes with error
        print("‚ùå ERROR DURING GENERATION:\n", traceback.format_exc())
        # Just raise popup error (stops execution, no red boxes)
        raise gr.Error(f"‚ùå Something went wrong during generation:\n{e}")




# --- Example Prompts ---
examples = [
    ["Generate a dataset of exoplanets discovered between 2000 and now"],
    ["Create a dataset of global earthquakes with magnitude >6.0 since 2010"],
    ["Generate a dataset of top 100 AI companies with country, valuation, and sector"]
]

# --- Gradio App Layout ---
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("## üß¨ Synthetic Dataset Generator")
    gr.Markdown("Easily generate structured datasets using LLM models.")

    # 1Ô∏è‚É£ User Request
    user_input = gr.Textbox(
        label="Dataset Request",
        placeholder="e.g., Generate a dataset of exoplanets discovered between 2000 and now"
    )

    # 2Ô∏è‚É£ Model and Dataset Size
    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=[
                "gpt-4o-mini",
                "google/gemma-3-270m-it",
                "Qwen/Qwen3-4B-Instruct-2507",
                "meta-llama/Llama-3.2-1B-Instruct",
                "meta-llama/Llama-3.1-8B-Instruct",
                "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
                "Qwen/Qwen2-1.5B-Instruct",
                "microsoft/Phi-4-mini"
            ],
            label="Select Model",
            value="gpt-4o-mini"
        )

        size_dropdown = gr.Radio(
          choices=["Small", "Medium", "Large"],
          label="Select Dataset Size",
          value="Small",
          info="* Small = 500 rows * Medium = 2,000 rows * Large = 5,000 rows"
      )

    gr.Examples(
        examples=examples,
        inputs=[user_input],
        label="Try one of these prompts!"

    )


    # 3Ô∏è‚É£ Generate Button
    generate_btn = gr.Button("Generate Data")

    # 4Ô∏è‚É£ Progress + Details
    generation_details = gr.Markdown("*(Click Generate to start...)*")

    # 5Ô∏è‚É£ Dataset Preview
    dataset_preview = gr.Dataframe(
        label="Preview of Generated Dataset",
        interactive=False
    )

    # 6Ô∏è‚É£ Download CSV Button (initially hidden)
    download_button = gr.File(label="‚¨áÔ∏è Download as CSV", visible=False)


    # --- Connect the button to the backend ---
    generate_btn.click(
        fn=generate_dataset,
        inputs=[user_input, model_dropdown, size_dropdown],
        outputs=[generation_details, dataset_preview, download_button]
    )

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1714f89811564565af.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


