<h1>Dataset Generator</h1>
<p>A powerful tool that uses a Hugging Face LLM and Gradio to generate high-quality datasets for AI applications.</p>

<p>Features:</p>
<ul>
<li>Uses state-of-the-art LLM models from Hugging Face.</li>
<li>Interactive UI powered by Gradio.</li>
<li>Customizable dataset generation for various testing tasks.</li>
<li>Supports text completion, classification, and summarization.</li>
</ul>

In [None]:
# Install the third-party packages this notebook relies on (-q keeps pip's output quiet).
!pip install -q transformers gradio torch accelerate bitsandbytes

In [None]:
# Import libraries
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, BitsAndBytesConfig, TextStreamer
import gradio as gr
from huggingface_hub import login
from google.colab import userdata
import json
import csv
from datetime import datetime
import torch

In [None]:
# Authenticate with the Hugging Face Hub.
# The access token is read from the Colab secret named 'HF_TOKEN2'.

hf_token = userdata.get('HF_TOKEN2')
login(token=hf_token, add_to_git_credential=True)

In [None]:
# Load the instruction-tuned Llama 3.1 8B checkpoint with 4-bit quantization.

model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# 4-bit NF4 weights with double quantization; computation runs in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" delegates device placement of the weights to the library.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)

In [None]:
# Tokenizer: loaded from the same checkpoint name as the model (model_name).

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Generation function

def generate_dataset(num_fields, *args):
    """Generate a dataset with the LLM and save it to a timestamped file.

    Args:
        num_fields: How many of the field slots are active. Coerced to int
            because a UI slider may deliver the value as a float.
        *args: Flat positional values laid out as 8 field names, then
            8 field types, then ``num_entries``, ``user_prompt`` and
            ``output_format`` ("json", "csv" or "jsonl").

    Returns:
        A ``(data, filename, error)`` tuple. On success ``data`` is the parsed
        list of entries, ``filename`` is the file that was written and
        ``error`` is None. On failure ``data`` and ``filename`` are None and
        ``error`` is a human-readable message.
    """
    max_fields = 8
    params = list(args)

    # Split the flat argument list back into its three groups.
    field_names = params[:max_fields]
    field_types = params[max_fields:2 * max_fields]
    other_params = params[2 * max_fields:]

    if len(other_params) < 3:
        return None, None, "Error: missing generation parameters"
    num_entries, user_prompt, output_format = other_params[:3]

    # Reject unknown formats up front; otherwise no file would be written
    # but a filename would still be returned.
    if output_format not in ("json", "csv", "jsonl"):
        return None, None, f"Error: unsupported output format '{output_format}'"

    # range() needs an int, and the count must stay within the available slots.
    try:
        field_count = int(num_fields)
    except (TypeError, ValueError):
        return None, None, "Error: number of fields must be a whole number"
    if field_count < 1:
        return None, None, "Error: at least one field is required"
    field_count = min(field_count, max_fields)

    # Validate fields
    fields = []
    seen_names = set()
    for i in range(field_count):
        name = (field_names[i] or "").strip()
        ftype = field_types[i]
        if not name:
            return None, None, f"Field {i+1} name cannot be empty"
        if name in seen_names:
            return None, None, f"Field {i+1} name '{name}' is duplicated"
        seen_names.add(name)
        fields.append((name, ftype))

    # Build system prompt
    system_prompt = f"""Generate {num_entries} entries as JSON array. Each object must have:"""
    for name, ftype in fields:
        system_prompt += f"\n- {name} ({ftype})"
    system_prompt += "\nOutput ONLY the JSON array with no additional text."

    try:
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        # Apply Llama 3 chat template
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        # NOTE: max_new_tokens caps the output length, so large entry counts
        # may be cut off mid-array and then fail JSON parsing.
        generation_params = {
            "input_ids": inputs,
            "max_new_tokens": 2000,
            "temperature": 0.7,
            "do_sample": True,
            "top_p": 0.9,
            "pad_token_id": tokenizer.eos_token_id
        }

        print("⚡ Starting generation...")
        outputs = model.generate(**generation_params)

        # Decode only the newly generated tokens (skip the prompt part).
        full_output = tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True)
        cleaned = full_output.strip()

        # Parse and validate JSON
        data = _parse_json_output(cleaned)
        if not isinstance(data, list):
            raise ValueError("Output must be a JSON array")

        # Every entry must be an object with exactly the requested keys.
        keys = [name for name, _ in fields]
        expected_fields = set(keys)
        for entry in data:
            if not isinstance(entry, dict):
                raise ValueError("Each entry must be a JSON object")
            if set(entry) != expected_fields:
                raise ValueError("Generated fields don't match specification")

        # Save file
        filename = f"dataset_{datetime.now().strftime('%Y%m%d%H%M%S')}.{output_format}"

        if output_format == "json":
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2)
        elif output_format == "csv":
            with open(filename, "w", newline="", encoding="utf-8") as f:
                writer = csv.DictWriter(f, fieldnames=keys)
                writer.writeheader()
                writer.writerows(data)
        elif output_format == "jsonl":
            with open(filename, "w", encoding="utf-8") as f:
                for item in data:
                    f.write(json.dumps(item) + "\n")

        return data, filename, None

    except json.decoder.JSONDecodeError as e:
        return None, None, f"JSON Parsing Error: {str(e)}\nRaw Output: {cleaned}"
    except Exception as e:
        # Boundary handler: report any other failure as a message, not a crash.
        return None, None, f"Error: {str(e)}"


def _parse_json_output(text):
    """Parse *text* as JSON, tolerating extra text around a JSON array.

    The text is parsed as-is first. If that fails, the span from the first
    ``[`` to the last ``]`` is tried, which covers output wrapped in Markdown
    code fences or surrounded by prose.

    Raises:
        json.JSONDecodeError: The error for the full text, if neither
            attempt parses.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError as whole_text_error:
        start, end = text.find("["), text.rfind("]")
        if start == -1 or end <= start:
            raise
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            # Report the failure for the full text, which the caller displays.
            raise whole_text_error from None

In [None]:
# Gradio Interface
max_fields = 8
default_num_fields = 4  # shared by the slider default and the initial row visibility
default_names = ['id', 'name', 'email', 'age', 'city', 'is_active', 'score', 'birthdate']
default_types = ['number', 'string', 'string', 'number', 'string', 'boolean', 'number', 'string']


def _set_row_visibility(n):
    """Return one visibility update per field row: the first *n* rows are shown."""
    # gr.update is used because the per-component Row.update classmethod
    # no longer exists in Gradio 4 and later.
    return [gr.update(visible=i < n) for i in range(max_fields)]


def _generate_and_report(*inputs):
    """Run generate_dataset and reveal the error box only when it reports an error."""
    data, filename, error = generate_dataset(*inputs)
    return data, filename, gr.update(value=error or "", visible=bool(error))


with gr.Blocks() as app:
    gr.Markdown("# 🧩 Custom Dataset Generator")

    with gr.Row():
        with gr.Column():
            gr.Markdown("## 🔧 Field Configuration")
            num_fields = gr.Slider(1, max_fields, value=default_num_fields, step=1, label="Number of Fields")

            # One row (name textbox + type dropdown) per possible field;
            # rows beyond the default count start hidden.
            field_rows = []
            field_names = []
            field_types = []
            for i in range(max_fields):
                with gr.Row(visible=(i < default_num_fields)) as row:
                    name = gr.Textbox(
                        label=f"Field {i+1} Name",
                        value=default_names[i] if i < len(default_names) else "",
                        interactive=True
                    )
                    ftype = gr.Dropdown(
                        ["string", "number", "boolean"],
                        label=f"Field {i+1} Type",
                        value=default_types[i] if i < len(default_types) else "string",
                        interactive=True
                    )
                    field_names.append(name)
                    field_types.append(ftype)
                    field_rows.append(row)

            # Show or hide field rows as the slider moves.
            num_fields.change(
                _set_row_visibility,
                inputs=num_fields,
                outputs=field_rows
            )

        with gr.Column():
            gr.Markdown("## ⚙️ Generation Parameters")
            num_entries = gr.Dropdown(
                choices=[15, 50, 100, 150, 200, 250],
                value=15,
                label="Number of Entries"
            )
            user_prompt = gr.Textbox(
                label="Content Instructions",
                lines=3,
                value="Generate realistic data entries with:"
            )
            output_format = gr.Radio(
                ["json", "csv", "jsonl"],
                label="Output Format",
                value="json"
            )
            generate_btn = gr.Button("🚀 Generate Dataset", variant="primary")

    with gr.Row():
        data_preview = gr.JSON(label="📊 Data Preview")
        file_output = gr.File(label="📥 Download File")
        # Starts hidden; _generate_and_report makes it visible when there is an error.
        error_output = gr.Textbox(label="❌ Error Messages", visible=False)

    # Input order must match what generate_dataset expects:
    # slider value, all field names, all field types, then the three parameters.
    generate_btn.click(
        _generate_and_report,
        inputs=[num_fields] + field_names + field_types + [num_entries, user_prompt, output_format],
        outputs=[data_preview, file_output, error_output]
    )

In [None]:
# Start the Gradio app. NOTE(review): per the Gradio docs, debug=True keeps the
# cell blocked so errors are printed in the notebook output — confirm for the installed version.
app.launch(debug=True)

<h2>Use Cases</h2>

<p>This dataset generator is ideal for:</p>

<ul>
<li>Training chatbots and virtual assistants.</li>
<li>Developing sentiment analysis models.</li>
<li>Creating summarization datasets for AI research.</li>
<li>Fine-tuning custom NLP applications.</li>
</ul>