# Initial Setup

In [1]:
import os
import torch
import gradio as gr
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import BitsAndBytesConfig, AutoModelForCausalLM
from transformers import AutoTokenizer, TextStreamer

In [2]:
# Device: use the GPU when PyTorch reports CUDA as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [3]:
# Login
# The Hugging Face token is read from the environment only (a local .env file
# is loaded first and takes precedence over already-set variables). There is
# deliberately no in-code fallback value: a placeholder would either be sent to
# the Hub as an invalid token or tempt people to paste a real secret here.
load_dotenv(override=True)
HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
if not HF_TOKEN:
    raise ValueError(
        "HUGGINGFACE_TOKEN is not set. Add it to your .env file or environment "
        "before running this notebook."
    )
login(HF_TOKEN)

In [4]:
# Models
# Hugging Face Hub model id; the same id is used below to load both the
# quantized model and its tokenizer.
LLAMA = "meta-llama/Llama-3.2-3B-Instruct"

In [6]:
# Quantization settings: 4-bit NF4 weights with double quantization, and
# bfloat16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the quantized model; device placement is delegated to device_map="auto".
llama = AutoModelForCausalLM.from_pretrained(
    LLAMA,
    quantization_config=quantization_config,
    device_map="auto",
)

# Tokenizer for the same checkpoint; the EOS token is reused as the pad token.
tokenizer = AutoTokenizer.from_pretrained(LLAMA)
tokenizer.pad_token = tokenizer.eos_token

# Streams decoded text as it is generated (not referenced by the cells shown here).
streamer = TextStreamer(tokenizer)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

# Prompting

In [124]:
def generate_system_prompt_for_structured_data(data_structure: str) -> str:
    """Build the system prompt for structured (tabular) data generation.

    Args:
        data_structure: Requested output format. "CSV" and "JSON" are mapped
            to a descriptive phrase; any other value is inserted unchanged.

    Returns:
        The system prompt: a role description followed by an instruction to
        return only the data in the requested format.
    """
    format_phrases = {"CSV": "as raw CSV", "JSON": "in JSON"}
    format_phrase = format_phrases.get(data_structure, data_structure)

    role = "You are skilled at generating data for data science or machine learning engineering purposes based on user requests. "
    instruction = f"Returns the generated data only {format_phrase} format, with no explanations, code, and any other formatting."
    return role + instruction

def generate_user_prompt_for_structured_data(col_names: str, row_num: int, data_structure: str, additional_info: str="") -> str:
    """Build the user prompt describing the structured dataset to generate.

    Args:
        col_names: Column names, inserted verbatim on their own line.
        row_num: Number of rows to request. An integer-valued float (e.g.
            100.0) is rendered as an integer so the prompt does not ask for
            "100.0 rows".
        data_structure: Requested output format. "CSV" becomes "as raw CSV",
            "JSON" becomes "in JSON"; any other value becomes "in <value>".
        additional_info: Optional extra instructions; the line is omitted
            when this is empty.

    Returns:
        The user prompt text.
    """
    # Numeric UI inputs may arrive as floats; keep the row count readable.
    if isinstance(row_num, float) and row_num.is_integer():
        row_num = int(row_num)

    # The phrase carries its own preposition, so the template below must not
    # add another "in" (the original produced "only in as raw CSV format").
    if data_structure == "CSV":
        format_phrase = "as raw CSV"
    elif data_structure == "JSON":
        format_phrase = "in JSON"
    else:
        format_phrase = f"in {data_structure}"

    user_prompt = f"Generate data that has {row_num} rows.\n"
    user_prompt += "The data has column names as follows:\n"
    user_prompt += f"{col_names}\n"
    if additional_info:
        user_prompt += f"Additional information: {additional_info}\n"
    user_prompt += f"Returns the generated data only {format_phrase} format, with no explanations, code, and any other formatting."

    return user_prompt

def generate_messages_for_structured_data(col_names: str, row_num: int, data_structure: str, additional_info: str="") -> list:
    """Assemble the chat messages for a structured-data request.

    Args:
        col_names: Column names for the dataset.
        row_num: Number of rows to request.
        data_structure: Requested output format (e.g. "CSV" or "JSON").
        additional_info: Optional extra instructions for the model.

    Returns:
        A two-item list of role/content dicts: the system message first,
        then the user message.
    """
    system_message = {
        "role": "system",
        "content": generate_system_prompt_for_structured_data(data_structure),
    }
    user_message = {
        "role": "user",
        "content": generate_user_prompt_for_structured_data(
            col_names, row_num, data_structure, additional_info
        ),
    }
    return [system_message, user_message]

# Smoke test: build the messages for a CSV request and show the user prompt.
test = generate_messages_for_structured_data(
    "Name, sex, address",
    100,
    "CSV",
    additional_info="There should be no duplicate data.",
)
print(test[1]["content"])

Generate data that has 100 rows.
The data has column names as follows:
Name, sex, address
Additional information: There should be no duplicate data.
Returns the generated data only in as raw CSV format, with no explanations, code, and any other formatting.


In [125]:
def generate_system_prompt_for_unstructured_data() -> str:
    """Return the fixed system prompt used for free-text generation."""
    sentences = (
        "You are an expert in generating text data based on user requests.",
        "Returns the generated text only without any explanations.",
    )
    return " ".join(sentences)

def generate_user_prompt_for_unstructured_data(data_description: str) -> str:
    """Build the user prompt for free-text generation.

    Args:
        data_description: Description of the text to generate; placed on its
            own line between the request and the closing instruction.

    Returns:
        A three-line prompt: request header, the description, and the
        instruction to return only the generated text.
    """
    prompt_lines = (
        "Generate text based on the following description:",
        f"{data_description}",
        "Returns the generated text only without any explanations.",
    )
    return "\n".join(prompt_lines)

def generate_messages_for_unstructured_data(data_description: str) -> list:
    """Assemble the chat messages for a free-text generation request.

    Args:
        data_description: Description of the text to generate.

    Returns:
        A two-item list of role/content dicts: the system message first,
        then the user message.
    """
    contents_by_role = (
        ("system", generate_system_prompt_for_unstructured_data()),
        ("user", generate_user_prompt_for_unstructured_data(data_description)),
    )
    return [{"role": role, "content": content} for role, content in contents_by_role]

# Smoke test: build the messages for a free-text request and show the user prompt.
test = generate_messages_for_unstructured_data(data_description="The story of my life.")
print(test[1]["content"])

Generate text based on the following description:
The story of my life.
Returns the generated text only without any explanations.


# Inferencing

In [126]:
def extract_text(decoded: str) -> str:
    """Pull the assistant's reply out of chat-formatted LLaMA output.

    Takes the text after the last assistant header and cuts it at the first
    end-of-turn marker, then trims surrounding whitespace. If either marker is
    absent, the corresponding cut is simply not applied. If the extraction
    raises, the input is returned stripped instead.
    """
    assistant_header = "<|start_header_id|>assistant<|end_header_id|>"
    end_of_turn = "<|eot_id|>"
    try:
        # rpartition: keep what follows the LAST assistant header.
        _, _, after_header = decoded.rpartition(assistant_header)
        # partition: keep what precedes the FIRST end-of-turn marker.
        reply, _, _ = after_header.partition(end_of_turn)
        return reply.strip()
    except Exception:
        return decoded.strip()

def inferencing(messages: list, max_new_tokens: int = 5000) -> str:
    """Generate the model's reply for a list of chat messages.

    Args:
        messages: Chat messages as role/content dicts, in conversation order.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The assistant's reply as plain text, with chat markers removed.
    """
    # Tokenize. add_generation_prompt=True appends the assistant header so the
    # model starts answering instead of continuing the user's turn;
    # return_dict=True also yields the attention mask that generate() needs
    # because the pad token is the same as the EOS token.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(llama.device)  # follow the model's actual device rather than a global guess

    # Inference
    outputs = llama.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.pad_token_id,
    )

    # generate() returns prompt + continuation; decode only the continuation so
    # text inside the prompt can never be mistaken for the reply.
    prompt_length = inputs["input_ids"].shape[-1]
    decoded = tokenizer.decode(outputs[0][prompt_length:])

    # Drops the trailing end-of-turn marker and surrounding whitespace.
    return extract_text(decoded)

def generate_structured_data(col_names: str, row_num: int, data_structure: str, additional_info: str="") -> str:
    """Generate structured data by prompting the model.

    Args:
        col_names: Column names for the dataset.
        row_num: Number of rows to request.
        data_structure: Requested output format (e.g. "CSV" or "JSON").
        additional_info: Optional extra instructions for the model.

    Returns:
        The text returned by the model for this request.
    """
    return inferencing(
        generate_messages_for_structured_data(
            col_names, row_num, data_structure, additional_info
        )
    )

def generate_unstructured_data(data_description: str) -> str:
    """Generate free text by prompting the model with a description.

    Args:
        data_description: Description of the text to generate.

    Returns:
        The text returned by the model for this request.
    """
    return inferencing(generate_messages_for_unstructured_data(data_description))

# UI

In [129]:
# Gradio UI: a data-type selector toggles between the structured and the
# unstructured input panels; one button sends the request to the model and the
# result is shown in a copyable textbox.
with gr.Blocks() as demo:
    with gr.Row():
        gr.Markdown("# AI Data Generator")
    with gr.Row():
        data_type = gr.Dropdown(
            ["Structured Data", "Unstructured Data"], value="Structured Data", label="Data Type", interactive=True
        )
    with gr.Row():
        with gr.Column():
            # Structured Data
            with gr.Column(visible=True) as structured_inputs:
                col_names = gr.Textbox(label="Column Names", placeholder="E.g. Name, Sex, Address, Reviews")
                # precision=0 makes the component deliver whole numbers, and the
                # default value avoids submitting an empty (None) row count.
                row_num = gr.Number(label="Number of Rows", value=10, precision=0)
                data_structure = gr.Dropdown(
                    ["CSV", "JSON"], value="CSV", label="Data Format", interactive=True
                )
                additional_info = gr.Textbox(
                    label="Additional Information (optional)",
                    placeholder="E.g. Data about product reviews. The Reviews column contains positive and negative reviews. There is no duplicate data.",
                    lines=5
                )

            # Unstructured Data
            with gr.Column(visible=False) as unstructured_inputs:
                data_description = gr.Textbox(
                    label="Data Description",
                    placeholder="E.g News about AI developments in Indonesia covers how far AI has developed, comments from policy makers, public responses, and government efforts in implementing AI.",
                    lines=10
                )

            submit_btn = gr.Button("Generate Data")

        # Output
        output_textbox = gr.Textbox(label="Generated Data", lines=20, show_copy_button=True)

        # Update UI
        def update_ui(selected_type):
            """Show only the input panel that matches the selected data type."""
            return (
                gr.update(visible=selected_type == "Structured Data"),
                gr.update(visible=selected_type == "Unstructured Data"),
            )

        data_type.change(
            fn=update_ui,
            inputs=data_type,
            outputs=[structured_inputs, unstructured_inputs]
        )

        # Generate Data
        def generate_data(selected_type, columns, rows, output_format, extra_info, description):
            """Validate the form values and dispatch to the matching generator.

            Parameter names differ from the component variables on purpose so
            the callback does not shadow them; Gradio passes the values
            positionally in the order of the `inputs` list below.

            Raises:
                gr.Error: When a required field for the selected data type is
                    missing or invalid; Gradio shows the message to the user.
            """
            if selected_type == "Structured Data":
                if not columns or not columns.strip():
                    raise gr.Error("Please enter at least one column name.")
                if rows is None or rows < 1:
                    raise gr.Error("Number of rows must be a positive whole number.")
                return generate_structured_data(
                    columns,
                    int(rows),
                    output_format,
                    extra_info
                )
            elif selected_type == "Unstructured Data":
                if not description or not description.strip():
                    raise gr.Error("Please describe the text you want to generate.")
                return generate_unstructured_data(description)
            else:
                return "Structure of the data is unknown!"

        submit_btn.click(
            fn=generate_data,
            inputs=[data_type, col_names, row_num, data_structure, additional_info, data_description],
            outputs=output_textbox
        )

demo.launch()

* Running on local URL:  http://127.0.0.1:7900
* To create a public link, set `share=True` in `launch()`.


