<a href="https://colab.research.google.com/github/triimamwicaksono/llm_engineering/blob/main/Week3_Datasets_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate openai httpx==0.27.2 gradio

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m79.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m70.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m40.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# imports

import os
import requests
from IPython.display import Markdown, display, update_display
from openai import OpenAI
from google.colab import drive
from huggingface_hub import login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

In [3]:
#Model
deepseek = "deepseek-ai/deepseek-llm-7b-chat"
qwen2 = "Qwen/Qwen2.5-7B-Instruct"
ministral = "mistralai/Mistral-7B-Instruct-v0.3"



In [4]:
# Sign in to HuggingFace Hub

hf_token = userdata.get('HF_TOKEN')
login(hf_token, add_to_git_credential=True)

In [5]:
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [6]:
def generate_message(model_id, messages):
  try:
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
    streamer = TextStreamer(tokenizer)
    model = AutoModelForCausalLM.from_pretrained(deepseek, device_map="auto", quantization_config=quant_config)
    outputs = model.generate(inputs, max_new_tokens=2000, streamer=streamer)
    response = tokenizer.decode(outputs[0])
    del tokenizer, streamer, model, inputs, outputs
    return response
  except Exception as e:
    print(e)


In [7]:
#prompt

system_prompt="You are a data scientist specialized in generating high-quality synthetic datasets for testing, modeling, and analysis. When given a user prompt, you will"\
"Ensure the synthetic data is realistic but does not replicate any real personal data."\
"Use appropriate domain-specific knowledge to guide value generation (e.g., plausible timestamps, names, locations, financial values)."\
"Output the dataset in a structured format (JSON, CSV, or tabular markdown)"

user_prompt = "Please provide me a dataset for the following business"\
"for example"\
"Please generate a synthetic dataset with the following characteristics:"\
"Purpose: Simulate e-commerce transactions"\
"Number of rows: 500"\
"Columns:"\
"transaction_id (string, unique)"\
"user_id (integer)"\
"product_name (string, from a pool of ~20 common items)"\
"category (string, e.g., Electronics, Clothing, Home, etc.)"\
"price (float, in USD, between 5 and 500)"\
"quantity (integer, 1 to 5)"\
"total (price × quantity)"\
"payment_method (e.g., Credit Card, PayPal, Bank Transfer)"\
"purchase_date (timestamp between Jan 1, 2024 and Apr 30, 2025)"

In [8]:
def choose_format(format,num_data):
  format_message = ""
  if format == "CSV":
    format_message = "Please generate a synthetic dataset using CSV Format"
  elif format == "JSON":
    format_message = "Please generate a synthetic dataset using JSON Format"
  elif format == "Tabular":
    format_message = "Please generate a synthetic dataset using Tabular Format"

  return format_message + f"with {num_data} rows"



In [9]:
def prompt_engineering(user_input, data_format, num_records):
  messages = [
      {"role": "system", "content": system_prompt},
      {"role": "user", "content": user_input + user_prompt + choose_format(data_format, num_records)}
  ]
  return messages


In [10]:
def create_data(user_input, data_format, num_records, model):
  if model == "Deepseek":
    model_id = deepseek
  elif model == "Qwen2":
    model_id = qwen2
  elif model == "Ministral":
    model_id = ministral

  messages = prompt_engineering(user_input, data_format, num_records)
  return generate_message(model_id, messages)

In [11]:
create_data("running in fashion","CSV",50,"Deepseek")

tokenizer_config.json:   0%|          | 0.00/1.28k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/4.61M [00:00<?, ?B/s]

Found no NVIDIA driver on your system. Please check that you have an NVIDIA GPU and installed a driver from http://www.nvidia.com/Download/index.aspx


In [12]:
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU-Device:", torch.cuda.get_device_name(torch.cuda.current_device()))
else:
    print("No GPU found.")

CUDA available: False
No GPU found.


In [13]:
import gradio as gr
with gr.Blocks() as demo:
    gr.Markdown("## Synthetic Data Generator")

    with gr.Row():
        model_dropdown = gr.Dropdown(
            choices=["Deepseek", "Qwen2", "Mistral"],
            label="Choose a model",
            value="Deepseek"
        )

        user_input = gr.Textbox(
            lines=4,
            placeholder="Enter your prompt here...",
            label="User Input"
        )

    with gr.Row():
        target_format = gr.Dropdown(
            ['CSV', 'JSON', 'Tabular'],
            label='Choose your Format',
            value='CSV'
        )
        num_records = gr.Dropdown(
            [50, 100, 150, 200],
            label='Number of Records',
            value=50
        )

    generate_button = gr.Button("Generate Synthetic Data")

    output_box = gr.Textbox(
        label="Output",
        lines=10
    )

    generate_button.click(
        fn=create_data,
        inputs=[user_input, model_dropdown, target_format, num_records],
        outputs=output_box
    )



In [14]:
demo.launch(inbrowser = True)

It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://94f13a0809a106f386.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [15]:
# test_generate.py
print(create_data("Test prompt", "deepseek", "CSV", 100))


UnboundLocalError: cannot access local variable 'model_id' where it is not associated with a value