# Dataset creation notebook

Purpose: create a dataset for a mountain-name NER model, using the Gemini API to generate samples in batches.

Structure:
1. Setup & Configuration
2. BIO tag assignment
3. Generating and saving the whole dataset

### Todo (future improvements)
1. Implement checkpoint saves for dataset creation (in case of error)
2. Tagging validation

### Setup & Configuration

In [8]:
import os
import json
import time
import random
import pandas as pd
from tqdm import tqdm
from getpass import getpass
from google import genai
from google.genai import types
from sklearn.model_selection import train_test_split

In [9]:
# Prompt templates passed as `contents` to the Gemini model by generate_batch_data.
# Every prompt asks for 20 items per call in JSON Lines form, where each item is a
# dictionary with a "text" sentence and an "entities" list.

# English positives: sentences that mention one or more specific mountain names;
# "entities" lists those names (alternative names included).
PROMPT_POSITIVE_EN = """
Generate 20 unique items in valid JSON Lines format. 
Output **only** JSON objects, one per line, with no additional commentary, explanation, or text. 
Each line must be a dictionary with keys:
"text" — a sentence containing one or more specific mountain names
"entities" — a list of all mountain names mentioned in the sentence, including alternative names
"""

# English negatives: geography/hiking/nature sentences with an empty "entities" list.
# Generic landform words and real non-mountain place names are requested on purpose,
# so the negatives contain text that resembles mountain mentions without being one.
PROMPT_NEGATIVE_EN = """
Generate 20 unique items in valid JSON Lines format. 
Output **only** JSON objects, one per line, with no commentary.
Each line must be a dictionary with:
"text" — a sentence about geography, hiking, or nature
"entities" — an empty list []
Use generic terms such as "hill", "ridge", "peak", "valley", "cliff" and include real geographic names that are NOT mountains, like rivers, lakes, deserts, islands, or regions.
"""

# Ukrainian positives: Ukrainian-language counterpart of PROMPT_POSITIVE_EN; it asks
# for the exact mountain names occurring in the sentence and for real mountains.
PROMPT_POSITIVE_UA = """
Згенеруй 20 унікальних елементів у форматі JSON Lines.
Виводь лише JSON об'єкти, 1 на рядок, без додаткових коментарів, пояснень або тексту. 
Кожен рядок має бути словником:
"text": речення українською мовою, що містить назви однієї або кількох конкретних гір.
"entities": список точних назв гір, що зустрічаються в реченні.
Використовуй реальні гори (Карпати, Говерла, Гімалаї, Альпи тощо). 
"""

# Ukrainian negatives: Ukrainian-language counterpart of PROMPT_NEGATIVE_EN
# (nature/geography sentences, empty "entities", generic landform terms plus real
# non-mountain place names).
PROMPT_NEGATIVE_UA = """
Згенеруй 20 унікальних елементів у форматі JSON Lines.
Виводь лише JSON об'єкти, 1 на рядок, без додаткових коментарів, пояснень або тексту. 
Кожен рядок має бути словником:
"text": речення українською про природу, географію.
"entities": пустий список [].
Використовуй загальні терміни, такі як «пагорб», «хребет», «вершина», «долина», «скеля», та вказуй справжні географічні назви, які НЕ є горами, наприклад, річки, озера, пустелі, острови чи регіони.
"""

In [10]:
def setup_api():
    """
    Build and return a Gemini API client.

    The key is read from the GEMINI_API_KEY environment variable; when that
    variable is unset or empty, the user is prompted for it (input hidden).
    """
    key = os.getenv("GEMINI_API_KEY") or getpass("Enter Gemini API Key: ")
    return genai.Client(api_key=key)

# Maximum number of times to retry a failed API call before giving up on that specific request
MAX_RETRIES = 5


def _parse_json_records(raw_text):
    """Parse model output (a JSON array, a single object, or JSON Lines) into a flat list of dicts."""
    stripped = raw_text.strip()
    if not stripped:
        return []

    # A JSON array (single-line or pretty-printed) or a lone object parses as one document.
    try:
        parsed = json.loads(stripped)
    except json.JSONDecodeError:
        parsed = None
    else:
        candidates = parsed if isinstance(parsed, list) else [parsed]
        return [item for item in candidates if isinstance(item, dict)]

    # Otherwise treat the text as JSON Lines; salvage the valid lines instead of
    # discarding the whole batch because of one malformed line.
    records = []
    skipped = 0
    for line in stripped.splitlines():
        line = line.strip()
        if not line:
            continue
        try:
            item = json.loads(line)
        except json.JSONDecodeError:
            skipped += 1
            continue
        if isinstance(item, dict):
            records.append(item)
        else:
            skipped += 1
    if skipped:
        print(f"Skipped {skipped} malformed line(s) in model output")
    return records


def generate_batch_data(client, prompt, batch_count=1, model="gemini-2.5-flash",
                        temperature=0.8, max_retries=None):
    """
    Request `batch_count` batches from the model and return all parsed samples.

    Args:
        client: Gemini client exposing `models.generate_content`.
        prompt: Prompt text sent unchanged for every batch.
        batch_count: Number of API requests (batches) to make.
        model: Model name passed to the API.
        temperature: Sampling temperature passed to the API.
        max_retries: Attempts per batch; defaults to the module-level MAX_RETRIES.

    Returns:
        A flat list of dictionaries parsed from the model output. A batch whose
        request fails on every attempt, or whose response cannot be read, is
        skipped (best effort), so the list may hold fewer samples than requested.
    """
    if max_retries is None:
        max_retries = MAX_RETRIES

    dataset = []

    # Tqdm wrapper makes a progress bar
    for i in tqdm(range(batch_count), desc="Generating batches"):
        try:
            # Reset per batch so a failed batch can never reuse the previous response.
            response = None
            for attempt in range(max_retries):
                try:
                    response = client.models.generate_content(
                        model=model,
                        contents=prompt,
                        config=types.GenerateContentConfig(
                            temperature=temperature,
                            response_mime_type="application/json"
                        )
                    )
                    break  # success
                except Exception as e:
                    print(f"Attempt {attempt+1} failed: {e}")
                    # Linear backoff (10s, 20s, 30s, ...); no point waiting after the last attempt
                    if attempt + 1 < max_retries:
                        wait_time = 10 * (attempt + 1)
                        print(f"Retrying in {wait_time}s …")
                        time.sleep(wait_time)

            if response is None:
                raise RuntimeError(f"all {max_retries} attempts failed")

            # Extract the raw generated text
            raw_text = response.candidates[0].content.parts[0].text

            # Parse and add to overall dataset
            dataset.extend(_parse_json_records(raw_text))

            time.sleep(5)  # rate limit pause

        except Exception as e:
            # The batch is not retried here; the loop moves on to the next one.
            print(f"Error on batch {i}: {e}. Skipping this batch; continuing after 10s …")
            time.sleep(10)

    return dataset


### Verify setup

In [None]:
# Smoke test: one real API call (a single batch) to confirm the key, client and
# response parsing work before the long generation run further below.
try:
    client = setup_api()
    test_response = generate_batch_data(client, PROMPT_POSITIVE_EN, 1)
    print(f"Total samples returned: {len(test_response)}")
    # The label now matches what is printed: a single sample.
    print("Preview of first sample:")
    print(test_response[:1])
except Exception as e:
    print(f"Smoke test failed: {e}")


Generating batches: 100%|██████████| 1/1 [00:11<00:00, 11.93s/it]

Total samples returned: 20
Preview of first 2 samples sample:
[{'text': 'Mount Everest, also known as Chomolungma, is the highest peak in the world.', 'entities': ['Mount Everest', 'Chomolungma']}]





In [27]:
import re
import json

def assign_bio_tags(entry):
    """
    Convert an entry's entity list into token-level BIO tags.

    The text and every entity are tokenized with the same regex (runs of word
    characters, or single non-space punctuation marks). Each exact,
    case-sensitive occurrence of an entity's token sequence is tagged
    "B-MNT" on its first token and "I-MNT" on the rest; all other tokens
    are "O".

    Args:
        entry: dict with a "text" string, an optional "entities" list of
            strings, and an optional "language" value.

    Returns:
        dict with "tokens", "ner_tags" (same length as "tokens") and
        "language" (None when the entry has no such key).

    Raises:
        KeyError: if the entry has no "text" key.
    """
    token_pattern = r'\w+|[^\w\s]'

    text = entry['text']
    tokens = re.findall(token_pattern, text)
    tags = ["O"] * len(tokens)

    # Tokenize entities up front, dropping values that cannot match anything:
    # non-strings and strings with no tokens (empty or whitespace only). A
    # zero-token entity would otherwise "match" at every position.
    entity_token_lists = []
    for entity in entry.get('entities') or []:
        if not isinstance(entity, str):
            continue
        entity_tokens = re.findall(token_pattern, entity)
        if entity_tokens:
            entity_token_lists.append(entity_tokens)

    # Longest entities first, so "Mount Everest" claims its span before the
    # shorter alternative name "Everest" is considered.
    entity_token_lists.sort(key=len, reverse=True)

    for entity_tokens in entity_token_lists:
        entity_len = len(entity_tokens)
        i = 0
        while i <= len(tokens) - entity_len:
            window_free = all(tag == "O" for tag in tags[i:i + entity_len])
            if window_free and tokens[i:i + entity_len] == entity_tokens:
                tags[i] = "B-MNT"
                tags[i + 1:i + entity_len] = ["I-MNT"] * (entity_len - 1)
                i += entity_len  # never start a new match inside this span
            else:
                i += 1

    language = entry.get("language", None)

    return {
        "tokens": tokens,
        "ner_tags": tags,
        "language": language
    }

### BIO tag assignment test

In [15]:
# Test Case 1: Positive Sample (a three-token entity and a one-token entity)
test_data_1 = {
    "text": "The climb to Mount Fitz Roy was difficult near K2!",
    "entities": ["Mount Fitz Roy", "K2"]
}

# Test Case 2: Negative Sample (no entities, so every tag must be "O")
test_data_2 = {
    "text": "The river flows near the high ridge, far from the city center.",
    "entities": []
}

# --- Execute Tests ---
# The results are printed for inspection and asserted, so a regression in the
# tagger stops the notebook instead of passing unnoticed.
print("\nTest 1:")
result_1 = assign_bio_tags(test_data_1)
print(json.dumps(result_1, indent=4))
assert result_1["tokens"] == [
    "The", "climb", "to", "Mount", "Fitz", "Roy", "was", "difficult", "near", "K2", "!"
]
assert result_1["ner_tags"] == [
    "O", "O", "O", "B-MNT", "I-MNT", "I-MNT", "O", "O", "O", "B-MNT", "O"
]

print("\nTest 2:")
result_2 = assign_bio_tags(test_data_2)
print(json.dumps(result_2, indent=4))
assert len(result_2["tokens"]) == 14
assert result_2["ner_tags"] == ["O"] * len(result_2["tokens"])


Test 1:
{
    "tokens": [
        "The",
        "climb",
        "to",
        "Mount",
        "Fitz",
        "Roy",
        "was",
        "difficult",
        "near",
        "K2",
        "!"
    ],
    "ner_tags": [
        "O",
        "O",
        "O",
        "B-MNT",
        "I-MNT",
        "I-MNT",
        "O",
        "O",
        "O",
        "B-MNT",
        "O"
    ]
}

Test 2:
{
    "tokens": [
        "The",
        "river",
        "flows",
        "near",
        "the",
        "high",
        "ridge",
        ",",
        "far",
        "from",
        "the",
        "city",
        "center",
        "."
    ],
    "ner_tags": [
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O",
        "O"
    ]
}


# Generating a whole dataset

### Save & Load functions

In [18]:
def save_jsonl(data, filename):
    """
    Save an iterable of JSON-serializable entries as JSON Lines (UTF-8).

    One entry is written per line. Missing parent directories of `filename`
    are created first, so writing into e.g. "data/raw/" works on a fresh
    checkout. An existing file is overwritten.
    """
    parent_dir = os.path.dirname(filename)
    if parent_dir:  # empty when the file goes to the current directory
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data:
            json.dump(entry, f, ensure_ascii=False)  # keep Cyrillic readable
            f.write('\n')

def load_jsonl(filename):
    """
    Read a UTF-8 JSON Lines file and return its entries as a list.

    Lines that are empty or contain only whitespace are ignored.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        return [json.loads(row) for row in handle if row.strip()]

### Caution: This section generates data using an external API.

In [17]:
# Generate raw data

# One batch contains 20 samples
POSITIVE_EN_BATCH_COUNT = 12
NEGATIVE_EN_BATCH_COUNT = 60
POSITIVE_UA_BATCH_COUNT = 12
NEGATIVE_UA_BATCH_COUNT = 60


def _generate_and_save(client, banner, prompt, batch_count, out_path):
    """Print `banner`, generate `batch_count` batches for `prompt`, save them to `out_path`, return them."""
    print(banner)
    samples = generate_batch_data(client, prompt, batch_count=batch_count)
    save_jsonl(samples, out_path)
    return samples


# Each stage is written to disk as soon as it finishes; an exception raised in a
# later stage is reported below and does not remove files saved by earlier stages.
try:
    client = setup_api()

    pos_data_en = _generate_and_save(
        client,
        "\nGenerating Positive English Samples...",
        PROMPT_POSITIVE_EN,
        POSITIVE_EN_BATCH_COUNT,
        "data/raw/raw_positive_en.jsonl",
    )
    neg_data_en = _generate_and_save(
        client,
        "\nGenerating Negative English Samples...",
        PROMPT_NEGATIVE_EN,
        NEGATIVE_EN_BATCH_COUNT,
        "data/raw/raw_negative_en.jsonl",
    )
    pos_data_ua = _generate_and_save(
        client,
        "\nGenerating Positive Ukrainian Samples...",
        PROMPT_POSITIVE_UA,
        POSITIVE_UA_BATCH_COUNT,
        "data/raw/raw_positive_ua.jsonl",
    )
    neg_data_ua = _generate_and_save(
        client,
        "\nGenerating Negative Ukrainian Samples...",
        PROMPT_NEGATIVE_UA,
        NEGATIVE_UA_BATCH_COUNT,
        "data/raw/raw_negative_ua.jsonl",
    )

except Exception as e:
    print(f"\nAPI generation error: {e}")


Generating Positive English Samples...


Generating batches: 100%|██████████| 12/12 [02:35<00:00, 12.94s/it]



Generating Negative English Samples...


Generating batches: 100%|██████████| 60/60 [11:30<00:00, 11.51s/it]



Generating Positive Ukrainian Samples...


Generating batches: 100%|██████████| 12/12 [03:02<00:00, 15.20s/it]



Generating Negative Ukrainian Samples...


Generating batches: 100%|██████████| 60/60 [15:34<00:00, 15.57s/it]


### Processing and Validating

### Adding language labels

In [None]:
def add_language_label(data, language):
    """
    Set the 'language' key of every entry in `data` to `language`.

    The entries are modified in place; the same `data` object is returned.
    """
    for record in data:
        record.update(language=language)
    return data

# --- Adding labels ---
# Every raw file is loaded, stamped with its language code and written to data/labeled/.

# English: positive samples, then negative samples
en_data = add_language_label(load_jsonl("data/raw/raw_positive_en.jsonl"), "EN")
save_jsonl(en_data, "data/labeled/labeled_positive_en.jsonl")

en_data = add_language_label(load_jsonl("data/raw/raw_negative_en.jsonl"), "EN")
save_jsonl(en_data, "data/labeled/labeled_negative_en.jsonl")

# Ukrainian: positive samples, then negative samples
ua_data = add_language_label(load_jsonl("data/raw/raw_positive_ua.jsonl"), "UA")
save_jsonl(ua_data, "data/labeled/labeled_positive_ua.jsonl")

ua_data = add_language_label(load_jsonl("data/raw/raw_negative_ua.jsonl"), "UA")
save_jsonl(ua_data, "data/labeled/labeled_negative_ua.jsonl")

In [None]:
# List of labeled files
files = [
    "data/labeled/labeled_positive_en.jsonl",
    "data/labeled/labeled_negative_en.jsonl",
    "data/labeled/labeled_positive_ua.jsonl",
    "data/labeled/labeled_negative_ua.jsonl"
]

# Load all datasets
labeled_data = []
for f in files:
    labeled_data.extend(load_jsonl(f))

# Shuffle with a fixed seed. The deduplication below keeps the first occurrence
# of each text, and the later seeded train/validation/test split selects rows by
# position, so an unseeded shuffle would give different results on every run.
# A dedicated Random instance leaves the global random state untouched.
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(labeled_data)

# DEDUPLICATION
print(f"Original Raw Sample Count: {len(labeled_data)}")

unique_data = []
processed_texts = set()

for sample in labeled_data:
    text = sample.get('text')

    # Samples with a missing or empty text are dropped together with exact duplicates
    if text and text not in processed_texts:
        processed_texts.add(text)
        unique_data.append(sample)

raw_data = unique_data  # the deduplicated samples are the input to tagging

print(f"Unique Raw Sample Count: {len(raw_data)}")

# BIO TAGGING
processed_dataset = []
for entry in tqdm(raw_data, desc="Applying BIO Tags"):
    try:
        tagged_entry = assign_bio_tags(entry)
        processed_dataset.append(tagged_entry)

    except Exception as e:
        # Best effort: a sample that cannot be tagged is reported and left out
        print(f"Tagging failed for an entry: {e}")
        continue

print(f"Tagging complete. Total samples: {len(processed_dataset)}")

Original Raw Sample Count: 2880
Unique Raw Sample Count: 2844


Applying BIO Tags: 100%|██████████| 2844/2844 [00:00<00:00, 123670.11it/s]

Tagging complete. Total samples: 2844





### Split and Export

In [29]:
# 3. SPLIT AND FINAL EXPORT
# First cut: 60% train / 40% held out; the held-out part is then halved into
# validation and test. Both cuts use the same fixed random_state.
train_data, remain_data = train_test_split(processed_dataset, random_state=42, test_size=0.4)
val_data, test_data = train_test_split(remain_data, random_state=42, test_size=0.5)

for split_rows, split_path in (
    (train_data, "data/final/train.jsonl"),
    (val_data, "data/final/validation.jsonl"),
    (test_data, "data/final/test.jsonl"),
):
    save_jsonl(split_rows, split_path)