## LLM generated data

In [50]:
import ast
import json
import os
import re
import sys


current_directory = os.getcwd()
parent_directory = os.path.abspath(os.path.join(current_directory, '..'))
# Put the parent directory on the import path so the `utils` import that follows
# can resolve. The membership check keeps the cell idempotent: re-running it no
# longer appends a duplicate entry each time.
if parent_directory not in sys.path:
    sys.path.append(parent_directory)

from utils.constants import init_datagen_config 

In [51]:
# Build the data-generation config from the YAML file through the project helper.
# The echoed dict lists the keys (dataset paths, generator model, sampling
# options) that the later cells of this notebook read.
DATAGEN_CONFIG = init_datagen_config('../configs/datagen.yaml')
DATAGEN_CONFIG

{'MOUNTAINS_NAMES_PATH': '../dataset/synthetic/Mountain.csv',
 'SAVE_DATASET_PATH': '../dataset/synthetic/mountains_only_synthetic.txt',
 'SAVE_PROCESSED_PATH': '../../dataset/synthetic/synthetic_processed.json',
 'GENERATOR_MODEL': 'meta/llama-3.1-405b-instruct',
 'MIN_SAMPLES': 0,
 'MAX_SAMPLES': 3,
 'TEMPERATURE': 0.9,
 'WNUT_DATA_PATH': '../../dataset/wnut16/wnut 16.txt.conll',
 'PROCESSED_WNUT_PATH': '../../dataset/wnut16/wnut_processed.csv',
 'FEW-NERD_BALANCED_TRAIN_PATH': '../../dataset/few-nerd/train',
 'FEW-NERD_BALANCED_VAL_PATH': '../../dataset/few-nerd/val',
 'FEW-NERD_BALANCED_TEST_PATH': '../../dataset/few-nerd/test',
 'FINAL_DATASET_PATH': '../../dataset/resulting_dataset'}

In [52]:
import json
import os
import re

# Load the LLM-generated dataset. The configured path ends in .txt, but the file
# is parsed as JSON; each entry is later read through its "mountain" and
# "sentences" keys.
with open(os.path.join("..", DATAGEN_CONFIG['SAVE_DATASET_PATH']), 'r') as f:
    dataset = json.load(f)

def preprocess_for_ner(dataset):
    """
    Convert generated sentences into token/label records for NER training.

    Every sentence is split into word tokens and single punctuation tokens.
    Each occurrence of the entry's mountain name is labelled 1 (B-Mountain) on
    its first token and 2 (I-Mountain) on its remaining tokens; every other
    token is labelled 0.

    A name only counts as an occurrence when it is not glued to neighbouring
    word characters, so "Alps" does not label the token "Alpsee". Matching is
    case-sensitive. Entries with an empty or whitespace-only name produce
    all-zero labels.

    :param dataset: iterable of dicts with a "mountain" string and a
                    "sentences" iterable of strings
    :return: list of dicts with the keys "sentence", "tokens" and "labels"
    """
    # Compiled once instead of on every findall call
    token_pattern = re.compile(r'\w+|[^\w\s]', re.UNICODE)
    ner_data = []

    for entry in dataset:
        mountain_name = entry["mountain"]
        sentences = entry["sentences"]

        if mountain_name.strip():
            # Only demand a word boundary on an edge that is itself a word
            # character; a punctuation edge is always its own token already.
            left_guard = r'(?<!\w)' if re.match(r'\w', mountain_name[0]) else ''
            right_guard = r'(?!\w)' if re.match(r'\w', mountain_name[-1]) else ''
            mountain_regex = re.compile(left_guard + re.escape(mountain_name) + right_guard)
        else:
            # An empty pattern would match at every position, including the end
            # of the sentence, and index past the last token.
            mountain_regex = None

        for sentence in sentences:
            tokens = token_pattern.findall(sentence)
            labels = [0] * len(tokens)  # Start with all '0' (Outside)

            if mountain_regex is not None:
                for match in mountain_regex.finditer(sentence):
                    start, end = match.span()

                    # Token indices are recovered by counting the tokens that
                    # precede the start and the end of the character span.
                    start_word = len(token_pattern.findall(sentence[:start]))
                    end_word = len(token_pattern.findall(sentence[:end]))
                    if start_word >= end_word:
                        continue  # the match covers no token

                    # Update labels using the few-nerd convention
                    labels[start_word] = 1  # B-Mountain
                    for i in range(start_word + 1, end_word):
                        labels[i] = 2  # I-Mountain

            ner_data.append({
                "sentence": sentence,
                "tokens": tokens,
                "labels": labels
            })

    return ner_data

In [53]:
# Tokenize and label every generated sentence
ner_formatted_data = preprocess_for_ner(dataset)

# Persist the labelled records as indented JSON at the configured location
processed_path = DATAGEN_CONFIG['SAVE_PROCESSED_PATH']
with open(processed_path, 'w') as f:
    json.dump(ner_formatted_data, f, indent=2)

# Echo each record for a visual check
for entry in ner_formatted_data:
    print(entry)

{'sentence': 'Mount Everest, the highest peak in the world, was formed approximately 60 million years ago when India collided with Eurasia.', 'tokens': ['Mount', 'Everest', ',', 'the', 'highest', 'peak', 'in', 'the', 'world', ',', 'was', 'formed', 'approximately', '60', 'million', 'years', 'ago', 'when', 'India', 'collided', 'with', 'Eurasia', '.'], 'labels': [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}
{'sentence': 'Climbing K2, the second-highest peak in the world, is a formidable challenge for even the most seasoned mountaineers, requiring meticulous planning and physical endurance.', 'tokens': ['Climbing', 'K2', ',', 'the', 'second', '-', 'highest', 'peak', 'in', 'the', 'world', ',', 'is', 'a', 'formidable', 'challenge', 'for', 'even', 'the', 'most', 'seasoned', 'mountaineers', ',', 'requiring', 'meticulous', 'planning', 'and', 'physical', 'endurance', '.'], 'labels': [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [54]:
print(f"{len(ner_formatted_data)} synthetic generated sentences")

196 synthetic generated sentences


In [55]:
ner_formatted_data[0]

{'sentence': 'Mount Everest, the highest peak in the world, was formed approximately 60 million years ago when India collided with Eurasia.',
 'tokens': ['Mount',
  'Everest',
  ',',
  'the',
  'highest',
  'peak',
  'in',
  'the',
  'world',
  ',',
  'was',
  'formed',
  'approximately',
  '60',
  'million',
  'years',
  'ago',
  'when',
  'India',
  'collided',
  'with',
  'Eurasia',
  '.'],
 'labels': [1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [56]:
print(len(ner_formatted_data[0]['labels'])==len(ner_formatted_data[0]['tokens']))

True


## Out-of domain datasets for NER

### WNUT 16

The WNUT 2016 dataset is annotated with 10 fine-grained NER categories: person, geo-location, company, facility, product, music artist, movie, sports team, tv show and other. The dataset was extracted from English-language tweets and is structured in CoNLL format. It is distributed as a plain-text file (the source page lists its size as "5,63"; the unit is not stated).
https://autonlp.ai/datasets/wnut-2016

#### We hope to extract some mountain-related examples from this dataset. Even if it contains few of them, it is still useful: its mountain-free sentences can be used to balance our synthetic dataset.

In [57]:
import pandas as pd
import numpy as np

In [58]:
# Read the raw WNUT file. Tweets may contain non-ASCII characters, so decode
# explicitly as UTF-8 instead of relying on the platform's default encoding.
with open(DATAGEN_CONFIG['WNUT_DATA_PATH'], 'r', encoding='utf-8') as file:
    data = file.read()

# Parse the dataset into tokens and labels.
# Every non-blank line is expected to be "<token>\t<label>"; a blank line ends
# the current sentence.
lines = data.split('\n')
tokens = []
labels = []
sentence_tokens = []
sentence_labels = []

for line in lines:
    if line.strip():
        # Raises ValueError if the line does not have exactly two tab-separated fields
        token, label = line.split('\t')
        sentence_tokens.append(token)
        sentence_labels.append(label)
    else:
        # Consecutive blank lines must not create empty sentences
        if sentence_tokens:
            tokens.append(sentence_tokens)
            labels.append(sentence_labels)
            sentence_tokens = []
            sentence_labels = []

# Ensure the last sentence is added when the file does not end with a blank line
if sentence_tokens:
    tokens.append(sentence_tokens)
    labels.append(sentence_labels)

# One row per sentence: a list of tokens and the parallel list of string tags
data_dict = {'tokens': tokens, 'labels': labels}
data_df = pd.DataFrame(data_dict)

In [59]:
data_df.head()

Unnamed: 0,tokens,labels
0,"[@SammieLynnsMom, @tg10781, they, will, be, al...","[O, O, O, O, O, O, O, O, O, O, O, O]"
1,"[Made, it, back, home, to, GA, ., It, sucks, n...","[O, O, O, O, O, B-geo-loc, O, O, O, O, O, O, O..."
2,"[', Breaking, Dawn, ', Returns, to, Vancouver,...","[O, B-movie, I-movie, O, O, O, B-geo-loc, O, O..."
3,"[@ls_n, perhaps, ,, but, folks, may, find, som...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[@Carr0t, aye, been, tonight, -, excellent]","[O, O, O, O, O, O]"


In [60]:
# Count the rows with a missing value *before* dropping them; counting after
# dropna() can only ever report 0.
any_null_count = data_df.isnull().any(axis=1).sum()
print('Count the row that included null column', any_null_count)

data_df = data_df.dropna()

def is_english(sentence):
    """
    Check that a tokenized sentence consists only of ASCII letters, digits,
    whitespace and the punctuation marks , . ' ! ? -

    :param sentence: iterable of string tokens
    :return: the regex match object when every character of the space-joined
             sentence is in the allowed set, otherwise None (an empty sentence
             also yields None)
    """
    joined_text = ' '.join(sentence)
    allowed_only = re.compile(r'^[a-zA-Z0-9\s,.\'!?-]+$')
    return allowed_only.match(joined_text)

# Keep only the rows whose tokens pass the is_english character check
english_mask = data_df['tokens'].apply(lambda toks: bool(is_english(toks)))
data_df = data_df[english_mask]

Count the row that included null column 0


In [61]:
# Collect every distinct tag that occurs in the parsed WNUT label lists
unique_tags = set()

for label_list in labels:
    unique_tags.update(label_list)

print("Unique tags:", unique_tags)

Unique tags: {'I-product', 'B-musicartist', 'B-tvshow', 'B-other', 'I-company', 'I-sportsteam', 'B-company', 'I-geo-loc', 'B-person', 'I-tvshow', 'B-geo-loc', 'I-facility', 'B-sportsteam', 'I-other', 'I-musicartist', 'B-facility', 'I-movie', 'B-product', 'I-person', 'B-movie', 'O'}


In [62]:
# Rows whose label sequence contains at least one 'B-geo-loc' tag
has_geo_loc = data_df['labels'].apply(lambda row_labels: 'B-geo-loc' in row_labels)
geo_df = data_df[has_geo_loc]
geo_df.shape

# Lower-cased mountain names from the reference CSV, for case-insensitive comparison
mountains_df = pd.read_csv(os.path.join('..', DATAGEN_CONFIG['MOUNTAINS_NAMES_PATH']))
mountain_names = [name.lower() for name in mountains_df['Mountain'].tolist()]
def detect_mountains(tokens, mountain_names):
    """
    Tell whether a tokenized sentence mentions any of the given mountain names.

    The comparison is case-insensitive. A multi-word name such as
    "mount everest" matches when its words appear as consecutive tokens;
    comparing the whole name against single tokens could never match it.

    :param tokens: list of string tokens of one sentence
    :param mountain_names: iterable of mountain names (non-string items are ignored)
    :return: True if at least one name occurs in the sentence, False otherwise
    """
    tokens_lower = [token.lower() for token in tokens]  # convert tokens to lowercase for comparison
    token_set = set(tokens_lower)
    n_tokens = len(tokens_lower)

    for mountain in mountain_names:
        if not isinstance(mountain, str):
            continue
        name_lower = mountain.lower()

        # A name that equals one whole token (the only case handled before)
        if name_lower in token_set:
            return True

        name_words = name_lower.split()
        width = len(name_words)
        if width < 2 or width > n_tokens or name_words[0] not in token_set:
            continue

        # Slide a window of the name's length over the sentence
        for start in range(n_tokens - width + 1):
            if tokens_lower[start:start + width] == name_words:
                return True

    return False

# Flag each geo-tagged row that mentions one of the known mountain names
nomountain = geo_df.assign(
    contains_mountain=geo_df['tokens'].apply(
        lambda row_tokens: detect_mountains(row_tokens, mountain_names)
    )
)

mountain_rows = nomountain.loc[nomountain['contains_mountain'] == True]
print(mountain_rows)

Empty DataFrame
Columns: [tokens, labels, contains_mountain]
Index: []


As the dataset has no entries related to mountains, let's just sample 196 sentences without mountains to balance the synthetic dataset.

In [63]:
# This data was parsed from Twitter, so remove special symbols that carry little information for language understanding.
# Commas are kept.

def preprocess_tokens(tokens):
    """
    Clean a tokenized tweet and re-tokenize it on whitespace.

    The tokens are joined with spaces; URLs, @mentions, #hashtags and every
    character other than ASCII letters, digits, commas and whitespace are
    removed; the remaining text is split on whitespace.

    :param tokens: iterable of string tokens
    :return: list of cleaned tokens (may be shorter than the input)
    """
    # (pattern, flags) pairs, applied in this order; each match is deleted
    removal_steps = (
        (r'http\S+|www\S+|https\S+', re.MULTILINE),  # URLs
        (r'@\w+|#\w+', 0),                           # mentions and hashtags
        (r'[^a-zA-Z0-9,\s]', 0),                     # special characters and punctuation except commas
    )

    cleaned = ' '.join(tokens)
    for pattern, flags in removal_steps:
        cleaned = re.sub(pattern, '', cleaned, flags=flags)

    # Collapse whitespace runs, trim the ends, and split back into tokens
    return re.sub(r'\s+', ' ', cleaned).strip().split()

In [64]:
# Draw 196 rows with a fixed seed, clean their tokens, and give every remaining
# token the label 0 (the original WNUT tags are discarded).
sampled_mountain_rows = (
    data_df
    .sample(n=196, random_state=42)
    .assign(tokens=lambda frame: frame['tokens'].apply(preprocess_tokens))
    .assign(labels=lambda frame: [[0] * len(row_tokens) for row_tokens in frame['tokens']])
)
sampled_mountain_rows

Unnamed: 0,tokens,labels
1093,"[before, the, season, even, starts, i, will, n...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2373,"[Have, a, feeling, my, phone, bill, will, be, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2004,"[looks, like, its, my, byzabedtime, hope, to, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
427,"[Spent, all, of, last, night, puking, ,, and, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2324,"[Im, trying, to, figure, out, if, I, wanna, do...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
614,"[Tonight, Homemade, ice, cream, flights, try, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2243,"[Love, me, when, I, least, deserve, it, ,, bec...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
31,"[he, likes, prince, ,, paul, simon, ,, and, od...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2308,"[my, past, was, the, reason, i, tried, to, kil...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [65]:
len(sampled_mountain_rows.iloc[0]['tokens']) == len(sampled_mountain_rows.iloc[0]['labels'])

True

In [66]:
sampled_mountain_rows.iloc[0]['tokens']

['before',
 'the',
 'season',
 'even',
 'starts',
 'i',
 'will',
 'not',
 'respond',
 'to',
 'any',
 'new',
 'found',
 'miami',
 'heat',
 'fans']

In [67]:
sampled_mountain_rows.iloc[0]['labels']

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [68]:
# Persist the sampled rows as CSV. The list-valued columns are written as text,
# so they have to be parsed back into lists when the file is reloaded.
sampled_mountain_rows.to_csv(DATAGEN_CONFIG['PROCESSED_WNUT_PATH'])

### Few-NERD
Few-NERD is a large-scale, fine-grained manually annotated named entity recognition dataset, which contains 8 coarse-grained types, 66 fine-grained types, 188,200 sentences, 491,711 entities, and 4,601,223 tokens.

https://huggingface.co/datasets/DFKI-SLT/few-nerd#dataset-summary

In [69]:
from datasets import load_dataset
from tqdm import tqdm  

In [70]:
# Load the "supervised" configuration of Few-NERD; the splits used below are
# "train", "validation" and "test".
ds = load_dataset("DFKI-SLT/few-nerd", "supervised")

In [71]:
# Short aliases for the three splits
ds_train, ds_val, ds_test = ds["train"], ds["validation"], ds["test"]

In [72]:
print(ds_train)
print(ds_val)
print(ds_test)

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
    num_rows: 131767
})
Dataset({
    features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
    num_rows: 18824
})
Dataset({
    features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
    num_rows: 37648
})


In [73]:
ds_train[1]

{'id': '1',
 'tokens': ['It',
  'starred',
  'Hicks',
  "'s",
  'wife',
  ',',
  'Ellaline',
  'Terriss',
  'and',
  'Edmund',
  'Payne',
  '.'],
 'ner_tags': [0, 0, 7, 0, 0, 0, 7, 7, 0, 7, 7, 0],
 'fine_ner_tags': [0, 0, 51, 0, 0, 0, 50, 50, 0, 50, 50, 0]}

In [74]:
ds_train.features

{'id': Value(dtype='string', id=None),
 'tokens': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'ner_tags': Sequence(feature=ClassLabel(names=['O', 'art', 'building', 'event', 'location', 'organization', 'other', 'person', 'product'], id=None), length=-1, id=None),
 'fine_ner_tags': Sequence(feature=ClassLabel(names=['O', 'art-broadcastprogram', 'art-film', 'art-music', 'art-other', 'art-painting', 'art-writtenart', 'building-airport', 'building-hospital', 'building-hotel', 'building-library', 'building-other', 'building-restaurant', 'building-sportsfacility', 'building-theater', 'event-attack/battle/war/militaryconflict', 'event-disaster', 'event-election', 'event-other', 'event-protest', 'event-sportsevent', 'location-GPE', 'location-bodiesofwater', 'location-island', 'location-mountain', 'location-other', 'location-park', 'location-road/railway/highway/transit', 'organization-company', 'organization-education', 'organization-government/governmentagency', 'or

We see the 'location-mountain' label, which might be useful to us.

In [75]:
# Position of 'location-mountain' in the fine_ner_tags ClassLabel names shown above
MOUNTAIN_INDEX = 24

In [76]:
def display_mountain_stats(dataset, mountain_tag=(MOUNTAIN_INDEX,)):
    """
    Print and return statistics about mountain tags in the dataset.

    The counts are token-level: "Number of mountains" is the number of tokens
    carrying one of the tags, and "Number of distinct mountains" is the number
    of distinct token strings among them, not the number of entity spans.

    :param dataset: iterable of samples with parallel 'tokens' and
                    'fine_ner_tags' sequences
    :param mountain_tag: iterable of tag ids that represent mountains
    :return: list [tagged token count, distinct tagged token count,
             number of samples containing at least one tagged token]
    """
    wanted_tags = set(mountain_tag)

    mountain_count = 0
    unique_mountains = set()
    samples_with_mountains = 0

    # One pass gathers all three statistics; iterating the dataset three
    # separate times tripled the decoding cost for the same numbers.
    for line in tqdm(dataset):
        tagged_tokens = [
            line['tokens'][i]
            for i, tag in enumerate(line['fine_ner_tags'])
            if tag in wanted_tags
        ]
        if tagged_tokens:
            mountain_count += len(tagged_tokens)
            unique_mountains.update(tagged_tokens)
            samples_with_mountains += 1

    print(f"Number of mountains = {mountain_count}")
    print(f"Number of distinct mountains = {len(unique_mountains)}")
    print(f"Number of samples with mountains = {samples_with_mountains}")

    return [mountain_count, len(unique_mountains), samples_with_mountains]

In [77]:
# Report the mountain statistics of each original split, in train/val/test order
split_stats = []
for heading, split in (
    ('Train set stats:', ds_train),
    ('Val set stats:', ds_val),
    ('Test set stats:', ds_test),
):
    print(heading)
    split_stats.append(display_mountain_stats(split))
    print('###' * 10)

train_stats, val_stats, test_stats = split_stats

Train set stats:


100%|██████████| 131767/131767 [00:04<00:00, 27248.01it/s]


Number of mountains = 4500
Number of distinct mountains = 1871
Number of samples with mountains = 1502
##############################
Val set stats:


100%|██████████| 18824/18824 [00:00<00:00, 28052.06it/s]


Number of mountains = 734
Number of distinct mountains = 474
Number of samples with mountains = 218
##############################
Test set stats:


100%|██████████| 37648/37648 [00:01<00:00, 27912.24it/s]


Number of mountains = 1366
Number of distinct mountains = 776
Number of samples with mountains = 448
##############################


In [78]:
# We’ll map B-Mountain to an integer (e.g., 1), I-Mountain to another integer (e.g., 2), and O to 0
#  in line with how ClassLabel expects integer labels.

def mapping_function(example, mountain_tag=None):
    """
    Rewrite a sample's fine-grained tags into mountain-only BIO integer labels.

    The first token of a run of mountain-tagged tokens becomes 1 (B-Mountain),
    the following tokens of the same run become 2 (I-Mountain), and every
    other token becomes 0 (O), whatever its original tag was. Two mountains
    that are directly adjacent, with no other token between them, cannot be
    told apart from the tags alone and are labelled as a single run.

    :param example: a dataset sample with a 'fine_ner_tags' sequence
    :param mountain_tag: tag id that marks mountain tokens; defaults to the
                         notebook-level MOUNTAIN_INDEX
    :return: the same example object with 'fine_ner_tags' replaced
    """
    if mountain_tag is None:
        # Use the shared constant rather than repeating the literal 24 here
        mountain_tag = MOUNTAIN_INDEX

    new_tags = []
    inside_mountain = False  # True while the previous token was a mountain token

    for tag in example["fine_ner_tags"]:
        if tag == mountain_tag:
            new_tags.append(2 if inside_mountain else 1)  # I-Mountain / B-Mountain
            inside_mountain = True
        else:
            new_tags.append(0)  # 'O' for every non-mountain token
            inside_mountain = False

    example['fine_ner_tags'] = new_tags
    return example

def modify_dataset(dataset):
    """
    Relabel every sample of the dataset with mountain BIO tags.
    :param dataset: the dataset; must provide a map method
    :return: the dataset returned by mapping mapping_function over each sample
    """
    # Samples are processed one at a time (not in batches)
    return dataset.map(mapping_function, batched=False)

In [79]:
# Relabel the three splits in train/val/test order
train_relabeled, val_relabeled, test_relabeled = (
    modify_dataset(split) for split in (ds_train, ds_val, ds_test)
)

In [80]:
def balance_dataset(dataset, p, seed=42):
    """
    Reduce the dataset by keeping only a fraction of the samples that do not contain mountains.
    If a sample contains 'B-Mountain' (1) or 'I-Mountain' (2), it will always be retained.

    The random down-sampling is seeded so that re-running the notebook selects
    the same samples.

    :param dataset: the dataset; must provide a filter method
    :param p: the fraction of mountain-free samples to keep, 0<=p<=1
    :param seed: seed for the private random generator used for the draw;
                 pass None to draw from the global numpy random state instead
    :return: the reduced dataset
    """
    mountain_tags = {1, 2}

    if seed is None:
        draw = np.random.rand  # legacy behaviour: global, unseeded state
    else:
        draw = np.random.default_rng(seed).random

    def keep_or_discard(example):
        """
        Helper function to decide whether to keep or discard a sample.
        :param example: the sample
        :return: True if keeping the sample, False otherwise
        """
        has_mountain = any(tag in mountain_tags for tag in example['fine_ner_tags'])
        # A random number is only drawn for samples without mountains
        return has_mountain or draw() < p

    return dataset.filter(keep_or_discard)

In [81]:
def _negative_keep_fraction(stats, original_split):
    """Ratio of samples with mountains to samples without mountains in a split."""
    with_mountains = stats[-1]
    return with_mountains / (original_split.shape[0] - with_mountains)


# Keep mountain-free samples at a rate that roughly matches the number of mountain samples
balanced_train = balance_dataset(train_relabeled, _negative_keep_fraction(train_stats, ds_train))
balanced_val = balance_dataset(val_relabeled, _negative_keep_fraction(val_stats, ds_val))
balanced_test = balance_dataset(test_relabeled, _negative_keep_fraction(test_stats, ds_test))

In [82]:
balanced_train

Dataset({
    features: ['id', 'tokens', 'ner_tags', 'fine_ner_tags'],
    num_rows: 3012
})

### Balanced dataset stats

In [83]:
# Same report for each balanced split; after relabelling, mountains carry tags 1 and 2
balanced_stats = []
for balanced_split in (balanced_train, balanced_val, balanced_test):
    current_stats = display_mountain_stats(balanced_split, mountain_tag=[1, 2])
    print("###" * 10)
    print(f"Number of samples without mountains = {balanced_split.shape[0] - current_stats[-1]}")
    print("***" * 10)
    print(" ")
    balanced_stats.append(current_stats)

balanced_train_stats, balanced_val_stats, balanced_test_stats = balanced_stats

100%|██████████| 3012/3012 [00:00<00:00, 17757.46it/s]


Number of mountains = 4500
Number of distinct mountains = 1871
Number of samples with mountains = 1502
##############################
Number of samples without mountains = 1510
******************************
 


100%|██████████| 446/446 [00:00<00:00, 16732.94it/s]


Number of mountains = 734
Number of distinct mountains = 474
Number of samples with mountains = 218
##############################
Number of samples without mountains = 228
******************************
 


100%|██████████| 934/934 [00:00<00:00, 17903.41it/s]


Number of mountains = 1366
Number of distinct mountains = 776
Number of samples with mountains = 448
##############################
Number of samples without mountains = 486
******************************
 


In [84]:
balanced_train[10]

{'id': '385',
 'tokens': ['The',
  'Innuitian',
  'Mountains',
  "'",
  'present',
  'form',
  'was',
  'shaped',
  'during',
  'the',
  'Innuitian',
  'orogeny',
  'in',
  'the',
  'middle',
  'of',
  'the',
  'Mesozoic',
  'Era',
  'when',
  'the',
  'North',
  'American',
  'Plate',
  'moved',
  'northward',
  '.'],
 'ner_tags': [0,
  4,
  4,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  4,
  4,
  0,
  0,
  0,
  0],
 'fine_ner_tags': [0,
  1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [85]:
# Write each balanced split to its configured directory
for balanced_split, path_key in (
    (balanced_train, 'FEW-NERD_BALANCED_TRAIN_PATH'),
    (balanced_val, 'FEW-NERD_BALANCED_VAL_PATH'),
    (balanced_test, 'FEW-NERD_BALANCED_TEST_PATH'),
):
    balanced_split.save_to_disk(DATAGEN_CONFIG[path_key])

Saving the dataset (1/1 shards): 100%|██████████| 3012/3012 [00:00<00:00, 159442.20 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 446/446 [00:00<00:00, 92859.75 examples/s] 
Saving the dataset (1/1 shards): 100%|██████████| 934/934 [00:00<00:00, 128492.52 examples/s]


# Resulting Dataset

In [86]:
import json
import pandas as pd
from datasets import load_from_disk, Dataset, concatenate_datasets

In [87]:
# Load the processed synthetic dataset (the JSON records with
# "sentence", "tokens" and "labels" written earlier in this notebook)
with open(DATAGEN_CONFIG['SAVE_PROCESSED_PATH'], 'r') as f:
    synthetic_data = json.load(f)

# Load the sampled WNUT rows; the list columns come back from CSV as strings
wnut_data = pd.read_csv(DATAGEN_CONFIG['PROCESSED_WNUT_PATH'])

# Load the balanced Few-NERD data. Only the train split is loaded here;
# the balanced val and test splits are not read in this cell.
few_nerd_data = load_from_disk(DATAGEN_CONFIG['FEW-NERD_BALANCED_TRAIN_PATH'])

In [88]:
def _parse_list_cell(cell):
    """Turn a stringified Python list read from CSV back into a list; leave real lists untouched."""
    return ast.literal_eval(cell) if isinstance(cell, str) else cell


# Process the synthetic dataset into a DataFrame
synthetic_df = pd.DataFrame(synthetic_data)

# Ensure all tokens are strings and labels are integers
synthetic_df['tokens'] = synthetic_df['tokens'].apply(lambda x: [str(token) for token in x])
synthetic_df['labels'] = synthetic_df['labels'].apply(lambda x: [int(label) for label in x])

# Process the WNUT dataset
# Convert the strings to actual lists. ast.literal_eval only accepts Python
# literals, so unlike eval() it cannot execute code embedded in the CSV.
# Cells that are already lists are passed through, which keeps this cell
# safe to re-run.
wnut_data['tokens'] = wnut_data['tokens'].apply(_parse_list_cell)
wnut_data['labels'] = wnut_data['labels'].apply(_parse_list_cell)

# Create a DataFrame for WNUT
wnut_df = pd.DataFrame({
    'sentence': wnut_data['tokens'].apply(lambda x: ' '.join(x)),  # Join tokens into a sentence
    'tokens': wnut_data['tokens'],
    'labels': wnut_data['labels']
})

# Ensure all tokens are strings and labels are integers
wnut_df['tokens'] = wnut_df['tokens'].apply(lambda x: [str(token) for token in x])
wnut_df['labels'] = wnut_df['labels'].apply(lambda x: [int(label) for label in x])

# Combine the datasets into a single DataFrame
combined_data = pd.concat([synthetic_df, wnut_df], ignore_index=True)

# Convert combined DataFrame to Dataset
combined_dataset = Dataset.from_pandas(combined_data)

In [89]:
# Bring the Few-NERD data into the same sentence/tokens/labels layout.
# Labels are taken from the fine_ner_tags column (not ner_tags).
few_nerd_tokens = [
    [str(token) for token in sample_tokens] for sample_tokens in few_nerd_data['tokens']
]
few_nerd_labels = [
    [int(tag) for tag in sample_tags] for sample_tags in few_nerd_data['fine_ner_tags']
]
few_nerd_df = pd.DataFrame({
    'sentence': [" ".join(sample_tokens) for sample_tokens in few_nerd_tokens],
    'tokens': few_nerd_tokens,
    'labels': few_nerd_labels,
})

# Wrap the frame as a Dataset and append it to the synthetic + WNUT data
few_nerd_dataset = Dataset.from_pandas(few_nerd_df)
final_dataset = concatenate_datasets([combined_dataset, few_nerd_dataset])

# Write the merged dataset to the configured output directory
final_dataset.save_to_disk(DATAGEN_CONFIG['FINAL_DATASET_PATH'])

Saving the dataset (1/1 shards): 100%|██████████| 3404/3404 [00:00<00:00, 894911.05 examples/s]


### Examples

In [90]:
final_dataset[0]

{'sentence': 'Mount Everest, the highest peak in the world, was formed approximately 60 million years ago when India collided with Eurasia.',
 'tokens': ['Mount',
  'Everest',
  ',',
  'the',
  'highest',
  'peak',
  'in',
  'the',
  'world',
  ',',
  'was',
  'formed',
  'approximately',
  '60',
  'million',
  'years',
  'ago',
  'when',
  'India',
  'collided',
  'with',
  'Eurasia',
  '.'],
 'labels': [1,
  2,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [91]:
final_dataset[321]

{'sentence': 'You all need to know justin isnt the innocent 16 year old boy you think he is he has to be like that and love his fans so',
 'tokens': ['You',
  'all',
  'need',
  'to',
  'know',
  'justin',
  'isnt',
  'the',
  'innocent',
  '16',
  'year',
  'old',
  'boy',
  'you',
  'think',
  'he',
  'is',
  'he',
  'has',
  'to',
  'be',
  'like',
  'that',
  'and',
  'love',
  'his',
  'fans',
  'so'],
 'labels': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}

In [92]:
final_dataset[2900]

{'sentence': 'Akhun Salak Baba is buried in Akhun Salak Baba Cemetery , about 0.5 miles south of lower Kabalgram village , on the west bank of Indus river and north bank of Itai Khwar ( River ) .',
 'tokens': ['Akhun',
  'Salak',
  'Baba',
  'is',
  'buried',
  'in',
  'Akhun',
  'Salak',
  'Baba',
  'Cemetery',
  ',',
  'about',
  '0.5',
  'miles',
  'south',
  'of',
  'lower',
  'Kabalgram',
  'village',
  ',',
  'on',
  'the',
  'west',
  'bank',
  'of',
  'Indus',
  'river',
  'and',
  'north',
  'bank',
  'of',
  'Itai',
  'Khwar',
  '(',
  'River',
  ')',
  '.'],
 'labels': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0]}