In [23]:
from datasets import load_dataset

# Read the combined training parquet file as a single "train" split.
train_dataset = load_dataset(
    "parquet",
    split="train",
    data_files="../../data/train_combined.parquet",
)

In [24]:
# Display the loaded dataset summary (feature names and row count).
train_dataset

Dataset({
    features: ['id', 'text', 'label', 'lang', 'length'],
    num_rows: 292231
})

In [25]:
from datasets import concatenate_datasets

# Drop every row tagged as German from the combined training set.
dataset_filtered = train_dataset.filter(lambda row: row["lang"] != "de")

# Rows in any language other than Finnish are kept in full.
non_finnish = dataset_filtered.filter(lambda row: row["lang"] != "fi")

# Finnish rows are down-sampled: seeded shuffle, then keep the first 49500.
finnish = (
    dataset_filtered
    .filter(lambda row: row["lang"] == "fi")
    .shuffle(seed=42)
    .select(range(49500))
)

# Non-Finnish rows first, followed by the Finnish sample.
enfi_dataset = concatenate_datasets([non_finnish, finnish])

In [26]:
# Display the summary of the dataset without German rows and with Finnish down-sampled.
enfi_dataset

Dataset({
    features: ['id', 'text', 'label', 'lang', 'length'],
    num_rows: 148480
})

In [27]:
# Read the external German tweet samples parquet file as a single "train" split.
german_twitter_dataset = load_dataset(
    "parquet",
    split="train",
    data_files="../../external_data/German/german_twitter_2019_2020/german_tweet_samples.parquet",
)

In [28]:
# Display the German tweet dataset summary (feature names and row count).
german_twitter_dataset

Dataset({
    features: ['type', 'text'],
    num_rows: 99000
})

In [29]:
def process_german(example, idx):
    """Build a training-schema row from one German tweet row.

    Args:
        example: Mapping for a single row; only its "text" entry is read.
        idx: Row index, used to form the row identifier.

    Returns:
        A new dict with the keys "id", "text", "label" and "lang". The input
        mapping is not modified.
    """
    return {
        "id": f"ger_train{idx}",
        "text": example["text"],
        # NOTE(review): -1 presumably marks rows without a label — confirm.
        "label": -1,
        "lang": "de",
    }

In [30]:
# Rebuild every row through process_german (one row at a time, with its index)
# and drop all of the columns the dataset had before the call.
german_twitter_dataset = german_twitter_dataset.map(
    process_german,
    remove_columns=german_twitter_dataset.column_names,
    with_indices=True,
    batched=False,
)

In [31]:
# Seeded shuffle, then keep the first 49500 rows as the German sample.
german = (
    german_twitter_dataset
    .shuffle(seed=42)
    .select(range(49500))
)

In [32]:
from transformers import AutoTokenizer

# Tokenizer used below to measure text length in tokens.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="google/gemma-3-270m",
)

In [33]:
def get_text_length(example):
    """Store the token count of ``example["text"]`` under ``example["length"]``.

    The text is tokenized with the notebook-level ``tokenizer`` with truncation
    disabled, and the number of resulting ``input_ids`` is recorded.

    Args:
        example: Mapping for a single row; must contain a "text" entry.

    Returns:
        The same ``example`` object, updated in place with a "length" entry.
    """
    # NOTE(review): the count includes whatever special tokens the tokenizer
    # adds by default — confirm this matches how the existing "length" column
    # of the training data was computed.
    token_ids = tokenizer(example["text"], truncation=False)["input_ids"]
    example["length"] = len(token_ids)
    return example

In [34]:
# Add the per-row token count column ("length") to the German sample.
german = german.map(function=get_text_length)

In [35]:
# Largest token count among the sampled German rows.
max(german['length'])

381

In [36]:
# Append the German sample after the rows of enfi_dataset.
final_dataset = concatenate_datasets(
    [
        enfi_dataset,
        german,
    ]
)

In [37]:
# Display the combined dataset summary (feature names and row count).
final_dataset

Dataset({
    features: ['id', 'text', 'label', 'lang', 'length'],
    num_rows: 197980
})

In [41]:
# Write the combined dataset as a tab-separated text file ...
final_dataset.to_csv(
    "../../data/train_combined_v2.tsv",
    sep="\t",
)
# ... and as parquet. This call stays last so that its return value remains
# the value displayed as the cell's output.
final_dataset.to_parquet(
    "../../data/train_combined_v2.parquet",
)

Creating CSV from Arrow format: 100%|██████████| 198/198 [00:02<00:00, 85.55ba/s]
Creating parquet from Arrow format: 100%|██████████| 198/198 [00:08<00:00, 23.74ba/s]


50033012

In [42]:
# Display the combined dataset summary again after the export cell.
final_dataset

Dataset({
    features: ['id', 'text', 'label', 'lang', 'length'],
    num_rows: 197980
})