In [2]:
from datasets import load_dataset


# Load the three raw corpora; each file is read as a single "train" split.
# English: tab-separated file read through the CSV builder.
english_dataset = load_dataset(
    "csv",
    data_files="../../data/train.tsv",
    split="train",
    delimiter="\t",
)
# German: Parquet file.
german_dataset = load_dataset(
    "parquet",
    data_files="../../external_data/German/NottDeuYTSch/NottDeuYTSch_Corpus.parquet",
    split="train",
)
# Finnish: CSV file.
finnish_dataset = load_dataset(
    "csv",
    data_files="../../external_data/Finnish/suomi24-2021-2023-vrt/data/s24_2021.csv",
    split="train",
)

Generating train split: 98637 examples [00:00, 301508.63 examples/s]
Generating train split: 3149457 examples [00:01, 1586041.56 examples/s]
Generating train split: 4762970 examples [00:27, 171390.00 examples/s]


In [3]:
# Display the dataset summary (feature names and row count).
english_dataset

Dataset({
    features: ['id', 'text', 'label'],
    num_rows: 98637
})

In [4]:
# Look at the first English record to see what a raw row contains.
english_dataset[0]

{'id': 'eng_train0',
 'text': "I supported Barack Obama. I thought it was absurdity and harassment that conservatives kept pestering Obama about his birth certificate, medical records, tax returns, school transcripts, family tree, DNA, dental records, fingerprints, etc. It was proof of their desire to strip Obama of any dignity. But! I didn't care about Obama's tax returns, stamp collections, third grade report card, etc, and I don't care about Trump's tax returns, third grade report card, or stamp collection, either. There was already enough public information based on their political experience (or lack thereof), public policy positions, campaign style, etc. for me to vote for Obama and against Trump without me needing to know any of that information.",
 'label': 0}

In [5]:
# Tag every English row with the language code "en", matching the "lang"
# field that the German and Finnish converters set on their rows.
# The guard keeps this cell idempotent: `add_column` rejects a column name
# that already exists, so re-running the cell without reloading the data
# would otherwise raise.
if "lang" not in english_dataset.column_names:
    english_dataset = english_dataset.add_column("lang", ["en"] * len(english_dataset))

# German

In [6]:
# Take a seeded random subset of each external corpus: shuffle with a fixed
# seed, then keep the first 99,000 rows of the shuffled order.
german_sampled = german_dataset.shuffle(seed=42)
german_sampled = german_sampled.select(range(99_000))

finnish_sampled = finnish_dataset.shuffle(seed=42)
finnish_sampled = finnish_sampled.select(range(99_000))

In [7]:
def convert_german_sample(example, idx):
    """Build a record in the shared id/text/label/lang layout from a German row.

    Args:
        example: Mapping for one source row; only its "textClean" entry is read.
        idx: Index of the row, used to form the "ger_train<idx>" identifier.

    Returns:
        A new dict with keys "id", "text", "label" (always -1) and
        "lang" (always "de"), in that order.
    """
    return {
        "id": f"ger_train{idx}",
        "text": example["textClean"],
        "label": -1,
        "lang": "de",
    }

In [8]:
# Convert each sampled German row (one at a time, with its index) using
# convert_german_sample, and drop all columns the sample had before the map.
german_sampled = german_sampled.map(
    convert_german_sample,
    with_indices=True,
    batched=False,
    remove_columns=german_sampled.column_names,
)

Map: 100%|██████████| 99000/99000 [00:12<00:00, 7619.16 examples/s]


In [9]:
# Spot-check one converted German record.
german_sampled[6]

{'id': 'ger_train6',
 'text': 'Mit Germanletsplay und Paluten würde ich mir es richtig wünschen',
 'label': -1,
 'lang': 'de'}

# Finnish

In [10]:
def convert_finnish_sample(example, idx):
    """Build a record in the shared id/text/label/lang layout from a Finnish row.

    Args:
        example: Mapping for one source row; only its "text" entry is read.
        idx: Index of the row, used to form the "fin_train<idx>" identifier.

    Returns:
        A new dict with keys "id", "text", "label" (always -1) and
        "lang" (always "fi"), in that order.
    """
    return {
        "id": f"fin_train{idx}",
        "text": example["text"],
        "label": -1,
        "lang": "fi",
    }

In [11]:
# Convert each sampled Finnish row (one at a time, with its index) using
# convert_finnish_sample, and drop all columns the sample had before the map.
finnish_sampled = finnish_sampled.map(
    convert_finnish_sample,
    with_indices=True,
    batched=False,
    remove_columns=finnish_sampled.column_names,
)

Map: 100%|██████████| 99000/99000 [00:09<00:00, 10492.09 examples/s]


In [12]:
# Spot-check one converted Finnish record.
finnish_sampled[19]

{'id': 'fin_train19',
 'text': 'Suomi on jakautunut kahtia , Kela-Suomi ja muu Suomi . Edellämainitulla menee vielä toistaiseksi hyvin .',
 'label': -1,
 'lang': 'fi'}

# Combine

In [13]:
from datasets import concatenate_datasets

# Stack the English data and the German and Finnish samples, in that order,
# into a single dataset.
final_dataset = concatenate_datasets(
    [
        english_dataset,
        german_sampled,
        finnish_sampled,
    ]
)

In [14]:
# Display the combined dataset summary (feature names and total row count).
final_dataset

Dataset({
    features: ['id', 'text', 'label', 'lang'],
    num_rows: 296637
})

In [15]:
# Write the combined dataset as CSV. The call is left as the cell's last
# expression so its integer return value is shown as the cell output.
final_dataset.to_csv(
    "../../data/train_combined.csv",
)

Creating CSV from Arrow format:   0%|          | 0/297 [00:00<?, ?ba/s]

Creating CSV from Arrow format: 100%|██████████| 297/297 [00:01<00:00, 164.89ba/s]


59231171

In [16]:
# Write the combined dataset as Parquet. The call is left as the cell's last
# expression so its integer return value is shown as the cell output.
final_dataset.to_parquet(
    "../../data/train_combined.parquet",
)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00,  2.65ba/s]


65540330