In [3]:
import os
import random
from datasets import load_dataset

# Cache directories handed to `load_dataset` for each corpus.
math_data_dir = "data/math"
web_data_dir = "data/web"

# Sample budget: `n` records per class for training, plus `n_test` additional
# web records that a later cell slices off as the held-out prediction set.
n = 50000
n_test = 5000
num_math_samples = n
num_web_samples = n + n_test

for _cache_dir in (math_data_dir, web_data_dir):
    os.makedirs(_cache_dir, exist_ok=True)

# Both corpora are opened in streaming mode and capped with `.take()`, so only
# the first `num_*_samples` records of each stream are ever iterated.
math_data = load_dataset(
    "open-web-math/open-web-math",
    split="train",
    streaming=True,
    cache_dir=math_data_dir,
).take(num_math_samples)

web_data = load_dataset(
    "HuggingFaceFW/fineweb",
    "sample-10BT",
    split="train",
    streaming=True,
    cache_dir=web_data_dir,
).take(num_web_samples)

Resolving data files:   0%|          | 0/114 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/25868 [00:00<?, ?it/s]

In [4]:
# Materialise the capped streams into lists so they can be sliced and counted.
math_data_list = list(math_data)
web_data_list = list(web_data)

# The first `n` web records become negative training examples; whatever is
# left over (the extra `n_test` records) is kept aside for prediction only.
web_data_list_train = web_data_list[:n]
test_data_list = web_data_list[n:]
print(len(math_data_list), len(web_data_list_train), len(test_data_list))

# Labels use the `__label__` prefix that the later fastText cells rely on.
data = [
    {"text": item["text"], "label": "__label__positive"}
    for item in math_data_list
]
data.extend(
    {"text": item["text"], "label": "__label__negative"}
    for item in web_data_list_train
)

# Shuffle with a dedicated, seeded generator so the order of the training file
# (and therefore the trained model) is the same on every Restart & Run All,
# independent of whatever state the global `random` module is in.
shuffle_seed = 42
random.Random(shuffle_seed).shuffle(data)

# The held-out set only needs the raw text; it carries no label.
test_data = [item["text"] for item in test_data_list]
print(len(data), len(test_data))

50000 50000 5000
100000 5000


In [5]:
# Write the training set as one "<label> <text>" record per line.
# - The encoding is pinned to UTF-8 so the output does not depend on the
#   platform's locale encoding (web text is not guaranteed to be ASCII).
# - Newlines inside a document are flattened to spaces so each record stays on
#   a single line.
# - The replacement is done outside the f-string, and the inner key uses single
#   quotes, so the cell also parses on Python versions older than 3.12.
with open("data.txt", "w", encoding="utf-8") as f:
    for item in data:
        flat_text = item["text"].replace("\n", " ")
        f.write(f"{item['label']} {flat_text}\n")

In [6]:
import fasttext

# Fit a supervised fastText classifier on the labelled lines in data.txt.
# The hyperparameters below are passed through to fastText unchanged.
model = fasttext.train_supervised(
    input="data.txt",
    lr=0.5,
    epoch=10,
    wordNgrams=2,
)

In [7]:
# Score the model on data.txt. This is the same file the model was trained on,
# so the figures are training-set metrics, not a held-out estimate.
# fastText documents the returned tuple as (examples, precision@1, recall@1).
# NOTE(review): the recorded output reports 10996 examples although 100000
# records were written to data.txt -- confirm the file was not stale or
# partially read before trusting these numbers.
train_metrics = model.test("data.txt")
train_metrics

(10996, 0.9891778828664969, 0.9891778828664969)

In [8]:
# Label every held-out web document and store "<predicted label> <text>" per
# line. The encoding is pinned to UTF-8 so non-ASCII web text is written the
# same way on every platform instead of depending on the locale default.
with open("predict.txt", "w", encoding="utf-8") as f:
    for text in test_data:
        # Flatten newlines first: the text is written as a single output line,
        # and fastText's `predict` handles one line of text per call.
        text = text.replace("\n", " ")
        # `predict` returns (labels, probabilities); only the top label is kept.
        label, _ = model.predict(text)
        f.write(f"{label[0]} {text}\n")
