#### Load data

In [None]:
import os
import sys
sys.path.insert(0, os.path.abspath('../..'))
from component.io import load_nodes_jsonl

# Path to the JSONL file holding every node; fill in before running.
all_nodes_path = ''
nodes = load_nodes_jsonl(all_nodes_path)

# Walk the nodes once, collecting each node's text and its 'level' metadata
# into two parallel lists (texts[i] is labelled by labels[i]).
texts, labels = [], []
for node in nodes:
    node_text, node_level = node.text, node.metadata['level']
    texts += [node_text]
    labels += [node_level]

#### Generate dataset

In [None]:
from datasets import Dataset, DatasetDict

# Toy placeholder samples: two examples per split, each with an integer label.
train_texts = ["Text of sample 1", "Text of sample 2"]
train_labels = [0, 1]

test_texts = ["Test sample 1", "Test sample 2"]
test_labels = [0, 1]

# Each split is built from two parallel columns: "text" and "label".
train_dataset = Dataset.from_dict(dict(text=train_texts, label=train_labels))
test_dataset = Dataset.from_dict(dict(text=test_texts, label=test_labels))

# Register both splits under their conventional names.
dataset_dict = DatasetDict()
dataset_dict["train"] = train_dataset
dataset_dict["test"] = test_dataset

#### Tokenize datasets

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def _tokenize_batch(batch):
    """Tokenize the 'text' column of a batch, truncating over-long inputs."""
    return tokenizer(batch['text'], truncation=True)


# batched=True hands the function a batch of rows per call instead of one row.
tokenized_dataset_dict = dataset_dict.map(_tokenize_batch, batched=True)

#### Load model

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Sequence-classification model built from the same checkpoint as the
# tokenizer, configured for 4 output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=4,
)

#### Train

In [None]:
from transformers import DataCollatorWithPadding

# Hyper-parameters for fine-tuning; checkpoints are written to ./results.
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
)

# The datasets were tokenized with truncation but without padding, so the
# sequences in a batch can differ in length. The collator must be a callable
# that pads each batch; passing the DatasetDict itself (as before) is not a
# valid collator.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_dict["train"],
    eval_dataset=tokenized_dataset_dict["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

In [None]:
# Publish the fine-tuned model and its tokenizer to the Hub.
# `Trainer.push_to_hub` has no `repo_id` parameter (its target repository is
# taken from `TrainingArguments.hub_model_id`), so upload through the
# model/tokenizer `push_to_hub` methods, which take the repo id directly.
hub_repo_id = "zz9tf/level-predictor"
trainer.model.push_to_hub(hub_repo_id)
tokenizer.push_to_hub(hub_repo_id)