## This is for creating and splitting the Poetry dataset.

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd

In [None]:
# Fetch the public-domain poetry corpus and keep only the three columns
# used below (author, title and the poem body).
dataset = load_dataset("DanFosing/public-domain-poetry")
dataset_df = pd.DataFrame(dataset["train"])[["Author", "Title", "text"]]

In [None]:
def split_poem_lines(poem, ratio=1/3):
    """Split a poem into an opening part and the remainder, by line count.

    The opening part receives the first ``int(number_of_lines * ratio)``
    lines, but never fewer than one; every following line goes to the
    second part. Both parts are re-joined with newline characters.

    Args:
        poem (str): full text of the poem
        ratio (float): share of the lines that goes to the opening part

    Returns:
        pd.Series: two strings, the opening part followed by the remainder
            (the remainder is empty when the poem has at most one line)
    """
    poem_lines = poem.splitlines()

    # Clamp the cut position so the opening part is never empty-by-construction.
    cut = int(len(poem_lines) * ratio)
    if cut < 1:
        cut = 1

    head, tail = poem_lines[:cut], poem_lines[cut:]
    return pd.Series(["\n".join(part) for part in (head, tail)])

# split_poem_lines returns a two-element Series for each poem, so apply()
# produces a two-column frame: its first column is stored as "poem_start"
# and its second as "poem_end".
dataset_df[["poem_start", "poem_end"]] = dataset_df["text"].apply(split_poem_lines)

In [None]:
# The full poem text is no longer needed once it has been split; also
# normalise the remaining metadata column names to lower case.
dataset_df = (
    dataset_df
    .drop(columns=["text"])
    .rename(columns={"Author": "author", "Title": "title"})
)

# Convert back to a Hugging Face dataset and hold out 10% as the test split
# (fixed seed so the split is reproducible).
dataset = Dataset.from_pandas(dataset_df).train_test_split(test_size=0.1, seed=42)

In [None]:
import os 
# Upload the train/test splits to the Hub as a public (private=False) repo.
# The access token is read from the HF_TOKEN environment variable; os.getenv
# returns None when the variable is not set.
dataset.push_to_hub("Jeremmmyyyyy/Poetry", private=False, token=os.getenv("HF_TOKEN"))

## This is for creating the Math dataset.

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from fractions import Fraction

# Take a reproducible random sample of 11,000 countdown tasks from the
# train split, then hold out 1,000 of them as the test split.
dataset = (
    load_dataset("Jiayi-Pan/Countdown-Tasks-3to4")["train"]
    .shuffle(seed=42)
    .select(range(11000))
    .train_test_split(test_size=1000, seed=42)
)

In [None]:
def find_countdown_expression(nums, target):
    """
    Find an arithmetic expression using each number in nums exactly once that evaluates to target.
    Operations allowed: +, -, *, /, with parentheses.
    Uses Fraction for exact arithmetic to avoid floating-point issues.

    The search repeatedly picks two of the remaining values, combines them
    with one operation, and recurses on the shortened list until a single
    value is left. The first match in search order is returned.

    Args:
        nums (list of int or float): input numbers
        target (int or float): desired target value

    Returns:
        str: a string representation of the expression that evaluates to target
            (for a single input number, that number's string form)

    Raises:
        ValueError: if no expression can be found
        AssertionError: if the expression text that was found does not
            evaluate back to target (within 1e-5) as Python arithmetic
    """
    target_frac = Fraction(target)
    # Each entry pairs an exact value with the expression text that produced it.
    initial = [(Fraction(n), str(n)) for n in nums]

    def search(pairs):
        # Base case: a single value is left; it either hits the target or not.
        if len(pairs) == 1:
            val, expr = pairs[0]
            return expr if val == target_frac else None

        count = len(pairs)
        for i in range(count):
            a_val, a_expr = pairs[i]
            for j in range(i + 1, count):
                b_val, b_expr = pairs[j]

                # Everything except the two operands being combined.
                rest = [p for k, p in enumerate(pairs) if k != i and k != j]

                # + and * are commutative, so one ordering is enough;
                # - and / need both orderings.
                candidates = [
                    (a_val + b_val, f"({a_expr}+{b_expr})"),
                    (a_val - b_val, f"({a_expr}-{b_expr})"),
                    (b_val - a_val, f"({b_expr}-{a_expr})"),
                    (a_val * b_val, f"({a_expr}*{b_expr})"),
                ]

                # Division operations, avoid division by zero
                if b_val != 0:
                    candidates.append((a_val / b_val, f"({a_expr}/{b_expr})"))
                if a_val != 0:
                    candidates.append((b_val / a_val, f"({b_expr}/{a_expr})"))

                for candidate in candidates:
                    found = search(rest + [candidate])
                    if found is not None:
                        return found
        return None

    expression = search(initial)
    if expression is None:
        raise ValueError(f"No solution found for nums={nums} target={target}")

    # Sanity-check the finished text once, after the search, instead of on
    # every recursion frame: confirm that the string evaluates to the target
    # as ordinary Python arithmetic.
    # NOTE: eval only ever sees text assembled above from str(n) of the inputs
    # plus operator/parenthesis characters, and builtins are disabled.
    val = eval(expression, {"__builtins__": None}, {})
    # Explicit raise rather than `assert`, so the check survives `python -O`.
    if not abs(float(val) - float(target)) < 1e-5:
        raise AssertionError(f"Invalid expression: {expression}")
    return expression

def add_gold_answer_to_dataset(dataset):
    """
    Compute a countdown solution for every example of an already-loaded
    dataset and return the mapped dataset with a 'gold_answer' column.

    Each example must provide 'nums' and 'target'. Examples for which the
    solver raises ValueError (no expression found) get 'gold_answer' = None.

    Args:
        dataset : Hugging Face dataset

    Returns:
        datasets.Dataset: with additional 'gold_answer' column
    """
    def attach_solution(row):
        try:
            answer = find_countdown_expression(row['nums'], row['target'])
        except ValueError:
            # Unsolvable task: keep the row but mark it as having no answer.
            answer = None
        row['gold_answer'] = answer
        return row

    return dataset.map(attach_solution)

In [None]:
# Attach a solver-generated reference expression to every example of both splits.
dataset["train"] = add_gold_answer_to_dataset(dataset["train"])
dataset["test"] = add_gold_answer_to_dataset(dataset["test"])
# The test split already carries 'gold_answer' at this point, so the small
# preview split is simply its first 10 rows; mapping the solver over the whole
# test split a second time would only recompute the same values.
dataset["test_samples"] = dataset["test"].select(range(10))


In [None]:
import os 
# Upload the train, test and test_samples splits to the Hub as a public
# (private=False) repo. The access token is read from the HF_TOKEN environment
# variable; os.getenv returns None when the variable is not set.
dataset.push_to_hub("Jeremmmyyyyy/Math", private=False, token=os.getenv("HF_TOKEN"))