## This is for creating and splitting the Poetry dataset.

In [1]:
from datasets import load_dataset, Dataset
import pandas as pd

In [9]:
dataset = load_dataset("DanFosing/public-domain-poetry")
dataset_df = pd.DataFrame(dataset["train"])
dataset_df = dataset_df[["Author", "Title", "text"]]

In [10]:
dataset_df.rename(columns={"text":"Text"}, inplace=True)

In [None]:
def split_poem_lines(poem, ratio=1/3):
    lines = poem.splitlines()
    split_idx = max(1, int(len(lines) * ratio))
    start = "\n".join(lines[:split_idx])
    end = "\n".join(lines[split_idx:])
    return pd.Series([start, end])

dataset_df[["poem_start", "poem_end"]] = dataset_df["text"].apply(split_poem_lines)

In [None]:
dataset_df.drop(columns=["text"], inplace=True)
dataset_df.rename(columns={"Author": "author", "Title": "title"}, inplace=True)
dataset = Dataset.from_pandas(dataset_df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
import os 
dataset.push_to_hub("schifferlearning/Poetry-Categorized", private=False, token=os.getenv("HF_TOKEN"))

In [37]:
import re
import nltk
from nltk.corpus import cmudict

# Ensure cmudict is available
try:
    nltk.data.find("corpora/cmudict")
except LookupError:
    nltk.download("cmudict")

d = cmudict.dict()

def count_syllables(word):
    """Count syllables in a word using cmudict or fallback to simple rule."""
    word = word.lower()
    if word in d:
        return max(len([y for y in pron if y[-1].isdigit()]) for pron in d[word])
    return len(re.findall(r'[aeiouy]+', word.lower()))

def total_syllables(line):
    """Count total syllables in a line."""
    words = re.findall(r'\b\w+\b', line.lower())
    return sum(count_syllables(w) for w in words)

def classify_form(poem):
    """Classify a poem's form based on line count and syllable patterns."""
    lines = [line.strip() for line in poem.strip().splitlines() if line.strip()]
    line_count = len(lines)
    syllable_counts = [total_syllables(line) for line in lines]

    # Haiku: 3 lines with 5-7-5 syllable pattern
    if line_count == 3 and syllable_counts == [5, 7, 5]:
        return "haiku"

    # Tanka: 5 lines with 5-7-5-7-7 syllable pattern
    if line_count == 5 and syllable_counts == [5, 7, 5, 7, 7]:
        return "tanka"

    # Limerick: 5 lines with approximate syllable counts
    if line_count == 5 and all(8 <= s <= 9 for s in syllable_counts[:2]) and all(5 <= s <= 6 for s in syllable_counts[2:4]) and 8 <= syllable_counts[4] <= 9:
        return "limerick"

    # Sonnet: 14 lines
    if line_count == 14:
        return "sonnet"

    # Quatrain: 4 lines with similar syllable counts
    if line_count == 4 and max(syllable_counts) - min(syllable_counts) <= 2:
        return "quatrain"

    # Cinquain: 5 lines with specific syllable counts
    if line_count == 5 and syllable_counts == [2, 4, 6, 8, 2]:
        return "cinquain"

    # Octave: 8 lines with similar syllable counts
    if line_count == 8 and max(syllable_counts) - min(syllable_counts) <= 2:
        return "octave"

    # Decastich: 10 lines with similar syllable counts
    if line_count == 10 and max(syllable_counts) - min(syllable_counts) <= 2:
        return "decastich"

    # Sestet: 6 lines with similar syllable counts
    if line_count == 6 and max(syllable_counts) - min(syllable_counts) <= 2:
        return "sestet"

    # Couplet: 2 lines with similar syllable counts
    if line_count == 2 and abs(syllable_counts[0] - syllable_counts[1]) <= 2:
        return "couplet"

    return "free_verse"

In [38]:
dataset_df['Form'] = dataset_df['Text'].apply(classify_form)

In [39]:
dataset_df["Form"].value_counts()

free_verse    32022
sonnet         3484
quatrain        893
octave          813
couplet         604
sestet          352
decastich       209
limerick        122
Name: Form, dtype: int64

In [41]:
form_counts = dataset_df['Form'].value_counts()
eligible_forms = form_counts[form_counts >= 800].index

# filter the DataFrame to include only eligible forms
eligible_df = dataset_df[dataset_df['Form'].isin(eligible_forms)]

# sample 800 entries from each eligible form
sampled_df = eligible_df.groupby('Form', group_keys=False).apply(lambda x: x.sample(n=800, random_state=42))
sampled_df = sampled_df.reset_index(drop=True)

sampled_df['Form'].value_counts()

free_verse    800
octave        800
quatrain      800
sonnet        800
Name: Form, dtype: int64

In [43]:
import os 
dataset = Dataset.from_pandas(sampled_df)
dataset = dataset.train_test_split(test_size=0.1, seed=42)
dataset.push_to_hub("schifferlearning/Poetry-Categorized", private=False, token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/schifferlearning/Poetry-Categorized/commit/641f49bafd5602fe51d3cc0eab8638781b63d4c9', commit_message='Upload dataset', commit_description='', oid='641f49bafd5602fe51d3cc0eab8638781b63d4c9', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/schifferlearning/Poetry-Categorized', endpoint='https://huggingface.co', repo_type='dataset', repo_id='schifferlearning/Poetry-Categorized'), pr_revision=None, pr_num=None)

## This is for creating the math Dataset

In [None]:
from datasets import load_dataset, Dataset
import pandas as pd
from fractions import Fraction

dataset = load_dataset("Jiayi-Pan/Countdown-Tasks-3to4")
dataset = dataset["train"].shuffle(seed=42).select(range(11000))
dataset = dataset.train_test_split(test_size=1000, seed=42)

In [None]:
def find_countdown_expression(nums, target):
    """
    Find an arithmetic expression using each number in nums exactly once that evaluates to target.
    Operations allowed: +, -, *, /, with parentheses.
    Uses Fraction for exact arithmetic to avoid floating-point issues.

    Args:
        nums (list of int or float): input numbers
        target (int or float): desired target value

    Returns:
        str: a string representation of the expression that evaluates to target

    Raises:
        ValueError: if no expression can be found
    """
    # Prepare initial list of (value, expression) pairs
    initial = [(Fraction(n), str(n)) for n in nums]
    target_frac = Fraction(target)

    def helper(pairs):
        # If only one value remains, check if it equals the target
        if len(pairs) == 1:
            val, expr = pairs[0]
            if val == target_frac:
                return expr
            return None

        # Try all pairs of numbers
        for i in range(len(pairs)):
            for j in range(i + 1, len(pairs)):
                a_val, a_expr = pairs[i]
                b_val, b_expr = pairs[j]

                # Build a new list of remaining numbers
                rest = [pairs[k] for k in range(len(pairs)) if k not in (i, j)]

                # Generate all possible operations
                operations = [
                    (a_val + b_val, f"({a_expr}+{b_expr})"),
                    (a_val - b_val, f"({a_expr}-{b_expr})"),
                    (b_val - a_val, f"({b_expr}-{a_expr})"),
                    (a_val * b_val, f"({a_expr}*{b_expr})"),
                ]

                # Division operations, avoid division by zero
                if b_val != 0:
                    operations.append((a_val / b_val, f"({a_expr}/{b_expr})"))
                if a_val != 0:
                    operations.append((b_val / a_val, f"({b_expr}/{a_expr})"))

                # Recurse on each possibility
                for new_val, new_expr in operations:
                    result = helper(rest + [(new_val, new_expr)])
                    if result:
                        val = eval(result, {"__builtins__": None}, {})
                        assert abs(float(val) - float(target)) < 1e-5, f"Invalid expression: {result}"
                        return result
        return None

    expression = helper(initial)
    if expression is None:
        raise ValueError(f"No solution found for nums={nums} target={target}")
    return expression

def add_gold_answer_to_dataset(dataset):
    """
    Load the HF dataset, compute the countdown solution for each example, and return a new Dataset with 'gold_answer'.

    Args:
        dataset : Hugging Face dataset

    Returns:
        datasets.Dataset: with additional 'gold_answer' column
    """
    def solve(example):
        nums = example['nums']
        target = example['target']
        try:
            example['gold_answer'] = find_countdown_expression(nums, target)
        except ValueError:
            example['gold_answer'] = None
        return example

    new_ds = dataset.map(solve)
    return new_ds

In [None]:
dataset["train"] = add_gold_answer_to_dataset(dataset["train"])
dataset["test"] = add_gold_answer_to_dataset(dataset["test"])



In [None]:
import os 
dataset.push_to_hub("Jeremmmyyyyy/Math", private=False, token=os.getenv("HF_TOKEN"))