# Strategy 1: Uniformly random samples

In [1]:
import json
import random
import os

def generate_addition_problem(k, reverse=True):
    """Build one addition problem string from two random k-digit operands.

    Each operand is drawn uniformly from ``10**(k-1)`` to ``10**k - 1``
    inclusive. The returned text has the form ``"a+b=sum"``.

    Args:
        k: Number of digits of each operand.
        reverse: When true, the digits of the sum are written in reverse
            order (least-significant digit first).

    Returns:
        The problem and its answer as a single string without a newline.
    """
    lo, hi = 10**(k-1), 10**k - 1
    left = random.randint(lo, hi)
    right = random.randint(lo, hi)
    digits = str(left + right)
    answer = digits[::-1] if reverse else digits
    return f"{left}+{right}={answer}"

def create_dataset(filename, num_samples, k_list=[3]):
    """Generate addition problems and write them to a text file, one per line.

    For every sample a digit count is picked at random from ``k_list`` and
    passed to ``generate_addition_problem``. A confirmation line is printed
    once the file has been written.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        num_samples: Number of problems to generate.
        k_list: Digit counts to choose from for each problem.
    """
    # Every problem is generated up front, before the output file is opened.
    problems = [
        generate_addition_problem(random.choice(k_list))
        for _ in range(num_samples)
    ]

    with open(filename, 'w') as out:
        out.writelines(problem + "\n" for problem in problems)
    print(f"Saved {num_samples} samples to {filename}")

if __name__ == "__main__":
    # Seed the global RNG once so that Restart Kernel -> Run All regenerates
    # the same files; every draw made after this point comes from this stream.
    random.seed(42)

    # 1. Training data: 3-digit addition (10k samples)
    create_dataset("train.txt", 10000, k_list=[3])

    # 2. In-Distribution Test: 3-digit
    # NOTE(review): drawn independently from the same distribution as the
    # training set, so a few test problems may also occur in train.txt;
    # deduplicate if a strict hold-out is required.
    create_dataset("test_id.txt", 1000, k_list=[3])

    # 3. OOD Test: 4-digit (Length Generalization)
    create_dataset("test_ood.txt", 500, k_list=[4])


Saved 10000 samples to train.txt
Saved 1000 samples to test_id.txt
Saved 500 samples to test_ood.txt


In [2]:
!head -n 5 train.txt

408+458=668
970+518=8841
599+150=947
793+826=9161
277+185=264


In [3]:
!head -n 5 test_id.txt

885+810=5961
676+312=889
806+219=5201
433+384=718
709+258=769


In [4]:
!head -n 5 test_ood.txt

7740+5180=02921
6988+4808=69711
6251+1558=9087
9772+8764=63581
5632+6704=63321


# Strategy 2: Mixed-Length Training

In [5]:
def generate_addition_problem_mixed(k_a, k_b, reverse=True):
    """Build one addition problem whose operands may differ in length.

    The first operand is drawn with ``k_a`` digits and the second with
    ``k_b`` digits. The returned text has the form ``"a+b=sum"``.

    Args:
        k_a: Digit count used for the first operand.
        k_b: Digit count used for the second operand.
        reverse: When true, the digits of the sum are written in reverse
            order (least-significant digit first).

    Returns:
        The problem and its answer as a single string without a newline.
    """
    def draw(num_digits):
        # A digit count of 1 (or less) starts at 0; longer operands start at
        # 10**(num_digits-1) so they have no leading zero.
        floor = 0 if num_digits <= 1 else 10**(num_digits-1)
        return random.randint(floor, 10**num_digits - 1)

    first = draw(k_a)
    second = draw(k_b)
    total = str(first + second)
    if reverse:
        return f"{first}+{second}={total[::-1]}"
    return f"{first}+{second}={total}"

def create_mixed_dataset(filename, num_samples, k_range=[1, 2, 3]):
    """Write mixed-length addition problems to a text file, one per line.

    For each problem the digit counts of the two operands are picked
    independently at random from ``k_range`` and passed to
    ``generate_addition_problem_mixed``. A confirmation line is printed after
    the last problem has been written.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        num_samples: Number of problems to generate.
        k_range: Digit counts to choose from for each operand.
    """
    with open(filename, 'w') as out:
        out.writelines(
            generate_addition_problem_mixed(
                random.choice(k_range),  # digit count of the left operand
                random.choice(k_range),  # digit count of the right operand
            ) + "\n"
            for _ in range(num_samples)
        )
        print(f"Saved {num_samples} samples to {filename}")

if __name__ == "__main__":
    # Training split: each operand has 1, 2 or 3 digits
    create_mixed_dataset(
        filename="train_mixed.txt", num_samples=10000, k_range=[1, 2, 3]
    )
    # In-distribution test split: both operands have 3 digits
    create_mixed_dataset(
        filename="test_id_mixed.txt", num_samples=1000, k_range=[3]
    )
    # Out-of-distribution test split: both operands have 4 digits
    create_mixed_dataset(
        filename="test_ood_mixed.txt", num_samples=500, k_range=[4]
    )

Saved 10000 samples to train_mixed.txt
Saved 1000 samples to test_id_mixed.txt
Saved 500 samples to test_ood_mixed.txt


In [6]:
!head -n 5 train_mixed.txt

35+8=34
1+399=004
573+7=085
527+7=435
9+97=601


In [7]:
!head -n 5 test_id_mixed.txt

607+310=719
481+949=0341
797+971=8671
354+943=7921
568+405=379


In [8]:
!head -n 5 test_ood_mixed.txt

1210+4461=1765
6179+3693=2789
5873+3842=5179
4041+6903=44901
7554+6521=57041


# Strategy 3: Zero Padding

In [9]:
def generate_padded_problem(max_k=4, reverse=True):
    """Build one zero-padded addition problem string.

    Each operand is drawn in two steps: a digit budget ``k`` is picked
    uniformly from 1 to ``max_k - 1``, then a value uniformly from 0 to
    ``10**k - 1``. An operand therefore has at most ``max_k - 1`` significant
    digits and may have fewer than ``k``. Operands are left-padded with zeros
    to ``max_k`` characters and the sum to ``max_k + 1`` characters, so every
    generated line has the same length.

    Args:
        max_k: Width (in characters) of each padded operand. Must be >= 2.
        reverse: When true, the padded sum is written in reverse order
            (least-significant digit first).

    Returns:
        A string of the form ``"0123+0045=86100"`` without a newline.

    Raises:
        ValueError: If ``max_k`` is smaller than 2, because there would be no
            valid digit budget between 1 and ``max_k - 1``.
    """
    if max_k < 2:
        # Without this check random.randint(1, max_k - 1) fails with an
        # "empty range" error that does not mention max_k.
        raise ValueError(f"max_k must be at least 2, got {max_k}")

    # Upper bound on the number of digits of each operand (1 to max_k-1).
    # Training stays one digit short of max_k so that full max_k-digit
    # operands can later be used as an out-of-distribution test.
    ka = random.randint(1, max_k - 1)
    kb = random.randint(1, max_k - 1)

    # The lower bound is 0, so an operand may have fewer than ka/kb digits.
    a = random.randint(0, 10**ka - 1)
    b = random.randint(0, 10**kb - 1)

    # PAD WITH ZEROS: "0123+0045="
    str_a = str(a).zfill(max_k)
    str_b = str(b).zfill(max_k)

    res_val = a + b
    # With these operands the sum never needs more than max_k digits
    # (e.g. 999+999 = 1998 for max_k=4). The extra slot reserves room for the
    # carry produced by two full max_k-digit operands (e.g. 9999+9999 = 19998).
    res_str = str(res_val).zfill(max_k + 1)

    if reverse:
        res_str = res_str[::-1]

    return f"{str_a}+{str_b}={res_str}"

def create_padded_dataset(filename, num_samples, max_k=4):
    """Write zero-padded addition problems to a text file, one per line.

    Each line comes from ``generate_padded_problem`` called with the given
    ``max_k``. A confirmation line is printed after the last problem has been
    written.

    Args:
        filename: Path of the file to create (overwritten if it exists).
        num_samples: Number of problems to generate.
        max_k: Padding width forwarded to ``generate_padded_problem``.
    """
    with open(filename, 'w') as out:
        out.writelines(
            generate_padded_problem(max_k=max_k) + "\n"
            for _ in range(num_samples)
        )
        print(f"Saved {num_samples} samples to {filename}")
        
if __name__ == "__main__":
    # Train on numbers padded to 4 slots (operands have at most 3 digits)
    create_padded_dataset("train_padded.txt", 10000, max_k=4)
    # Test In-Distribution: same generator as the training set, so operands
    # have up to 3 digits (not exactly 3), padded to 4
    create_padded_dataset("test_id_padded.txt", 1000, max_k=4)
    # Test Out-of-Distribution (actual 4-digit numbers, which fill all 4 slots
    # and therefore need no padding)
    ood_filename = "test_ood_padded.txt"
    ood_samples = 500
    with open(ood_filename, 'w') as f:
        for _ in range(ood_samples):
            # Force 4 digits
            a = random.randint(1000, 9999)
            b = random.randint(1000, 9999)
            # Pad the sum to 5 characters, then reverse it so the
            # least-significant digit comes first.
            res = str(a+b).zfill(5)[::-1]
            f.write(f"{a}+{b}={res}\n")
        # Report this file like the other datasets do.
        print(f"Saved {ood_samples} samples to {ood_filename}")

Saved 10000 samples to train_padded.txt
Saved 1000 samples to test_id_padded.txt


In [10]:
!head -n 5 train_padded.txt

0076+0800=67800
0037+0034=17000
0485+0004=98400
0004+0090=49000
0188+0753=14900


In [11]:
!head -n 5 test_id_padded.txt

0687+0943=03610
0079+0537=61600
0002+0696=89600
0309+0234=34500
0920+0001=12900


In [12]:
!head -n 5 test_ood_padded.txt

9361+4261=22631
5018+2517=53570
7567+6277=44831
8946+6588=43551
1680+6201=18870
