# Generate Multiplication Datasets

# Either a or b is an n_digits number
ensure either a or b is an n_digits number, and the other can be any number from 0 to 10**n_digits - 1:

In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import os

class AdditionDataset(Dataset):
    def __init__(self, n_digits, n_operand, num_samples, reverse=False, prepend=False):
        self.n_digits = n_digits
        self.n_operand = n_operand
        self.num_samples = num_samples
        self.reverse = reverse
        self.prepend = prepend
        self.data = self.generate_data()
        
    def generate_data(self):
        data = set()
        max_unique_combinations = 18*10**(2*self.n_digits-1)-81*10**(2*self.n_digits-2) if self.n_digits>1 else 100

        while len(data) < self.num_samples and len(data) < max_unique_combinations:
            # Ensure one of the numbers is n_digits and the other can be from 0 to 10**n_digits - 1
            if random.choice([True, False]):
                a = random.randint(10**(self.n_digits - 1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)
                b = random.randint(0, 10**self.n_digits - 1)
            else:
                a = random.randint(0, 10**self.n_digits - 1)
                b = random.randint(10**(self.n_digits - 1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)

            c = a * b
            if self.reverse:
                if not self.prepend:
                    c_reversed = str(c)[::-1]  # Reverse the result
                    data.add(f"{a}+{b}={c_reversed};")  # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)[::-1]
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")
            else:
                if not self.prepend:
                    data.add(f"{a}+{b}={c};")   # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")

        return list(data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def save_to_txt(dataset, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    unique_lines = set(dataset)
    with open(filename, 'w') as f:
        for line in unique_lines:
            f.write(line + "\n")

# Parameters
n_digits = 9
n_operand = 20
num_samples = 500000
prepend = True

# Create datasets
# dataset_standard = AdditionDataset(n_digits, n_operand, num_samples, reverse=False, prepend=prepend)
dataset_reversed = AdditionDataset(n_digits, n_operand, num_samples, reverse=True, prepend=prepend)

# Save datasets to txt files
# save_to_txt(dataset_standard, f'addition_dataset_{n_digits}_{int(num_samples/1000)}k_standard_{int(prepend)}.txt')
save_to_txt(dataset_reversed, f'multiply_dataset_{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt')

print(f"Datasets saved to addition_dataset_{n_digits}_{int(num_samples/1000)}k_standard_{int(prepend)}.txt and addition_dataset_{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt")


Datasets saved to addition_dataset_9_500k_standard_1.txt and addition_dataset_9_500k_reversed_1.txt


In [6]:
astr = f'%0{6}d' % 1234
astr

'001234'

# Both a and b are n_digits numbers

ensure the dataset has no duplicate lines and that the number of unique lines meets the required conditions

In [11]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import os

class AdditionDataset(Dataset):
    def __init__(self, n_digits, n_operand, num_samples, reverse=False, prepend=False):
        self.n_digits = n_digits
        self.n_operand = n_operand
        self.num_samples = num_samples
        self.reverse = reverse
        self.prepend = prepend
        self.data = self.generate_data()
        
    def generate_data(self):
        data = set()
        max_unique_combinations = 9 * 10 ** (self.n_digits - 1) if self.n_digits>1 else 10 # Maximum unique n-digit combinations
        max_unique_combinations = max_unique_combinations**2

        while len(data) < self.num_samples and len(data) < max_unique_combinations:
            a = random.randint(10**(self.n_digits-1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)
            b = random.randint(10**(self.n_digits-1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)

            c = a * b
            if self.reverse:
                if not self.prepend:
                    c_reversed = str(c)[::-1]  # Reverse the result
                    data.add(f"{a}+{b}={c_reversed};")  # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)[::-1]
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")
            else:
                if not self.prepend:
                    data.add(f"{a}+{b}={c};")  # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")

        return list(data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def save_to_txt(dataset, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    unique_lines = set(dataset)
    with open(filename, 'w') as f:
        for line in unique_lines:
            f.write(line + "\n")

# Parameters
n_digits = 1
n_operand = 20
num_samples = 100000
prepend = True

# Create datasets
dataset_reversed = AdditionDataset(n_digits, n_operand, num_samples, reverse=True, prepend=prepend)

# Save datasets to txt files
save_to_txt(dataset_reversed, f'addition_dataset_ab{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt')

print(f"Datasets saved to addition_dataset_ab{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt")


Datasets saved to addition_dataset_ab8_100k_reversed_1.txt


# Combine datasets as input.txt

## 123-digit addition dataset

combine the three datasets named 

addition_dataset_1_reversed.txt, 

addition_dataset_2_reversed.txt, 

addition_dataset_3_reversed.txt, 

and sample $p_i$% from each dataset:

In [14]:
import os
import random

def load_dataset(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return lines

def sample_dataset(dataset, sample_rate=0.01):
    sample_size = int(len(dataset) * sample_rate)
    return random.sample(dataset, sample_size)

def save_combined_dataset(datasets, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    # Shuffle the combined dataset
    random.shuffle(datasets)
    
    with open(filename, 'w') as f:
        for line in datasets:
            f.write(line)

# Load datasets
# dataset_1 = load_dataset('addition_dataset_1_reversed.txt')
# dataset_2 = load_dataset('addition_dataset_2_reversed.txt')
# dataset_3 = load_dataset('addition_dataset_3_reversed.txt')

dataset_1 = load_dataset('addition_dataset_1_reversed_prepend.txt')
dataset_2 = load_dataset('addition_dataset_2_reversed_prepend.txt')
dataset_3 = load_dataset('addition_dataset_3_reversed_prepend.txt')

# Sample 1% from each dataset
sampled_dataset_1 = sample_dataset(dataset_1, sample_rate=1.0)
sampled_dataset_2 = sample_dataset(dataset_2, sample_rate=1.0)
sampled_dataset_3 = sample_dataset(dataset_3, sample_rate=0.05)

# Combine sampled datasets
combined_dataset = sampled_dataset_1 + sampled_dataset_2 + sampled_dataset_3
# print(combined_dataset)

# Save combined dataset to a new file
save_combined_dataset(combined_dataset, 'input.txt')

print("Combined sampled dataset saved to input.txt")


Combined sampled dataset saved to input.txt


## 456-digit addition dataset

In [10]:
import os
import random

def load_dataset(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return lines

def sample_dataset(dataset, sample_rate=0.01):
    sample_size = int(len(dataset) * sample_rate)
    return random.sample(dataset, sample_size)

def save_combined_dataset(datasets, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    # Shuffle the combined dataset
    random.shuffle(datasets)
    
    with open(filename, 'w') as f:
        for line in datasets:
            f.write(line)

# Load datasets
# 45-addition
dataset_1 = load_dataset('multiply_dataset_3_500k_reversed_1.txt')
dataset_2 = load_dataset('multiply_dataset_4_500k_reversed_1.txt')
sampled_dataset_1 = sample_dataset(dataset_1, sample_rate=0.6)
sampled_dataset_2 = sample_dataset(dataset_2, sample_rate=0.4)

# Combine sampled datasets
combined_dataset = sampled_dataset_1 + sampled_dataset_2 #+ sampled_dataset_3 + sampled_dataset_4 + sampled_dataset_5

# Save combined dataset to a new file
save_combined_dataset(combined_dataset, 'input_prepend_34.txt')

print("Combined sampled dataset saved to input.txt")


Combined sampled dataset saved to input.txt


## 123456-digit addition dataset

In [3]:
import os
import random

def load_dataset(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return lines

def sample_dataset(dataset, sample_rate=0.01):
    sample_size = int(len(dataset) * sample_rate)
    return random.sample(dataset, sample_size)

def save_combined_dataset(datasets, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    # Shuffle the combined dataset
    random.shuffle(datasets)
    
    with open(filename, 'w') as f:
        for line in datasets:
            f.write(line)

# Load datasets
# dataset_1 = load_dataset('input_123_5_prepend.txt')
# dataset_2 = load_dataset('input_456_40_prepend.txt')

dataset_1 = load_dataset('addition_dataset_6_100k_reversed_prepend.txt')
dataset_2 = load_dataset('addition_dataset_7_100k_reversed_prepend.txt')

# Sample 1% from each dataset
sampled_dataset_1 = sample_dataset(dataset_1, sample_rate=0.6)
sampled_dataset_2 = sample_dataset(dataset_2, sample_rate=0.6)

# Combine sampled datasets
combined_dataset = sampled_dataset_1 + sampled_dataset_2
# print(combined_dataset)

# Save combined dataset to a new file
save_combined_dataset(combined_dataset, 'input.txt')

print("Combined sampled dataset saved to input.txt")


Combined sampled dataset saved to input.txt


# Generate Modular Addition Datasets

## Either a or b is an n_digits number

(a+b) mod p, p=50,100,101,200

In [87]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import os

class AdditionDataset(Dataset):
    def __init__(self, p, n_digits, n_operand, num_samples, reverse=False, prepend=False):
        self.p = p
        self.n_digits = n_digits
        self.n_operand = n_operand
        self.num_samples = num_samples
        self.reverse = reverse
        self.prepend = prepend
        self.data = self.generate_data()
        
    def generate_data(self):
        data = set()
        max_unique_combinations = 18*10**(2*self.n_digits-1)-81*10**(2*self.n_digits-2) if self.n_digits>1 else 100

        while len(data) < self.num_samples and len(data) < max_unique_combinations:
            # Ensure one of the numbers is n_digits and the other can be from 0 to 10**n_digits - 1
            if random.choice([True, False]):
                a = random.randint(10**(self.n_digits - 1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)
                b = random.randint(0, 10**self.n_digits - 1)
            else:
                a = random.randint(0, 10**self.n_digits - 1)
                b = random.randint(10**(self.n_digits - 1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)

            c = (a * b) % self.p
            if self.reverse:
                if not self.prepend:
                    c_reversed = str(c)[::-1]  # Reverse the result
                    data.add(f"{a}+{b}={c_reversed};")   # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)[::-1]
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")
            else:
                if not self.prepend:
                    data.add(f"{a}+{b}={c};")   # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")

        return list(data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def save_to_txt(dataset, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    unique_lines = set(dataset)
    with open(filename, 'w') as f:
        for line in unique_lines:
            f.write(line + "\n")

# Parameters
p = 201
n_digits = 8
n_operand = 20
num_samples = 500000
prepend = True

# Create datasets
# dataset_standard = AdditionDataset(p, n_digits, n_operand, num_samples, reverse=False, prepend=prepend)
dataset_reversed = AdditionDataset(p, n_digits, n_operand, num_samples, reverse=True, prepend=prepend)

# Save datasets to txt files
# save_to_txt(dataset_standard, f'addition_dataset_{n_digits}_{int(num_samples/1000)}k_standard_{int(prepend)}.txt')
save_to_txt(dataset_reversed, f'mod_multiply_p{p}_{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt')

print(f"Datasets saved to addition_dataset_{n_digits}_{int(num_samples/1000)}k_standard_{int(prepend)}.txt and addition_dataset_{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt")


Datasets saved to addition_dataset_8_500k_standard_1.txt and addition_dataset_8_500k_reversed_1.txt


## Both a and b are n_digits numbers

ensure the dataset has no duplicate lines and that the number of unique lines meets the required conditions

In [92]:
import torch
from torch.utils.data import Dataset, DataLoader
import random
import os

class AdditionDataset(Dataset):
    def __init__(self, p, n_digits, n_operand, num_samples, reverse=False, prepend=False):
        self.p = p
        self.n_digits = n_digits
        self.n_operand = n_operand
        self.num_samples = num_samples
        self.reverse = reverse
        self.prepend = prepend
        self.data = self.generate_data()
        
    def generate_data(self):
        data = set()
        max_unique_combinations = 9 * 10 ** (self.n_digits - 1) if self.n_digits>1 else 10 # Maximum unique n-digit combinations
        max_unique_combinations = max_unique_combinations**2

        while len(data) < self.num_samples and len(data) < max_unique_combinations:
            a = random.randint(10**(self.n_digits-1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)
            b = random.randint(10**(self.n_digits-1), 10**self.n_digits - 1) if self.n_digits>1 else random.randint(0, 9)

            c = (a * b) % self.p
            if self.reverse:
                if not self.prepend:
                    c_reversed = str(c)[::-1]  # Reverse the result
                    data.add(f"{a}+{b}={c_reversed};")   # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)[::-1]
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")
            else:
                if not self.prepend:
                    data.add(f"{a}+{b}={c};")   # still use + as * for convenience
                else:
                    a_str = f'%0{self.n_operand}d' % a
                    b_str = f'%0{self.n_operand}d' % b
                    c_reversed_str = (f'%0{self.n_operand+1}d' % c)
                    data.add(f"{a_str}+{b_str}={c_reversed_str};")

        return list(data)
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        return self.data[idx]

def save_to_txt(dataset, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    unique_lines = set(dataset)
    with open(filename, 'w') as f:
        for line in unique_lines:
            f.write(line + "\n")

# # Parameters
# p = 101
# n_digits = 9
# n_operand = 20
# num_samples = 100000
# prepend = True

# # Create datasets
# dataset_reversed = AdditionDataset(p, n_digits, n_operand, num_samples, reverse=True, prepend=prepend)

# # Save datasets to txt files
# save_to_txt(dataset_reversed, f'mod_addition_p{p}_ab{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt')

# print(f"Datasets saved to addition_dataset_ab{n_digits}_{int(num_samples/1000)}k_standard_{int(prepend)}.txt and addition_dataset_ab{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt")


In [93]:
n_operand = 20
num_samples = 100000
prepend = True

modulus_list = [50, 51, 100, 101, 150, 151, 200, 201]

for p in modulus_list:
    for n_digits in range(1, 10):
        # Create datasets
        dataset_reversed = AdditionDataset(p, n_digits, n_operand, num_samples, reverse=True, prepend=prepend)
        
        # Save datasets to txt files
        save_to_txt(dataset_reversed, f'./mod_multiply_ab/mod_multiply_p{p}_ab{n_digits}_{int(num_samples/1000)}k_reversed_{int(prepend)}.txt')

print('Completed!')

Completed!


In [91]:
import os
import random

def load_dataset(filename):
    with open(filename, 'r') as f:
        lines = f.readlines()
    return lines

def sample_dataset(dataset, sample_rate=0.01):
    sample_size = int(len(dataset) * sample_rate)
    return random.sample(dataset, sample_size)

def save_combined_dataset(datasets, filename):
    # Check if the file exists and remove it if it does
    if os.path.exists(filename):
        os.remove(filename)
    
    # Shuffle the combined dataset
    random.shuffle(datasets)
    
    with open(filename, 'w') as f:
        for line in datasets:
            f.write(line)

# Load datasets
# 34-addition
# dataset_1 = load_dataset('multiply_dataset_1_500k_reversed_1.txt')
# dataset_2 = load_dataset('multiply_dataset_2_500k_reversed_1.txt')
# sampled_dataset_1 = sample_dataset(dataset_1, sample_rate=1.0)
# sampled_dataset_2 = sample_dataset(dataset_2, sample_rate=1.0)

# 1234-addition
dataset_1 = load_dataset('multiply_dataset_1_500k_reversed_1.txt')
dataset_2 = load_dataset('multiply_dataset_2_500k_reversed_1.txt')
dataset_3 = load_dataset('multiply_dataset_3_500k_reversed_1.txt')
dataset_4 = load_dataset('multiply_dataset_4_500k_reversed_1.txt')
sampled_dataset_1 = sample_dataset(dataset_1, sample_rate=1.0)
sampled_dataset_2 = sample_dataset(dataset_2, sample_rate=1.0)
sampled_dataset_3 = sample_dataset(dataset_3, sample_rate=0.2)
sampled_dataset_4 = sample_dataset(dataset_4, sample_rate=0.2)


# Combine sampled datasets
combined_dataset = sampled_dataset_1 + sampled_dataset_2 + sampled_dataset_3 + sampled_dataset_4

# Save combined dataset to a new file
save_combined_dataset(combined_dataset, 'input_prepend_1234.txt')

print("Combined sampled dataset saved to input.txt")


Combined sampled dataset saved to input.txt
