# Evaluation

In [1]:
import os
import torch
import pandas as pd
import tiktoken
import pickle
from model import GPTConfig, GPT

In [2]:
def load_model(ckpt_name, is_addition=True):
    """Load a GPT checkpoint from the output directory of the chosen task.

    Args:
        ckpt_name: Checkpoint file name inside the task's output directory.
        is_addition: If True read from 'out-addition-char', otherwise from
            'out-multiply-char'.

    Returns:
        The GPT model with the checkpoint weights loaded, moved to CUDA when
        available and to the CPU otherwise.
    """
    out_dir = 'out-addition-char' if is_addition else 'out-multiply-char'
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    checkpoint = torch.load(os.path.join(out_dir, ckpt_name), map_location=device)
    model = GPT(GPTConfig(**checkpoint['model_args']))

    # Strip the '_orig_mod.' prefix from any state-dict key that carries it
    # (the dict is edited in place, so snapshot the affected keys first).
    weights = checkpoint['model']
    prefix = '_orig_mod.'
    for name in [key for key in weights if key.startswith(prefix)]:
        weights[name[len(prefix):]] = weights.pop(name)

    model.load_state_dict(weights)
    model.to(device)
    return model

# Estimate Accuracy

##  Test Accuracy wrt ground truth

In [3]:
import os
import pickle
import torch
import numpy as np

def parse_equation(equation):
    """Split an equation string at '=' into a prompt and its answer.

    Returns:
        (operands, result): ``operands`` is the text before the first '=' with
        leading/trailing newlines stripped and '=' appended again; ``result``
        is the text that follows the first '=' (up to the next '=', if any).

    Raises:
        IndexError: if the string contains no '='.
    """
    pieces = equation.split('=')
    lhs = pieces[0].strip("\n")
    return f"{lhs}=", pieces[1]

def result_between_equal_and_semicolon(input_string):
    """Return the text between the first '=' and the first ';'.

    Returns None when either character is missing from the string.
    """
    eq_idx = input_string.find('=')
    semi_idx = input_string.find(';')
    if eq_idx == -1 or semi_idx == -1:
        return None
    return input_string[eq_idx + 1:semi_idx]

def load_dataset(file_path):
    """Read a text file and return its contents split on ';'."""
    with open(file_path, 'r') as handle:
        return handle.read().split(";")

def estimate_accuracy(model, data_dir, data_name='input.txt', split='train', n_eval=10000, batch_size=128, prepend=True, device=None):
    """Compare model generations with the answers stored in a dataset file.

    The file is split on ';' into equations. For each equation the text up to
    and including '=' is the prompt; the generated text between the first '='
    and the first ';' is compared with the answer from the file.

    Args:
        model: Object exposing ``generate(x, max_new_tokens, temperature=...,
            top_k=..., do_sample=...)``.
        data_dir: Directory containing ``data_name`` and ``meta.pkl``.
        data_name: Dataset file, relative to ``data_dir``.
        split: 'train' keeps the first 90% of the chunks, 'test' the last 10%;
            any other value keeps the whole file.
        n_eval: Maximum number of chunks kept from the split (the last kept
            chunk is then skipped, see the note in the body).
        batch_size: Number of prompts per ``generate`` call.
        prepend: If True, prompts start with '<bos>' and the '<bos>'/'<eos>'
            tokens are removed when decoding.
        device: Device for the prompt tensors. Defaults to the device of the
            model's first parameter.

    Returns:
        ``correct / evaluated``. A generation is counted as evaluated only if
        it contains both '=' and ';' and its answer does not start with a
        newline. Returns 0 when nothing was evaluated.

    Raises:
        FileNotFoundError: if ``meta.pkl`` is missing from ``data_dir``.
    """
    if device is None:
        device = next(model.parameters()).device

    data_list = load_dataset(os.path.join(data_dir, data_name))

    n = len(data_list)
    if split == 'train':
        eval_data = data_list[:int(n * 0.9)]
    elif split == 'test':
        eval_data = data_list[int(n * 0.9):]
    else:
        eval_data = data_list
    if len(eval_data) > n_eval:
        eval_data = eval_data[:n_eval]

    meta_path = os.path.join(data_dir, 'meta.pkl')
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"vocabulary file not found: {meta_path}")
    with open(meta_path, 'rb') as f:
        # pickle.load can execute arbitrary code; only load meta.pkl files you trust.
        meta = pickle.load(f)
    stoi, itos = meta['stoi'], meta['itos']

    if prepend:
        bos_id = stoi['<bos>']
        special_ids = {bos_id, stoi['<eos>']}

        def encode(s):
            return [bos_id] + [stoi[c] for c in s]

        def decode(ids):
            return ''.join(itos[i] for i in ids if i not in special_ids)
    else:
        def encode(s):
            return [stoi[c] for c in s]

        def decode(ids):
            return ''.join(itos[i] for i in ids)

    # NOTE(review): with prepend=True, encode('\n')[0] is the '<bos>' id rather
    # than the newline id, so short prompts are right-padded with '<bos>'.
    # Kept as is because the reported numbers depend on it - confirm intent.
    pad_id = encode('\n')[0]

    max_new_tokens = 23  # 5 for 3-digit
    temperature = 1.0
    top_k = 200

    all_input_ids = []
    all_ground_truths = []
    # The last chunk is skipped: splitting on ';' usually leaves a trailing
    # fragment without '=' that parse_equation cannot handle.
    # NOTE(review): when the split was truncated to n_eval this drops a real
    # equation instead (callers pass n_eval=10001 to evaluate 10000).
    for line in eval_data[:-1]:
        start, gt_c_reversed = parse_equation(line)
        all_input_ids.append(encode(start))
        all_ground_truths.append(gt_c_reversed)

    correct = 0
    total_eval = 0
    with torch.no_grad():
        for i in range(0, len(all_input_ids), batch_size):
            batch_ground_truths = all_ground_truths[i:i + batch_size]
            batch_input_tensors = [
                torch.tensor(ids, dtype=torch.long, device=device)
                for ids in all_input_ids[i:i + batch_size]
            ]
            x = torch.nn.utils.rnn.pad_sequence(batch_input_tensors, batch_first=True, padding_value=pad_id)

            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, do_sample=False)
            for gt, y_gen in zip(batch_ground_truths, y):
                gen_c_reversed = result_between_equal_and_semicolon(decode(y_gen.tolist()))
                # Generations without a complete '=...;' answer are not counted at all.
                if gen_c_reversed is not None and not gen_c_reversed.startswith("\n"):
                    total_eval += 1
                    if gt == gen_c_reversed:
                        correct += 1

    accuracy = correct / total_eval if total_eval != 0 else 0
    print(total_eval)
    return accuracy


## Modular Addition

In [None]:
# Collect the test-accuracy table for the modular-addition checkpoints.
# NOTE: the estimate_accuracy call below has no device argument, i.e. it matches
# the definition in the "Test Accuracy wrt ground truth" section.
dataset = 'addition'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

modulus_list = [50, 51, 100, 101, 150, 151, 200, 201]
ckpt_name_list = [
    'final_ckpt_prepend_p50_4.pt',
    'final_ckpt_prepend_p51_4.pt',
    'final_ckpt_prepend_p100_4.pt',
    'final_ckpt_prepend_p101_4.pt',
    'ckpt_prepend_p150_1234_20w.pt',
    'final_ckpt_prepend_p151_4.pt',
    'final_ckpt_prepend_p200_4.pt',
    'final_ckpt_prepend_p201_4.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per modulus.
for p, ckpt_name in zip(modulus_list, ckpt_name_list):
    print(f"The modulus p={p}")
    model = load_model(ckpt_name)
    model.eval()

    # One accuracy per operand length, 1 to 9 digits.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'./mod_addtion_ab/mod_addition_p{p}_ab{ndigit}_100k_reversed_1.txt'
        # 'test_temp' is not a named split, so the whole file is used (capped by n_eval).
        acc = estimate_accuracy(model, data_dir, data_name, split='test_temp', n_eval=10001, batch_size=128, prepend=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by modulus, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = modulus_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'modular_addition_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

In [70]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'modular_addition_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [71]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [f'$p={modulus}$' for modulus in modulus_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$p=150$ & 100 & 100 & 100 & 100 & 33.2 & 33.6 & 32.3 & 33.0 & 33.7 \\
\bottomrule
\end{tabular}



## Standard Addition

In [51]:
# Collect the test-accuracy table for the standard-addition checkpoints.
# NOTE: the estimate_accuracy call below has no device argument, i.e. it matches
# the definition in the "Test Accuracy wrt ground truth" section.
dataset = 'addition'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_list = [4, 'ab4', 5, 6, 45, 56, 67]
ckpt_name_list = [
    'final_ckpt_prepend_4.pt',
    'final_ckpt_prepend_ab4.pt',
    'final_ckpt_prepend_5.pt',
    'final_ckpt_prepend_6.pt',
    'final_ckpt_prepend_45.pt',
    'final_ckpt_prepend_56.pt',
    'final_ckpt_prepend_67.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per model label.
for d, ckpt_name in zip(model_list, ckpt_name_list):
    print(f"The model d={d}")
    model = load_model(ckpt_name)
    model.eval()

    # One accuracy per operand length, 1 to 9 digits, on the last 10% of each file.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'addition_dataset_{ndigit}_100k_reversed_prepend.txt'
        acc = estimate_accuracy(model, data_dir, data_name, split='test', n_eval=10001, batch_size=128, prepend=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by model, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = model_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'addition_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model d=4
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=ab4
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=5
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=6
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=45
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=56
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=67
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
       0    1       2    3       4    5    6    7    8
4    1.0  1.0  1.0000  1.0  0.0000  0.0  0.0  0.0  0.0
ab4  1.0  1.0  0.7264  1.0  0.0001  0.0  0.0  0.0  0.0
5    1.0  1.0  1.0000  1.0  1.0000  0.0  0.0  0.0  0.0
6    1.0  1.0  1.0000  1.0  1.0000  1.0  0.0  0.0  0.0
45   1.0  1.0  1.0000  1.0  1.0000  0.0  0.0  0.0  0.0
56   1.0  1.0  1.0000

In [64]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'addition_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [65]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [r'$\mathcal{D}_{'+f'{d}'+r'}$' for d in model_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$\mathcal{D}_{4}$ & 100 & 100 & 100.0 & 100 & 0.0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{ab4}$ & 100 & 100 & 72.6 & 100 & 0.0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{5}$ & 100 & 100 & 100.0 & 100 & 100.0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{6}$ & 100 & 100 & 100.0 & 100 & 100.0 & 100 & 0 & 0 & 0 \\
$\mathcal{D}_{45}$ & 100 & 100 & 100.0 & 100 & 100.0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{56}$ & 100 & 100 & 100.0 & 100 & 100.0 & 100 & 0 & 0 & 0 \\
$\mathcal{D}_{67}$ & 100 & 100 & 100.0 & 100 & 100.0 & 100 & 100 & 0 & 0 \\
\bottomrule
\end{tabular}



## Standard Multiplication

In [18]:
# Collect the test-accuracy table for the standard-multiplication checkpoints.
# NOTE: the estimate_accuracy call below has no device argument, i.e. it matches
# the definition in the "Test Accuracy wrt ground truth" section.
dataset = 'multiply'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model_list = [12, 2, 3, 34, 234]
ckpt_name_list = [
    'ckpt_prepend_multip_12_micro.pt',
    'half_ckpt_prepend_multip_2_micro.pt',
    'final_ckpt_prepend_multip_3.pt',
    'final_ckpt_prepend_multip_34.pt',
    'half_ckpt_prepend_multip_234.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per model label.
for d, ckpt_name in zip(model_list, ckpt_name_list):
    print(f"The model d={d}")
    model = load_model(ckpt_name, is_addition=False)
    model.eval()

    # One accuracy per operand length, 1 to 9 digits, on the last 10% of each file.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'multiply_dataset_{ndigit}_500k_reversed_1.txt'
        acc = estimate_accuracy(model, data_dir, data_name, split='test', n_eval=10001, batch_size=128, prepend=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by model, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = model_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'mutiplication_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model d=12
number of parameters: 0.82M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=2
number of parameters: 0.82M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=3
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=34
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
The model d=234
number of parameters: 10.73M
10
990
10000
10000
10000
10000
10000
10000
10000
       0         1       2       3    4    5    6    7    8
12   1.0  1.000000  0.0013  0.0004  0.0  0.0  0.0  0.0  0.0
2    0.8  0.993939  0.0013  0.0004  0.0  0.0  0.0  0.0  0.0
3    1.0  0.963636  0.9896  0.0004  0.0  0.0  0.0  0.0  0.0
34   0.8  0.098990  0.9724  0.0132  0.0  0.0  0.0  0.0  0.0
234  1.0  1.000000  0.9885  0.8054  0.0  0.0  0.0  0.0  0.0
Completed!


In [19]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'mutiplication_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [20]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [r'$\mathcal{D}_{'+f'{d}'+r'}$' for d in model_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$\mathcal{D}_{12}$ & 100.0 & 100.0 & 0.1 & 0.0 & 0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{2}$ & 80.0 & 99.4 & 0.1 & 0.0 & 0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{3}$ & 100.0 & 96.4 & 99.0 & 0.0 & 0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{34}$ & 80.0 & 9.9 & 97.2 & 1.3 & 0 & 0 & 0 & 0 & 0 \\
$\mathcal{D}_{234}$ & 100.0 & 100.0 & 98.9 & 80.5 & 0 & 0 & 0 & 0 & 0 \\
\bottomrule
\end{tabular}



## Modular Multiplication

In [8]:
# Collect the test-accuracy table for the modular-multiplication checkpoints.
# NOTE: the estimate_accuracy call below has no device argument, i.e. it matches
# the definition in the "Test Accuracy wrt ground truth" section.
dataset = 'multiply'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

modulus_list = [50, 51, 100, 150, 200, 201]
ckpt_name_list = [
    'final_ckpt_prepend_modmultip_p50_3.pt',
    'final_ckpt_prepend_modmultip_p51_3.pt',
    'final_ckpt_prepend_modmultip_p100_3.pt',
    'ckpt_prepend_modmultip_p150_3.pt',
    'ckpt_prepend_modmultip_p200_3.pt',
    'half_ckpt_prepend_modmultip_p201_3.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per modulus.
for p, ckpt_name in zip(modulus_list, ckpt_name_list):
    print(f"The model p={p}")
    model = load_model(ckpt_name, is_addition=False)
    model.eval()

    # One accuracy per operand length, 1 to 9 digits, on the last 10% of each file.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'./mod_multiply_ab/mod_multiply_p{p}_ab{ndigit}_100k_reversed_1.txt'
        acc = estimate_accuracy(model, data_dir, data_name, split='test', n_eval=10001, batch_size=128, prepend=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by modulus, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = modulus_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'modular_mutiplication_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model p=50
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
The model p=51
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
The model p=100
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
The model p=150
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
The model p=200
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
The model p=201
number of parameters: 10.73M
10
810
10000
10000
10000
10000
10000
10000
10000
       0         1       2       3       4       5       6       7       8
50   1.0  1.000000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000
51   1.0  1.000000  0.9973  0.0259  0.0245  0.0280  0.0237  0.0251  0.0319
100  1.0  1.000000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000
150  0.6  0.545679  0.5242  0.5259  0.5310  0.5273  0.5377  0.5299  0.5331
200  0.1  0.627160  0.6305  0.6288  0.6215  0.6156  0.6179  0.6

In [20]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'modular_mutiplication_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [21]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [f'$p={modulus}$' for modulus in modulus_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$p=50$ & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$p=51$ & 100.0 & 100.0 & 99.7 & 2.6 & 2.5 & 2.8 & 2.4 & 2.5 & 3.2 \\
$p=100$ & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$p=150$ & 60.0 & 54.6 & 52.4 & 52.6 & 53.1 & 52.7 & 53.8 & 53.0 & 53.3 \\
$p=200$ & 10.0 & 62.7 & 63.0 & 62.9 & 62.2 & 61.6 & 61.8 & 62.5 & 62.9 \\
$p=201$ & 80.0 & 78.3 & 92.2 & 0.7 & 0.6 & 0.5 & 0.6 & 0.6 & 0.6 \\
\bottomrule
\end{tabular}



## OOD Test Accuracy wrt equivalence class version

In [11]:
import os
import pickle
import torch
import numpy as np

def parse_equation(equation):
    """Split an equation string at '=' into a prompt and its answer.

    Returns:
        (operands, result): ``operands`` is the text before the first '=' with
        leading/trailing newlines stripped and '=' appended again; ``result``
        is the text that follows the first '=' (up to the next '=', if any).

    Raises:
        IndexError: if the string contains no '='.
    """
    pieces = equation.split('=')
    lhs = pieces[0].strip("\n")
    return f"{lhs}=", pieces[1]
    
def parse_operands(start):
    """Parse a prompt such as 'a+b=' into the integer pair (a, b).

    The string is split on '+'; the last character of the second piece (the
    trailing '=' in prompts produced by parse_equation) is discarded before
    conversion.
    """
    pieces = start.split('+')
    return int(pieces[0]), int(pieces[1][:-1])
    
def modular_gtc_ndigit(start, ndigit=4, is_modular_addition=False, modulus=10, is_multiply=False):
    """Compute the reversed answer string using only the low ``ndigit`` digits of each operand.

    Args:
        start: Prompt string accepted by parse_operands.
        ndigit: Each operand is reduced modulo ``10**ndigit`` first.
        is_modular_addition: If True the combined value is further reduced
            modulo ``modulus`` (applies to the product as well when
            ``is_multiply`` is set).
        modulus: Modulus used when ``is_modular_addition`` is True.
        is_multiply: Multiply the reduced operands instead of adding them.

    Returns:
        The decimal digits of the result, in reversed order.
    """
    a, b = parse_operands(start)
    base = 10 ** ndigit
    low_a, low_b = a % base, b % base

    value = low_a * low_b if is_multiply else low_a + low_b
    if is_modular_addition:
        value %= modulus
    return str(value)[::-1]

def result_between_equal_and_semicolon(input_string):
    """Return the text between the first '=' and the first ';'.

    Returns None when either character is missing from the string.
    """
    eq_idx = input_string.find('=')
    semi_idx = input_string.find(';')
    if eq_idx == -1 or semi_idx == -1:
        return None
    return input_string[eq_idx + 1:semi_idx]

def load_dataset(file_path):
    """Read a text file and return its contents split on ';'."""
    with open(file_path, 'r') as handle:
        return handle.read().split(";")

def estimate_accuracy(model, data_dir, device, data_name='input.txt', split='train', ndigit=3, n_eval=10000, batch_size=128, modular=False, prepend=True, is_modular_addition=False, modulus=None, is_multiply=False):
    """Compare model generations with stored or recomputed ("modular") answers.

    The file is split on ';' into equations. For each equation the text up to
    and including '=' is the prompt; the generated text between the first '='
    and the first ';' is compared with the expected answer.

    Args:
        model: Object exposing ``eval()`` and ``generate(x, max_new_tokens,
            temperature=..., top_k=..., do_sample=...)``.
        data_dir: Directory containing ``data_name`` and ``meta.pkl``.
        device: Device on which the prompt tensors are created.
        data_name: Dataset file, relative to ``data_dir``.
        split: 'train' keeps the first 90% of the chunks, 'test' the last 10%;
            any other value keeps the whole file.
        ndigit: Forwarded to modular_gtc_ndigit when ``modular`` is True.
        n_eval: Maximum number of chunks kept from the split (the last kept
            chunk is then skipped, see the note in the body).
        batch_size: Number of prompts per ``generate`` call.
        modular: If False the answer stored in the file is the expected value;
            if True it is recomputed with modular_gtc_ndigit.
        prepend: If True, prompts start with '<bos>', the '<bos>'/'<eos>'
            tokens are removed when decoding, and recomputed answers are
            zero-padded to ``max_new_tokens - 2`` digits.
        is_modular_addition: Forwarded to modular_gtc_ndigit.
        modulus: Forwarded to modular_gtc_ndigit.
        is_multiply: Forwarded to modular_gtc_ndigit.

    Returns:
        ``correct / evaluated``. A generation is counted as evaluated only if
        it contains both '=' and ';' and its answer does not start with a
        newline. Returns 0 when nothing was evaluated.

    Raises:
        FileNotFoundError: if ``meta.pkl`` is missing from ``data_dir``.
    """
    data_list = load_dataset(os.path.join(data_dir, data_name))

    n = len(data_list)
    if split == 'train':
        eval_data = data_list[:int(n * 0.9)]
    elif split == 'test':
        eval_data = data_list[int(n * 0.9):]
    else:
        eval_data = data_list
    if len(eval_data) > n_eval:
        eval_data = eval_data[:n_eval]

    meta_path = os.path.join(data_dir, 'meta.pkl')
    if not os.path.exists(meta_path):
        raise FileNotFoundError(f"vocabulary file not found: {meta_path}")
    with open(meta_path, 'rb') as f:
        # pickle.load can execute arbitrary code; only load meta.pkl files you trust.
        meta = pickle.load(f)
    stoi, itos = meta['stoi'], meta['itos']

    if prepend:
        bos_id = stoi['<bos>']
        special_ids = {bos_id, stoi['<eos>']}

        def encode(s):
            return [bos_id] + [stoi[c] for c in s]

        def decode(ids):
            return ''.join(itos[i] for i in ids if i not in special_ids)
    else:
        def encode(s):
            return [stoi[c] for c in s]

        def decode(ids):
            return ''.join(itos[i] for i in ids)

    # NOTE(review): with prepend=True, encode('\n')[0] is the '<bos>' id rather
    # than the newline id, so short prompts are right-padded with '<bos>'.
    # Kept as is because the reported numbers depend on it - confirm intent.
    pad_id = encode('\n')[0]

    max_new_tokens = 23  # for 3-digit addition n+1+1
    temperature = 1.0
    top_k = 200

    all_input_ids = []
    all_ground_truths = []
    # The last chunk is skipped: splitting on ';' usually leaves a trailing
    # fragment without '=' that parse_equation cannot handle.
    # NOTE(review): when the split was truncated to n_eval this drops a real
    # equation instead (callers pass n_eval=10001 to evaluate 10000).
    for line in eval_data[:-1]:
        start, gt_c_reversed = parse_equation(line)
        all_input_ids.append(encode(start))

        if not modular:
            all_ground_truths.append(gt_c_reversed)
            continue

        # Use the `modulus` argument, not a notebook-level global.
        expected = modular_gtc_ndigit(start, ndigit=ndigit, is_modular_addition=is_modular_addition, modulus=modulus, is_multiply=is_multiply)
        if prepend:
            # Zero-pad the un-reversed number to max_new_tokens - 2 digits,
            # then reverse it again (the zeros end up at the end).
            expected = expected[::-1].zfill(max_new_tokens - 2)[::-1]
        all_ground_truths.append(expected)

    correct = 0
    total_eval = 0
    model.eval()
    with torch.no_grad():
        for i in range(0, len(all_input_ids), batch_size):
            batch_ground_truths = all_ground_truths[i:i + batch_size]
            batch_input_tensors = [
                torch.tensor(ids, dtype=torch.long, device=device)
                for ids in all_input_ids[i:i + batch_size]
            ]
            x = torch.nn.utils.rnn.pad_sequence(batch_input_tensors, batch_first=True, padding_value=pad_id)

            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k, do_sample=False)
            # y_gen.tolist() copies the ids to Python lists, so no explicit
            # transfer of y/x back to the CPU is needed.
            for gt, y_gen in zip(batch_ground_truths, y):
                gen_c_reversed = result_between_equal_and_semicolon(decode(y_gen.tolist()))
                # Generations without a complete '=...;' answer are not counted at all.
                if gen_c_reversed is not None and not gen_c_reversed.startswith("\n"):
                    total_eval += 1
                    if gt == gen_c_reversed:
                        correct += 1

    accuracy = correct / total_eval if total_eval != 0 else 0
    print(total_eval)
    return accuracy


## Modular Addition

In [None]:
# Collect the OOD accuracy table (recomputed "modular" ground truth) for the
# modular-addition checkpoints.
dataset = 'addition'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

is_modular_addition = True

modulus_list = [50, 51, 100, 101, 150, 151, 200, 201]
ckpt_name_list = [
    'final_ckpt_prepend_p50_4.pt',
    'final_ckpt_prepend_p51_4.pt',
    'final_ckpt_prepend_p100_4.pt',
    'final_ckpt_prepend_p101_4.pt',
    'ckpt_prepend_p150_1234_20w.pt',
    'final_ckpt_prepend_p151_4.pt',
    'final_ckpt_prepend_p200_4.pt',
    'final_ckpt_prepend_p201_4.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per modulus.
for p, ckpt_name in zip(modulus_list, ckpt_name_list):
    print(f"The modulus p={p}")
    model = load_model(ckpt_name)
    print("Successfully load model!")
    model.eval()

    # One accuracy per operand length, 1 to 9 digits.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'./mod_addtion_ab/mod_addition_p{p}_ab{ndigit}_100k_reversed_1.txt'
        # 'ood_test' is not a named split, so the whole file is used (capped by n_eval).
        acc = estimate_accuracy(model, data_dir, device, data_name, split='ood_test', ndigit=4, n_eval=10001, batch_size=128, modular=True, prepend=True, is_modular_addition=is_modular_addition, modulus=p)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by modulus, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = modulus_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'modular_addition_mod_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

In [74]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'modular_addition_mod_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [75]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [f'$p={modulus}$' for modulus in modulus_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$p=150$ & 100 & 100 & 100 & 100 & 100 & 100 & 100.0 & 99.8 & 99.7 \\
\bottomrule
\end{tabular}



## Standard Addition

In [56]:
# Collect the OOD accuracy table (recomputed "modular" ground truth) for the
# standard-addition checkpoints.
dataset = 'addition'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

is_modular_addition = False
p = None

model_list = [4, 'ab4', 5, 6, 45, 56, 67]
# ndigit passed to estimate_accuracy for each model, index-aligned with model_list.
ndigit_list = [4, 4, 5, 6, 5, 6, 7]
ckpt_name_list = [
    'final_ckpt_prepend_4.pt',
    'final_ckpt_prepend_ab4.pt',
    'final_ckpt_prepend_5.pt',
    'final_ckpt_prepend_6.pt',
    'final_ckpt_prepend_45.pt',
    'final_ckpt_prepend_56.pt',
    'final_ckpt_prepend_67.pt',
]

acc_list = []
for d, model_ndigit, ckpt_name in zip(model_list, ndigit_list, ckpt_name_list):
    print(f"The model d={d}")
    model = load_model(ckpt_name)
    print("Successfully load model!")
    model.eval()

    # One accuracy per operand length, 1 to 9 digits.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'addition_dataset_{ndigit}_100k_reversed_prepend.txt'
        # 'ood_test' is not a named split, so the whole file is used (capped by n_eval).
        acc = estimate_accuracy(model, data_dir, device, data_name, split='ood_test', ndigit=model_ndigit, n_eval=10001, batch_size=128, modular=True, prepend=True, is_modular_addition=is_modular_addition, modulus=p)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by model, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = model_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'addition_mod_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model d=4
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=ab4
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=5
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=6
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=45
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=56
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=67
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
       0         1       2    3       4       5       6       7       8
4    1.0  1.000000  1.0000  1.0  1.0000  1.0000  1.0000  1.0000  1.0000
ab4  1.0  0.998586

In [57]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'addition_mod_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [63]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [r'$\mathcal{D}_{'+f'{d}'+r'}$' for d in model_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$\mathcal{D}_{4}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$\mathcal{D}_{ab4}$ & 100 & 99.9 & 72.3 & 100 & 99.7 & 99.7 & 99.6 & 99.7 & 99.5 \\
$\mathcal{D}_{5}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$\mathcal{D}_{6}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$\mathcal{D}_{45}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$\mathcal{D}_{56}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$\mathcal{D}_{67}$ & 100 & 100.0 & 100.0 & 100 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
\bottomrule
\end{tabular}



## Standard Multiplication

In [22]:
# Collect the OOD accuracy table (recomputed "modular" ground truth) for the
# standard-multiplication checkpoints.
dataset = 'multiply'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

is_modular_addition = False
p = None

model_list = [12, 2, 3, 34, 234]
# ndigit passed to estimate_accuracy for each model, index-aligned with model_list.
ndigit_list = [2, 2, 3, 4, 4]
ckpt_name_list = [
    'ckpt_prepend_multip_12_micro.pt',
    'half_ckpt_prepend_multip_2_micro.pt',
    'final_ckpt_prepend_multip_3.pt',
    'final_ckpt_prepend_multip_34.pt',
    'half_ckpt_prepend_multip_234.pt',
]

acc_list = []
for d, model_ndigit, ckpt_name in zip(model_list, ndigit_list, ckpt_name_list):
    print(f"The model d={d}")
    model = load_model(ckpt_name, is_addition=False)
    print("Successfully load model!")
    model.eval()

    # One accuracy per operand length, 1 to 9 digits.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'multiply_dataset_{ndigit}_500k_reversed_1.txt'
        # 'ood_test' is not a named split, so the whole file is used (capped by n_eval).
        acc = estimate_accuracy(model, data_dir, device, data_name, split='ood_test', ndigit=model_ndigit, n_eval=10001, batch_size=128, modular=True, prepend=True, is_modular_addition=is_modular_addition, modulus=p, is_multiply=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by model, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = model_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'multiplication_mod_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model d=12
number of parameters: 0.82M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=2
number of parameters: 0.82M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=3
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=34
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
The model d=234
number of parameters: 10.73M
Successfully load model!
100
9900
10000
10000
10000
10000
10000
10000
10000
        0         1       2       3       4       5       6       7       8
12   1.00  0.999495  0.9295  0.9006  0.8604  0.8257  0.8063  0.7816  0.7772
2    0.85  0.993939  0.9812  0.9669  0.8897  0.8887  0.8842  0.8983  0.8870
3    1.00  0.962121  0.9880  0.9887  0.9895  0.9792  0.9794  0.9724  0.9709
34   0.85  0.102929  0.9687  0.0138  0.0089  0.0082  0.0079  0.0070  0.0022
234  1.00  0.999798  0.

In [23]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'multiplication_mod_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [24]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [r'$\mathcal{D}_{'+f'{d}'+r'}$' for d in model_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$\mathcal{D}_{12}$ & 100.0 & 99.9 & 93.0 & 90.1 & 86.0 & 82.6 & 80.6 & 78.2 & 77.7 \\
$\mathcal{D}_{2}$ & 85.0 & 99.4 & 98.1 & 96.7 & 89.0 & 88.9 & 88.4 & 89.8 & 88.7 \\
$\mathcal{D}_{3}$ & 100.0 & 96.2 & 98.8 & 98.9 & 99.0 & 97.9 & 97.9 & 97.2 & 97.1 \\
$\mathcal{D}_{34}$ & 85.0 & 10.3 & 96.9 & 1.4 & 0.9 & 0.8 & 0.8 & 0.7 & 0.2 \\
$\mathcal{D}_{234}$ & 100.0 & 100.0 & 98.9 & 81.0 & 75.6 & 76.2 & 73.8 & 67.5 & 66.9 \\
\bottomrule
\end{tabular}



## Modular Multiplication

In [22]:
# Collect the OOD accuracy table (recomputed "modular" ground truth) for the
# modular-multiplication checkpoints.
dataset = 'multiply'
data_dir = os.path.join('data', dataset)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The flag also enables the final "% modulus" step for products.
is_modular_addition = True

modulus_list = [50, 51, 100, 150, 200, 201]
ckpt_name_list = [
    'final_ckpt_prepend_modmultip_p50_3.pt',
    'final_ckpt_prepend_modmultip_p51_3.pt',
    'final_ckpt_prepend_modmultip_p100_3.pt',
    'ckpt_prepend_modmultip_p150_3.pt',
    'ckpt_prepend_modmultip_p200_3.pt',
    'half_ckpt_prepend_modmultip_p201_3.pt',
]

acc_list = []
# The two lists are index-aligned: one checkpoint per modulus.
for p, ckpt_name in zip(modulus_list, ckpt_name_list):
    print(f"The model p={p}")
    model = load_model(ckpt_name, is_addition=False)
    print("Successfully load model!")
    model.eval()

    # One accuracy per operand length, 1 to 9 digits.
    acc_list_p = []
    for ndigit in range(1, 10):
        data_name = f'./mod_multiply_ab/mod_multiply_p{p}_ab{ndigit}_100k_reversed_1.txt'
        # 'ood_test' is not a named split, so the whole file is used (capped by n_eval).
        acc = estimate_accuracy(model, data_dir, device, data_name, split='ood_test', ndigit=3, n_eval=10001, batch_size=128, modular=True, prepend=True, is_modular_addition=is_modular_addition, modulus=p, is_multiply=True)
        acc_list_p.append(acc)
    acc_list.append(acc_list_p)

# Rows are labelled by modulus, columns by operand-length index (0-based).
acc_list_df = pd.DataFrame(acc_list).T
acc_list_df.columns = modulus_list
acc_list_df = acc_list_df.T
print(acc_list_df)

# Persist the table so the LaTeX cell can be re-run without re-evaluating.
excel_file_name = 'modular_multiplication_mod_accuracy_results.xlsx'
acc_list_df.to_excel(excel_file_name, index=True)

print('Completed!')

The model p=50
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
The model p=51
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
The model p=100
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
The model p=150
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
The model p=200
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
The model p=201
number of parameters: 10.73M
Successfully load model!
100
8100
10000
10000
10000
10000
10000
10000
10000
        0         1       2       3       4       5       6       7       8
50   1.00  1.000000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000  1.0000
51   1.00  1.000000  0.9971  0.9977  0.9840  0.8442  0.8188  0.6864  0.5719
100  1.00  1.000000  1.0000  1.0000  1.0000  1.0

In [23]:
# Reload the accuracy table saved by the collection cell above.
# NOTE: read_excel is called without index_col, so the row labels written by
# to_excel(index=True) come back as the first data column.
excel_file_name = 'modular_multiplication_mod_accuracy_results.xlsx'
acc_list_df = pd.read_excel(excel_file_name)

In [25]:
# Print the accuracy table as LaTeX, in percent.
# The first column (the row labels when the frame comes from read_excel) is
# dropped before scaling and replaced by formatted labels.
temp = acc_list_df.drop(columns=acc_list_df.columns[0]) * 100
temp.index = [f'$p={modulus}$' for modulus in modulus_list]
print(temp.to_latex(index=True, float_format="%.1f", bold_rows=False))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & 0 & 1 & 2 & 3 & 4 & 5 & 6 & 7 & 8 \\
\midrule
$p=50$ & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$p=51$ & 100.0 & 100.0 & 99.7 & 99.8 & 98.4 & 84.4 & 81.9 & 68.6 & 57.2 \\
$p=100$ & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 & 100.0 \\
$p=150$ & 66.0 & 52.5 & 51.9 & 52.0 & 53.5 & 52.8 & 52.5 & 53.3 & 52.5 \\
$p=200$ & 24.0 & 62.3 & 63.1 & 61.8 & 62.8 & 61.6 & 62.8 & 62.0 & 62.8 \\
$p=201$ & 71.0 & 79.5 & 92.1 & 90.9 & 90.7 & 90.5 & 88.7 & 87.9 & 85.0 \\
\bottomrule
\end{tabular}



# Calculate the theoretical accuracy of modular addition


In [25]:
def count_valid_pairs(m, n, p):
    """Count operand pairs whose high parts satisfy ``(A + B) * 10**n % p == 0``.

    Each operand is treated as a high part ``A`` in ``1 .. 10**(m-n) - 1`` and a
    low part ``a0`` in ``0 .. 10**n - 1``. The test only involves the high
    parts, so every matching ``(A, B)`` accounts for all ``(10**n)**2``
    combinations of low parts. Reading the operands as ``a = A * 10**n + a0``,
    the condition is equivalent to ``(a + b) % p == (a0 + b0) % p``.

    Args:
        m: number of digits in each operand.
        n: number of low-order digits split off from each operand.
        p: the modulus (divisor).

    Returns:
        Tuple ``(count, total_pairs, ratio)`` where ``ratio = count / total_pairs``.
    """
    high_parts = range(1, 10**(m-n))
    low_parts = range(0, 10**n)
    combos_per_match = len(low_parts) ** 2

    # Direct enumeration of every (A, B); this is the brute-force reference count.
    matches = sum(
        1
        for high_a in high_parts
        for high_b in high_parts
        if (high_a + high_b) * 10**n % p == 0
    )

    count = matches * combos_per_match
    total_pairs = (10**(m-n) - 1) ** 2 * (10**n) ** 2
    return count, total_pairs, count / total_pairs

# Example usage: m = number of digits in a and b, n = number of digits for the
# modulus operation, p = divisor.
m, n, p = 7, 4, 151
valid_count, total_pairs, ratio = count_valid_pairs(m, n, p)
# A single print call with a newline separator emits one line per quantity.
print(
    f"Valid pairs: {valid_count}",
    f"Total pairs: {total_pairs}",
    f"Ratio: {ratio}",
    sep="\n",
)


Valid pairs: 658800000000
Total pairs: 99800100000000
Ratio: 0.0066011957903849795


In [42]:
def count_valid_pairs(m, n, p):
    """Count operand pairs whose high parts sum to a multiple of the reduced modulus.

    Each operand is treated as a high part ``A`` in ``1 .. 10**(m-n) - 1`` and a
    low part in ``0 .. 10**n - 1``. A pair of high parts ``(A, B)`` is valid when
    ``A + B`` is divisible by ``p // gcd(p, 10**n)``; every valid ``(A, B)``
    accounts for all ``(10**n)**2`` combinations of low parts.

    Args:
        m: number of digits in each operand.
        n: number of low-order digits split off from each operand.
        p: the modulus (divisor).

    Returns:
        Tuple ``(valid_count, total_pairs, ratio)`` with
        ``ratio = valid_count / total_pairs``.

    Raises:
        ZeroDivisionError: if ``p == 0`` or ``m == n`` (there are no pairs to count).
    """
    from math import gcd

    largest_high = 10**(m-n) - 1          # A and B each run over 1 .. largest_high
    low_combinations = (10**n)**2
    total_pairs = largest_high**2 * low_combinations

    # The reduced modulus does not depend on A or B, so compute it once instead of
    # once per (A, B) iteration. abs() keeps the divisibility test sign-independent,
    # as the original `%`-based check was.
    reduced = abs(p) // gcd(p, 10**n)

    # For a fixed A, the valid B are those for which A + B is a multiple of
    # `reduced`; A + B sweeps A+1 .. A+largest_high, and the number of multiples in
    # that window is a difference of two floor divisions. This replaces the inner
    # loop over B and yields exactly the same count.
    matching_high_pairs = sum(
        (A + largest_high) // reduced - A // reduced
        for A in range(1, largest_high + 1)
    )

    valid_count = matching_high_pairs * low_combinations
    ratio = valid_count / total_pairs
    return valid_count, total_pairs, ratio

# Example usage
m = 7    # number of digits in a and b
n = 4    # number of digits for the modulus operation
p = 201  # divisor
valid_count, total_pairs, ratio = count_valid_pairs(m, n, p)
# Assemble the three report lines first, then print them in one go.
print("\n".join([
    f"Valid pairs: {valid_count}",
    f"Total pairs: {total_pairs}",
    f"Ratio: {ratio}",
]))

Valid pairs: 496600000000
Total pairs: 99800100000000
Ratio: 0.00497594691788886


In [43]:
from math import gcd
import numpy as np

# Uses m, n and p as left in the kernel by a previously executed cell.
# p_prime is p divided by its common factor with 10**n; true division makes it a float.
p_prime = p/gcd(p, 10**n)
# Displayed value: the largest high part, 10**(m-n) - 1, measured in units of p_prime.
(10**(m-n)-1)/p_prime

4.970149253731344

In [44]:
# Fractional part of (10**(m-n) - 1) / p_prime, i.e. what is left after removing
# the whole number of times p_prime fits into the largest high part.
frac = (10**(m-n)-1)/p_prime-np.floor((10**(m-n)-1)/p_prime)
# That fractional part rescaled by the number of high-part values (10**(m-n) - 1).
epsilon = frac/(10**(m-n)-1)

In [45]:
# Display 1/p_prime next to 1/p_prime reduced by epsilon
# (presumably for comparison with the enumerated ratio printed earlier - TODO confirm).
1/p_prime, 1/p_prime-epsilon

(0.004975124378109453, 0.004004004004004004)

In [None]:
import numpy as np
import math
from math import gcd

# Modulus and number of low-order digits used for the bound below.
p, n = 201, 4
# p with its common factor with 10**n divided out (true division, so a float).
p_prime = p / gcd(p, 10 ** n)

# Real-valued bound first ...
bound = n + np.log10(p_prime / 2 + 1)
print(bound)
# ... then rounded up to the next whole number.
bound = math.ceil(bound)
print(bound)

# Calculate the theoretical accuracy of modular multiplication


In [47]:
def count_valid_pairs(m, n, p):
    """Brute-force count of operand pairs whose cross terms vanish modulo ``p``.

    Each operand is treated as a high part (``A`` or ``B``) in
    ``1 .. 10**(m-n) - 1`` and a low part (``a0`` or ``b0``) in ``0 .. 10**n - 1``.
    A combination is counted when
    ``A*B*10**(2n) + a0*B*10**n + A*b0*10**n`` is divisible by ``p``. Reading the
    operands as ``a = A * 10**n + a0``, that sum is ``a*b - a0*b0``.

    Unlike the addition case, not all combinations of a0 and b0 are valid for a
    given (A, B), so every combination is tested individually; the run time grows
    as ``(10**(m-n) - 1)**2 * 10**(2n)``.

    Args:
        m: number of digits in each operand.
        n: number of low-order digits split off from each operand.
        p: the modulus (divisor).

    Returns:
        Tuple ``(count, total_pairs, ratio)`` where ``ratio = count / total_pairs``.
    """
    high_parts = range(1, 10**(m-n))
    low_parts = range(0, 10**n)

    count = sum(
        1
        for high_a in high_parts
        for high_b in high_parts
        for low_a in low_parts
        for low_b in low_parts
        if ((high_a * high_b) * 10**(2*n)
            + low_a * high_b * 10**n
            + high_a * low_b * 10**n) % p == 0
    )

    total_pairs = (10**(m-n) - 1) ** 2 * (10**n) ** 2
    return count, total_pairs, count / total_pairs

# Example usage
m = 4   # number of digits in each operand
n = 3   # number of low-order digits split off
p = 51  # divisor
valid_count, total_pairs, ratio = count_valid_pairs(m, n, p)
# A single print call with a newline separator emits one line per quantity.
print(
    f"Valid pairs: {valid_count}",
    f"Total pairs: {total_pairs}",
    f"Ratio: {ratio}",
    sep="\n",
)


Valid pairs: 1942585
Total pairs: 81000000
Ratio: 0.02398253086419753


In [45]:
def count_valid_pairs(m, n, p):
    """Exhaustively count operand pairs whose cross terms are divisible by ``p``.

    High parts ``A`` and ``B`` run over ``1 .. 10**(m-n) - 1`` and low parts
    ``a_0`` and ``b_0`` over ``0 .. 10**n - 1``. A combination is valid when
    ``A*B*10**(2n) + 10**n * (A*b_0 + B*a_0)`` is a multiple of ``p``.

    Args:
        m: number of digits in each operand.
        n: number of low-order digits split off from each operand.
        p: the modulus (divisor).

    Returns:
        Tuple ``(valid_pairs, total_pairs, accuracy)`` with
        ``accuracy = valid_pairs / total_pairs``.
    """
    total_pairs = (10**(m-n) - 1)**2 * (10**n)**2
    high_values = range(1, 10**(m-n))
    low_values = range(0, 10**n)

    valid_pairs = 0
    for A in high_values:
        for B in high_values:
            # Term that does not depend on the low parts.
            high_term = A * B * 10**(2*n)
            for a_0 in low_values:
                # Everything except the b_0 contribution; integer arithmetic keeps this exact.
                partial = high_term + 10**n * (B * a_0)
                for b_0 in low_values:
                    remainder = (partial + 10**n * (A * b_0)) % p
                    if remainder == 0:
                        valid_pairs += 1

    return valid_pairs, total_pairs, valid_pairs / total_pairs

# Example: 4-digit operands, 3 low-order digits, modulus 51.
m = 4
n = 3
p = 51
valid_pairs, total_pairs, accuracy = count_valid_pairs(m, n, p)
# The last expression is the cell's displayed result.
(valid_pairs, total_pairs, accuracy)

(1942585, 81000000, 0.02398253086419753)

In [46]:
import numpy as np
import math
from math import gcd

# The overrides below are disabled, so p and n are whatever a previously
# executed cell left in the kernel.
# p = 201
# n = 4
# p with its common factor with 10**n divided out (true division, so a float).
p1 = p/gcd(p, 10**n)
# Same reduction against 10**(2*n).
p2 = p/gcd(p, 10**(2*n))

# Display both reduced moduli side by side.
p1, p2

(51.0, 51.0)