In [None]:
from google.colab import drive
import sys
import os


drive.mount('/content/drive')

FOLDERNAME_IN_MY_DRIVE = 'cse493s'  # <--- CHANGE THIS TO YOUR ACTUAL FOLDER PATH
assert FOLDERNAME_IN_MY_DRIVE is not None, "[!] Enter the foldername."

# Construct the full path to your project folder
PROJECT_FOLDER_PATH = os.path.join('/content/drive/MyDrive/', FOLDERNAME_IN_MY_DRIVE)

if PROJECT_FOLDER_PATH not in sys.path:
    sys.path.append(PROJECT_FOLDER_PATH)
    print(f"Added '{PROJECT_FOLDER_PATH}' to sys.path")
else:
    print(f"'{PROJECT_FOLDER_PATH}' is already in sys.path")

try:
    os.chdir(PROJECT_FOLDER_PATH)
    print(f"Changed current working directory to: {os.getcwd()}")
except FileNotFoundError:
    print(f"[ERROR] The folder '{PROJECT_FOLDER_PATH}' was not found. Please check your FOLDERNAME_IN_MY_DRIVE.")
    # You might want to stop execution here or handle the error appropriately

# Verify by listing files in the current directory
print("\nFiles in the current working directory (should be your project folder):")
!ls

Mounted at /content/drive
Added '/content/drive/MyDrive/cse493s' to sys.path
Changed current working directory to: /content/drive/MyDrive/cse493s

Files in the current working directory (should be your project folder):
data		  inference.py	out	     train.py
generate_data.py  model.py	__pycache__  Untitled0.ipynb


In [8]:
# Connect to the ipyparallel cluster and take a view over all of its engines.
# The former `%load_ext ipyparallel` line was removed: IPython rejects it with
# "The ipyparallel module is not an IPython extension."
# NOTE(review): per the ipyparallel docs, creating a Client inside IPython is what
# registers the %px magics — confirm for the installed ipyparallel version.
from ipyparallel import Client
rc = Client()
dview = rc[:]  # DirectView spanning every engine currently registered
dview.block = False  # non-blocking: calls on the view return AsyncResult objects

The ipyparallel module is not an IPython extension.


In [None]:
# Generate the small sanity-check dataset; train.txt, val.txt and test.txt are
# written under data/sanity_check.
!python generate_data.py --sanity_check --output_dir data/sanity_check

Saved 100 equations to data/sanity_check/train.txt
Saved 20 equations to data/sanity_check/val.txt
Saved 20 equations to data/sanity_check/test.txt

Generated sanity check data in data/sanity_check


In [None]:
# Sanity-check training run: a 1-layer model (n_embd=32, 4 heads) trained for 1000
# steps on data/sanity_check, logging every 10 steps and evaluating every 100.
# Results go to out/sanity_check (the inference cell loads final_model.pt from there).
!python train.py \
    --data_dir data/sanity_check \
    --out_dir out/sanity_check \
    --n_layer 1 \
    --n_embd 32 \
    --n_head 4 \
    --max_steps 1000 \
    --log_interval 10 \
    --eval_interval 100

Loading data from data/sanity_check
Vocabulary size: 15
Initializing 1-layer model
number of parameters: 0.01M
num decayed parameter tensors: 6, with 13,792 parameters
num non-decayed parameter tensors: 3, with 96 parameters
using fused AdamW: True
  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
Starting training for 1000 steps
Step 0: train loss 2.7670, train acc 0.0000
Step 0: val loss 2.7379, val acc 0.0000
Step 10: train loss 2.5085, train acc 0.1364
Step 20: train loss 2.3168, train acc 0.5000
Step 30: train loss 2.1729, train acc 0.9545
Step 40: train loss 2.0515, train acc 1.0000
Step 50: train loss 1.9524, train acc 1.0000
Step 60: train loss 1.8636, train acc 1.0000
Step 70: train loss 1.7805, train acc 1.0000
Step 80: train loss 1.6967, train acc 1.0000
Step 90: train loss 1.6186, train acc 1.0000
Step 100: train loss 1.5380, train acc 1.0000
Step 100: val loss 1.5305, val acc 1.0000
Step 110: train loss 1.4602, train acc 1.0000
Step 120: train loss 1.3835,

In [None]:
# Same sanity-check configuration as the unmasked run, but with --mask_first_n 3 and a
# separate output directory (out/sanity_check_masked).
# NOTE(review): --mask_first_n presumably excludes the first 3 tokens of each sequence
# from the loss — confirm against train.py.
!python train.py \
    --data_dir data/sanity_check \
    --out_dir out/sanity_check_masked \
    --n_layer 1 \
    --n_embd 32 \
    --n_head 4 \
    --max_steps 1000 \
    --log_interval 10 \
    --eval_interval 100 \
    --mask_first_n 3

Loading data from data/sanity_check
Vocabulary size: 15
Initializing 1-layer model
number of parameters: 0.01M
num decayed parameter tensors: 6, with 13,792 parameters
num non-decayed parameter tensors: 3, with 96 parameters
using fused AdamW: True
  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
Starting training for 1000 steps
Step 0: train loss 2.7633, train acc 0.0000
Step 0: val loss 2.7344, val acc 0.0000
Step 10: train loss 2.4984, train acc 0.1000
Step 20: train loss 2.3031, train acc 0.5500
Step 30: train loss 2.1609, train acc 0.9500
Step 40: train loss 2.0457, train acc 1.0000
Step 50: train loss 1.9488, train acc 1.0000
Step 60: train loss 1.8613, train acc 1.0000
Step 70: train loss 1.7789, train acc 1.0000
Step 80: train loss 1.6980, train acc 1.0000
Step 90: train loss 1.6180, train acc 1.0000
Step 100: train loss 1.5383, train acc 1.0000
Step 100: val loss 1.5309, val acc 1.0000
Step 110: train loss 1.4609, train acc 1.0000
Step 120: train loss 1.3828,

In [None]:
# Sample from the unmasked sanity-check model: prompt "I", up to 22 new tokens,
# temperature 0.1.
!python inference.py \
    --checkpoint out/sanity_check/final_model.pt \
    --prompts "I" \
    --max_new_tokens 22 \
    --temperature 0.1

Loading checkpoint from out/sanity_check/final_model.pt
Loaded tokenizer with vocabulary size: 15
number of parameters: 0.01M
Loaded model with 1 layers, 32 dimensions

Prompt 1: 'I'
Generated: I love machine learning
--------------------------------------------------

Inference completed!


In [None]:
# Sample from the masked sanity-check model: prompt "I l", up to 20 new tokens,
# temperature 0.1.
# NOTE(review): the 3-character prompt presumably mirrors --mask_first_n 3 used in
# training — confirm.
!python inference.py \
    --checkpoint out/sanity_check_masked/final_model.pt \
    --prompts "I l" \
    --max_new_tokens 20 \
    --temperature 0.1

Loading checkpoint from out/sanity_check_masked/final_model.pt
Loaded tokenizer with vocabulary size: 15
number of parameters: 0.01M
Loaded model with 1 layers, 32 dimensions

Prompt 1: 'I l'
Generated: I love machine learning
--------------------------------------------------

Inference completed!


In [26]:
# Generate the modular-arithmetic datasets for add, subtract and divide with moduli 97
# and 113. Each operation/modulus pair gets its own train/val/test split under
# data/algorithmic/<operation>_mod<modulus>/.
!python generate_data.py \
    --operations add,subtract,divide \
    --moduli 97,113 \
    --output_dir data/algorithmic


Generating add data with modulus 97
Saved 6586 equations to data/algorithmic/add_mod97/train.txt
Saved 1411 equations to data/algorithmic/add_mod97/val.txt
Saved 1412 equations to data/algorithmic/add_mod97/test.txt
Total equations: 9409
Train: 6586, Val: 1411, Test: 1412
Example equations:
  73+93=69
  15+74=89
  75+47=25
  89+93=85
  76+27=6

Generating add data with modulus 113
Saved 8938 equations to data/algorithmic/add_mod113/train.txt
Saved 1915 equations to data/algorithmic/add_mod113/val.txt
Saved 1916 equations to data/algorithmic/add_mod113/test.txt
Total equations: 12769
Train: 8938, Val: 1915, Test: 1916
Example equations:
  71+4=75
  99+97=83
  7+59=66
  102+16=5
  1+42=43

Generating subtract data with modulus 97
Saved 6586 equations to data/algorithmic/subtract_mod97/train.txt
Saved 1411 equations to data/algorithmic/subtract_mod97/val.txt
Saved 1412 equations to data/algorithmic/subtract_mod97/test.txt
Total equations: 9409
Train: 6586, Val: 1411, Test: 1412
Example e

In [28]:
# add_mod97, 1-layer model: one training run per seed (42, 123, 456), each writing to
# out/add_mod97_layer1_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/add_mod97 --out_dir out/add_mod97_layer1_seed${seed} --seed $seed --n_layer 1 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

Running experiment with seed 42
Loading data from data/algorithmic/add_mod97
Vocabulary size: 102
Initializing 1-layer model
number of parameters: 0.21M
num decayed parameter tensors: 6, with 213,760 parameters
num non-decayed parameter tensors: 3, with 384 parameters
using fused AdamW: True
  scaler = torch.cuda.amp.GradScaler(enabled=(dtype == 'float16'))
Starting training for 100000 steps
Step 0: train loss 4.7021, train acc 0.0000
Step 0: val loss 4.6801, val acc 0.0054
Step 100: train loss 4.6313, train acc 0.0156
Step 200: train loss 4.5825, train acc 0.0000
Step 300: train loss 4.5933, train acc 0.0000
Step 400: train loss 4.5830, train acc 0.0000
Step 500: train loss 4.6040, train acc 0.0156
Step 600: train loss 4.5840, train acc 0.0000
Step 700: train loss 4.5806, train acc 0.0156
Step 800: train loss 4.5762, train acc 0.0312
Step 900: train loss 4.5781, train acc 0.0156
Step 1000: train loss 4.5830, train acc 0.0000
Step 1000: val loss 4.5859, val acc 0.0061
Step 1100: train 

In [27]:
# add_mod97, 2-layer model: one training run per seed (42, 123, 456), each writing to
# out/add_mod97_layer2_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/add_mod97 --out_dir out/add_mod97_layer2_seed${seed} --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

Running experiment with seed 42
^C
Traceback (most recent call last):
  File "/home/xsling/CSE/599s/train.py", line 452, in <module>
    args = parse_args()
           ^^^^^^^^^^^^
  File "/home/xsling/CSE/599s/train.py", line 445, in parse_args
    parser.add_argument('--device', type=str, default='cuda' if torch.cuda.is_available() else 'cpu', help='Device to use')
                                                                ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/xsling/.miniforge3/envs/599s/lib/python3.12/site-packages/torch/cuda/__init__.py", line 174, in is_available
    return torch._C._cuda_getDeviceCount() > 0
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


In [13]:
# add_mod113, 1-layer model: one training run per seed (42, 123, 456), each writing to
# out/add_mod113_layer1_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/add_mod113 --out_dir out/add_mod113_layer1_seed${seed} --seed $seed --n_layer 1 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [14]:
# add_mod113, 2-layer model: one training run per seed (42, 123, 456), each writing to
# out/add_mod113_layer2_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/add_mod113 --out_dir out/add_mod113_layer2_seed${seed} --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [15]:
# subtract_mod97, 1-layer model: one training run per seed (42, 123, 456), each writing
# to out/subtract_mod97_layer1_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/subtract_mod97 --out_dir out/subtract_mod97_layer1_seed${seed} --seed $seed --n_layer 1 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [16]:
# subtract_mod97, 2-layer model: one training run per seed (42, 123, 456), each writing
# to out/subtract_mod97_layer2_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/subtract_mod97 --out_dir out/subtract_mod97_layer2_seed${seed} --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [17]:
# subtract_mod113, 1-layer model: one training run per seed (42, 123, 456), each writing
# to out/subtract_mod113_layer1_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/subtract_mod113 --out_dir out/subtract_mod113_layer1_seed${seed} --seed $seed --n_layer 1 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [18]:
# subtract_mod113, 2-layer model: one training run per seed (42, 123, 456), each writing
# to out/subtract_mod113_layer2_seed<seed>.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/subtract_mod113 --out_dir out/subtract_mod113_layer2_seed${seed} --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [19]:
# divide_mod97, 2-layer model, batch size 64: one run per seed (42, 123, 456), each
# writing to out/divide_mod97_layer2_seed<seed>_batch64. Unlike the add/subtract runs,
# this passes --weight_decay 1.0 and --beta1 0.9 / --beta2 0.98 explicitly.
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/divide_mod97 --out_dir out/divide_mod97_layer2_seed${seed}_batch64 --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 64 --max_steps 100000 --learning_rate 1e-3 --weight_decay 1.0 --beta1 0.9 --beta2 0.98 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [20]:
# divide_mod97, 2-layer model, batch size 16: one run per seed (42, 123, 456), each
# writing to out/divide_mod97_layer2_seed<seed>_batch16. Other flags match the
# batch-64 divide run (weight_decay 1.0, beta1 0.9, beta2 0.98).
# NOTE(review): IPython expands `$name`/`{name}` from the kernel namespace before the
# shell runs, so this relies on no Python variable named `seed` being defined — confirm.
!for seed in 42 123 456; do echo "Running experiment with seed $seed"; python train.py --data_dir data/algorithmic/divide_mod97 --out_dir out/divide_mod97_layer2_seed${seed}_batch16 --seed $seed --n_layer 2 --n_embd 128 --n_head 4 --batch_size 16 --max_steps 100000 --learning_rate 1e-3 --weight_decay 1.0 --beta1 0.9 --beta2 0.98 --log_interval 100 --eval_interval 1000; done

<AsyncResult(%px): pending>

In [21]:
# divide_mod97, 2-layer model, batch size 256, seed 42 only; output goes to
# out/divide_mod97_layer2_seed42_batch256. Other flags match the batch-64/16 divide runs.
!python train.py --data_dir data/algorithmic/divide_mod97 --out_dir out/divide_mod97_layer2_seed42_batch256 --seed 42 --n_layer 2 --n_embd 128 --n_head 4 --batch_size 256 --max_steps 100000 --learning_rate 1e-3 --weight_decay 1.0 --beta1 0.9 --beta2 0.98 --log_interval 100 --eval_interval 1000

IndexError: No such engine: 10

In [None]:
# divide_mod97, 2-layer model, batch size 512, seed 42 only; output goes to
# out/divide_mod97_layer2_seed42_batch512. Other flags match the batch-64/16 divide runs.
!python train.py --data_dir data/algorithmic/divide_mod97 --out_dir out/divide_mod97_layer2_seed42_batch512 --seed 42 --n_layer 2 --n_embd 128 --n_head 4 --batch_size 512 --max_steps 100000 --learning_rate 1e-3 --weight_decay 1.0 --beta1 0.9 --beta2 0.98 --log_interval 100 --eval_interval 1000

In [None]:
# Round-trip demo for the number-level tokenizer defined in train.py
from train import NumberTokenizer

# Sample equations, one each for +, - and / (no special tokens needed)
test_equations = ["12+24=36", "100-57=43", "8/2=4"]

# Build the tokenizer from the sample equations
tokenizer = NumberTokenizer(test_equations)

# Summary header: vocabulary size, the token list, then a blank line and section title
print(
    f"Vocabulary size: {tokenizer.vocab_size}",
    f"Tokens: {tokenizer.tokens}",
    "\nTokenization examples:",
    sep="\n",
)

for eq in test_equations:
    encoded = tokenizer.encode(eq)
    decoded = tokenizer.decode(encoded)
    # A single newline-separated print emits the same four lines per equation:
    # the original text, its ids, the ids mapped back to token strings, and the decode.
    print(
        f"\nOriginal: {eq}",
        f"Encoded: {encoded}",
        f"Tokens: {[tokenizer.idx_to_token[idx] for idx in encoded]}",
        f"Decoded: {decoded}",
        sep="\n",
    )
