In [19]:
import torch
# Define device first
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device: {torch.cuda.get_device_name()}")

PyTorch version: 2.5.1+cu118
CUDA available: True
GPU device: NVIDIA GeForce RTX 4080 SUPER


In [20]:
username = str('alexli76')
key = str('01b753e09563c67314e7c90d9fd4f6a8')

In [21]:
# All imports
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import os
import shutil  # Add this import
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from datetime import datetime
import time
from torch.optim.lr_scheduler import ReduceLROnPlateau
from tqdm import tqdm
from datetime import datetime

In [22]:
# Download dataset from Kaggle
import opendatasets as od
import pandas

# Create .kaggle directory if it doesn't exist
os.makedirs(os.path.expanduser('~/.kaggle'), exist_ok=True)

try:
    kaggle_token = {
        "username": username,
        "key": key
    }
    
    # Verify the data is string
    print("\nVerifying credentials format:")
    print(f"Username type: {type(username)}")
    print(f"Key type: {type(key)}")
    
    with open(os.path.expanduser('~/.kaggle/kaggle.json'), 'w') as f:
        json.dump(kaggle_token, f)
    
    print("Kaggle credentials saved successfully")
except Exception as e:
    print(f"Error saving Kaggle credentials: {e}")
    raise

# Set permissions
os.chmod(os.path.expanduser('~/.kaggle/kaggle.json'), 0o600)

# Download dataset
dataset_url = "https://www.kaggle.com/datasets/sagyamthapa/handwritten-math-symbols"
od.download(dataset_url)

TypeError: Object of type bytes is not JSON serializable

In [16]:
# Extract dataset from archive with more detailed logging
print("\nStarting extraction process...")
dataset_path = "./handwritten-math-symbols"
archive_path = f"{dataset_path}.zip"
print(f"Looking for archive at: {archive_path}")
print(f"Archive exists: {os.path.exists(archive_path)}")

if os.path.exists(archive_path):
    import zipfile
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        # List contents of zip file
        print("\nContents of zip file:")
        for file in zip_ref.namelist()[:10]:  # Show first 10 files
            print(f"- {file}")
        print("...")
        
        zip_ref.extractall("./")
    print("Dataset extracted successfully")

def verify_directory_structure():
    """Verify and print the directory structure"""
    print("\nVerifying directory structure:")
    for root, dirs, files in os.walk('./data'):
        print(f"\nDirectory: {root}")
        print(f"Subdirectories: {dirs}")
        print(f"Number of files: {len(files)}")

# First organize the data
print("\nStarting file organization...")
source_dir = "./handwritten-math-symbols/dataset"
print(f"Looking for source directory at: {source_dir}")
print(f"Source directory exists: {os.path.exists(source_dir)}")

if not os.path.exists(source_dir):
    raise FileNotFoundError(f"Source directory {source_dir} not found!")

# Create destination directories
os.makedirs('./data/digits', exist_ok=True)
os.makedirs('./data/operators', exist_ok=True)

# Define which folders belong to digits and operators
digit_folders = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
operator_folders = ['add', 'sub', 'mul', 'div', 'eq', 'dec', 'x', 'y', 'z']

# Move digit folders
print("\nMoving digit folders...")
for folder in digit_folders:
    src = os.path.join(source_dir, folder)
    dst = os.path.join('./data/digits', folder)
    if os.path.exists(src):
        print(f"Copying {src} to {dst}")
        shutil.copytree(src, dst, dirs_exist_ok=True)
    else:
        print(f"Warning: Source folder not found: {src}")

# Move operator folders
print("\nMoving operator folders...")
for folder in operator_folders:
    src = os.path.join(source_dir, folder)
    dst = os.path.join('./data/operators', folder)
    if os.path.exists(src):
        print(f"Copying {src} to {dst}")
        shutil.copytree(src, dst, dirs_exist_ok=True)
    else:
        print(f"Warning: Source folder not found: {src}")

# Verify the directory structure
verify_directory_structure()

# Verify that we have data in the folders
print("\nVerifying data in folders:")
digit_path = './data/digits'
operator_path = './data/operators'

if not os.path.exists(digit_path) or not os.path.exists(operator_path):
    raise FileNotFoundError("Data directories not created properly!")

digit_classes = sorted(os.listdir(digit_path))
operator_classes = sorted(os.listdir(operator_path))

print(f"\nFound digit classes: {digit_classes}")
print(f"Found operator classes: {operator_classes}")

if not digit_classes:
    raise FileNotFoundError("No digit classes found!")
if not operator_classes:
    raise FileNotFoundError("No operator classes found!")



Starting extraction process...
Looking for archive at: ./handwritten-math-symbols.zip
Archive exists: False

Starting file organization...
Looking for source directory at: ./handwritten-math-symbols/dataset
Source directory exists: True

Moving digit folders...
Copying ./handwritten-math-symbols/dataset\0 to ./data/digits\0
Copying ./handwritten-math-symbols/dataset\1 to ./data/digits\1
Copying ./handwritten-math-symbols/dataset\2 to ./data/digits\2
Copying ./handwritten-math-symbols/dataset\3 to ./data/digits\3
Copying ./handwritten-math-symbols/dataset\4 to ./data/digits\4
Copying ./handwritten-math-symbols/dataset\5 to ./data/digits\5
Copying ./handwritten-math-symbols/dataset\6 to ./data/digits\6
Copying ./handwritten-math-symbols/dataset\7 to ./data/digits\7
Copying ./handwritten-math-symbols/dataset\8 to ./data/digits\8
Copying ./handwritten-math-symbols/dataset\9 to ./data/digits\9

Moving operator folders...
Copying ./handwritten-math-symbols/dataset\add to ./data/operators\ad

In [17]:
# Create transforms
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# Create datasets with error handling
try:
    digit_dataset = torchvision.datasets.ImageFolder(
        root='./data/digits',
        transform=transform
    )
    print(f"\nDigit dataset created successfully with {len(digit_dataset)} images")
except Exception as e:
    print(f"Error creating digit dataset: {e}")
    raise

try:
    operator_dataset = torchvision.datasets.ImageFolder(
        root='./data/operators',
        transform=transform
    )
    print(f"Operator dataset created successfully with {len(operator_dataset)} images")
except Exception as e:
    print(f"Error creating operator dataset: {e}")
    raise

# Create dataloaders
digit_trainloader = torch.utils.data.DataLoader(
    digit_dataset, 
    batch_size=32,
    shuffle=True, 
    num_workers=2
    )

operator_trainloader = torch.utils.data.DataLoader(
    operator_dataset, 
    batch_size=32,
    shuffle=True, 
    num_workers=2
    )

print("\nDatasets and dataloaders created successfully!")
print(f"Number of digit classes: {len(digit_dataset.classes)}")
print(f"Number of operator classes: {len(operator_dataset.classes)}")



Digit dataset created successfully with 5304 images
Operator dataset created successfully with 4767 images

Datasets and dataloaders created successfully!
Number of digit classes: 10
Number of operator classes: 9


In [18]:
# Define the classes based on the folders
digit_classes = sorted(os.listdir('./data/digits'))
operator_classes = sorted(os.listdir('./data/operators'))

# Define network architectures first
class DigitNet(nn.Module):
    def __init__(self):
        super(DigitNet, self).__init__()
        # Input: 1x32x32
        self.conv1 = nn.Conv2d(1, 6, 5)  # Output: 6x28x28
        self.pool = nn.MaxPool2d(2, 2)    # Output: 6x14x14
        self.conv2 = nn.Conv2d(6, 16, 5)  # Output: 16x10x10
        # After second pooling: 16x5x5
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, len(digit_classes))

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

class OperatorNet(nn.Module):
    def __init__(self):
        super(OperatorNet, self).__init__()
        # Input: 1x32x32
        self.conv1 = nn.Conv2d(1, 6, 5)  # Output: 6x28x28
        self.pool = nn.MaxPool2d(2, 2)    # Output: 6x14x14
        self.conv2 = nn.Conv2d(6, 16, 5)  # Output: 16x10x10
        # After second pooling: 16x5x5
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, len(operator_classes))

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
    
# Initialize networks on CPU first
digit_net = DigitNet()
operator_net = OperatorNet()

# Diagnostic code to check dimensions and classes
print("\nDiagnostic Information:")
print(f"Digit classes: {digit_classes}")
print(f"Number of digit classes: {len(digit_classes)}")
print(f"DigitNet output dimension: {digit_net.fc3.out_features}")

print(f"Operator classes: {operator_classes}")
print(f"Number of operator classes: {len(operator_classes)}")
print(f"OperatorNet output dimension: {operator_net.fc3.out_features}")

# Verify network structures before moving to GPU
print("\nNetwork Architecture Check:")
print("DigitNet:")
print(f"Input -> Conv1 (1->6) -> Pool -> Conv2 (6->16) -> Pool -> FC1 (400->120) -> FC2 (120->84) -> FC3 (84->{len(digit_classes)})")
print("\nOperatorNet:")
print(f"Input -> Conv1 (1->6) -> Pool -> Conv2 (6->16) -> Pool -> FC1 (400->120) -> FC2 (120->84) -> FC3 (84->{len(operator_classes)})")

# Try moving to GPU with error handling
try:
    print("\nMoving networks to GPU...")
    digit_net = digit_net.to(device)
    operator_net = operator_net.to(device)
    print("Successfully moved networks to GPU")
except RuntimeError as e:
    print(f"Error moving networks to GPU: {e}")
    print("Falling back to CPU")
    device = torch.device("cpu")
    digit_net = digit_net.to(device)
    operator_net = operator_net.to(device)

# Create optimizers after moving networks to device
digit_optimizer = optim.SGD(digit_net.parameters(), lr=0.001, momentum=0.9)
operator_optimizer = optim.SGD(operator_net.parameters(), lr=0.001, momentum=0.9)

# Now verify data and label shapes
print("\nVerifying data shapes:")
sample_digit_batch = next(iter(digit_trainloader))
sample_operator_batch = next(iter(operator_trainloader))

print(f"Digit batch - Images: {sample_digit_batch[0].shape}, Labels: {sample_digit_batch[1].shape}")
print(f"Operator batch - Images: {sample_operator_batch[0].shape}, Labels: {sample_operator_batch[1].shape}")

# Check label distributions
print("\nLabel distributions:")
print(f"Digit labels unique values: {torch.unique(sample_digit_batch[1])}")
print(f"Operator labels unique values: {torch.unique(sample_operator_batch[1])}")

# Verify network output dimensions match number of classes
with torch.no_grad():
    digit_out = digit_net(sample_digit_batch[0].to(device))
    operator_out = operator_net(sample_operator_batch[0].to(device))
    print("\nNetwork output dimensions:")
    print(f"DigitNet output: {digit_out.shape}")
    print(f"OperatorNet output: {operator_out.shape}")



Diagnostic Information:
Digit classes: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
Number of digit classes: 10
DigitNet output dimension: 10
Operator classes: ['add', 'dec', 'div', 'eq', 'mul', 'sub', 'x', 'y', 'z']
Number of operator classes: 9
OperatorNet output dimension: 9

Network Architecture Check:
DigitNet:
Input -> Conv1 (1->6) -> Pool -> Conv2 (6->16) -> Pool -> FC1 (400->120) -> FC2 (120->84) -> FC3 (84->10)

OperatorNet:
Input -> Conv1 (1->6) -> Pool -> Conv2 (6->16) -> Pool -> FC1 (400->120) -> FC2 (120->84) -> FC3 (84->9)

Moving networks to GPU...


NameError: name 'device' is not defined

In [1]:
# Training parameters
num_epochs = 50
eval_interval = 5
early_stopping_patience = 15
best_digit_accuracy = 0
best_operator_accuracy = 0
epochs_without_improvement = 0

# Get current date for logging
training_date = datetime.now().strftime("%Y-%m-%d")
start_time = time.time()

# Add learning rate scheduler
digit_scheduler = ReduceLROnPlateau(digit_optimizer, mode='max', factor=0.1, patience=5, verbose=True)
operator_scheduler = ReduceLROnPlateau(operator_optimizer, mode='max', factor=0.1, patience=5, verbose=True)

# Diagnostic code to check dimensions and classes
print("\nDiagnostic Information:")

# Check digit dataset
digit_classes = sorted(os.listdir('./data/digits'))
print(f"Digit classes: {digit_classes}")
print(f"Number of digit classes: {len(digit_classes)}")
print(f"DigitNet output dimension: {digit_net.fc3.out_features}")

# Check operator dataset
operator_classes = sorted(os.listdir('./data/operators'))
print(f"Operator classes: {operator_classes}")
print(f"Number of operator classes: {len(operator_classes)}")
print(f"OperatorNet output dimension: {operator_net.fc3.out_features}")

# Check a batch of data
digit_batch = next(iter(digit_trainloader))
operator_batch = next(iter(operator_trainloader))

print("\nBatch shapes:")
print(f"Digit batch - Images: {digit_batch[0].shape}, Labels: {digit_batch[1].shape}")
print(f"Operator batch - Images: {operator_batch[0].shape}, Labels: {operator_batch[1].shape}")

print("\nLabel ranges:")
print(f"Digit labels: {digit_batch[1].min().item()} to {digit_batch[1].max().item()}")
print(f"Operator labels: {operator_batch[1].min().item()} to {operator_batch[1].max().item()}")


# Add criterion definitions before training
digit_criterion = nn.CrossEntropyLoss()
operator_criterion = nn.CrossEntropyLoss()

# Add dataset class mapping
digit_dataset.class_to_idx  # Check the mapping of classes to indices
operator_dataset.class_to_idx  # Check the mapping of classes to indices

# Before training loop, add diagnostic prints
print("\nClass mappings:")
print(f"Digit classes: {digit_dataset.class_to_idx}")
print(f"Operator classes: {operator_dataset.class_to_idx}")

# Training loop
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    
    # Training digit network
    digit_net.train()
    digit_running_loss = 0.0
    epoch_start = time.time()
    
    # Add progress bar
    pbar = tqdm(digit_trainloader, desc=f"Training Digit Net")
    for i, data in enumerate(pbar):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        digit_optimizer.zero_grad()
        outputs = digit_net(inputs)
        loss = digit_criterion(outputs, labels)
        loss.backward()
        digit_optimizer.step()
        
        digit_running_loss += loss.item()
        if i % 100 == 99:
            pbar.set_postfix({'loss': digit_running_loss/100})
            digit_running_loss = 0.0
    
    # Training operator network
    operator_net.train()
    operator_running_loss = 0.0
    
    pbar = tqdm(operator_trainloader, desc=f"Training Operator Net")
    for i, data in enumerate(pbar):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        
        operator_optimizer.zero_grad()
        outputs = operator_net(inputs)
        loss = operator_criterion(outputs, labels)
        loss.backward()
        operator_optimizer.step()
        
        operator_running_loss += loss.item()
        if i % 100 == 99:
            pbar.set_postfix({'loss': operator_running_loss/100})
            operator_running_loss = 0.0
    
    # Evaluation every eval_interval epochs
    if epoch % eval_interval == 0:
        digit_net.eval()
        operator_net.eval()
        
        # Evaluate digit network
        correct = 0
        total = 0
        with torch.no_grad():
            for data in digit_trainloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = digit_net(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        current_digit_accuracy = 100 * correct / total
        print(f"\nDigit Network Accuracy: {current_digit_accuracy:.2f}%")
        
        # Evaluate operator network
        correct = 0
        total = 0
        with torch.no_grad():
            for data in operator_trainloader:
                images, labels = data
                images, labels = images.to(device), labels.to(device)
                outputs = operator_net(images)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        
        current_operator_accuracy = 100 * correct / total
        print(f"Operator Network Accuracy: {current_operator_accuracy:.2f}%")
        
        # Update schedulers
        digit_scheduler.step(current_digit_accuracy)
        operator_scheduler.step(current_operator_accuracy)
        
        # Check for improvement
        if current_digit_accuracy > best_digit_accuracy or current_operator_accuracy > best_operator_accuracy:
            best_digit_accuracy = max(best_digit_accuracy, current_digit_accuracy)
            best_operator_accuracy = max(best_operator_accuracy, current_operator_accuracy)
            torch.save(digit_net.state_dict(), 'digit_net_best.pth')
            torch.save(operator_net.state_dict(), 'operator_net_best.pth')
            epochs_without_improvement = 0
            print("New best accuracy! Saved models.")
        else:
            epochs_without_improvement += eval_interval
        
        # Early stopping check
        if epochs_without_improvement >= early_stopping_patience:
            print("\nEarly stopping triggered!")
            break
    
    # Print epoch summary
    epoch_time = time.time() - epoch_start
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    print(f"Time taken: {epoch_time:.2f} seconds")
    print(f"Best Digit Accuracy: {best_digit_accuracy:.2f}%")
    print(f"Best Operator Accuracy: {best_operator_accuracy:.2f}%")

print('Finished Training')
# Save final weights
print("\nSaving final model weights...")
try:
    final_digit_path = 'digit_net_final.pth'
    final_operator_path = 'operator_net_final.pth'
    
    torch.save(digit_net.state_dict(), final_digit_path)
    torch.save(operator_net.state_dict(), final_operator_path)
    
    # Verify files were saved
    if os.path.exists(final_digit_path) and os.path.exists(final_operator_path):
        print(f"Final weights saved successfully:")
        print(f"- Digit network: {final_digit_path}")
        print(f"- Operator network: {final_operator_path}")
    else:
        print("Warning: Weight files not found after saving!")
except Exception as e:
    print(f"Error saving final weights: {e}")

# Also verify best weights were saved
best_digit_path = 'digit_net_best.pth'
best_operator_path = 'operator_net_best.pth'

if os.path.exists(best_digit_path) and os.path.exists(best_operator_path):
    print("\nBest weights were saved during training:")
    print(f"- Best digit network: {best_digit_path}")
    print(f"- Best operator network: {best_operator_path}")
else:
    print("\nWarning: Best weight files not found!")

NameError: name 'datetime' is not defined