In [1]:
print("="*60)
print("STEP 1: DOWNLOADING GOOGLE FONTS")
print("="*60)

# Install required packages
print("\nüì¶ Installing required packages...")
!pip install fonttools requests pillow

import os
import glob
from pathlib import Path

# Create directories
print("\nüìÅ Creating directories...")
fonts_dir = Path("google_fonts_repo")
fonts_dir.mkdir(exist_ok=True)

print("\nüì• Cloning Google Fonts repository...")
print("   (This may take 2-3 minutes - it's ~500MB)")

# Clone the Google Fonts GitHub repo
!git clone --depth 1 https://github.com/google/fonts.git google_fonts_repo

print("\n‚úÖ Google Fonts repository downloaded!")

# Analyze what we got
print("\nüìä Analyzing downloaded fonts...")

# Find all .ttf files
font_files = glob.glob("google_fonts_repo/**/*.ttf", recursive=True)
print(f"\n   Total .ttf files found: {len(font_files)}")

# Separate regular fonts from variable fonts
regular_fonts = [f for f in font_files if "[" not in f]
variable_fonts = [f for f in font_files if "[" in f]

print(f"   Regular fonts: {len(regular_fonts)}")
print(f"   Variable fonts: {len(variable_fonts)}")

# Get unique font families
font_families = set()
for font_path in regular_fonts:
    # Extract font family name from path
    parts = Path(font_path).parts
    if len(parts) >= 3:
        font_families.add(parts[-2])

print(f"   Unique font families: {len(font_families)}")

# Show some examples
print("\nüìù Sample fonts:")
for i, font_path in enumerate(regular_fonts[:10]):
    font_name = Path(font_path).name
    print(f"   ‚Ä¢ {font_name}")

print(f"\n   ... and {len(regular_fonts) - 10} more")

print("\n‚úÖ Step 1 complete! Ready for Step 2 (generating training images)")

STEP 1: DOWNLOADING GOOGLE FONTS

üì¶ Installing required packages...

üìÅ Creating directories...

üì• Cloning Google Fonts repository...
   (This may take 2-3 minutes - it's ~500MB)
Cloning into 'google_fonts_repo'...
remote: Enumerating objects: 18985, done.[K
remote: Counting objects: 100% (18985/18985), done.[K
remote: Compressing objects: 100% (15371/15371), done.[K
remote: Total 18985 (delta 3909), reused 12827 (delta 3050), pack-reused 0 (from 0)[K
Receiving objects: 100% (18985/18985), 1.18 GiB | 14.30 MiB/s, done.
Resolving deltas: 100% (3909/3909), done.
Checking connectivity: 18985, done.
Updating files: 100% (16945/16945), done.

‚úÖ Google Fonts repository downloaded!

üìä Analyzing downloaded fonts...

   Total .ttf files found: 3817
   Regular fonts: 3083
   Variable fonts: 734
   Unique font families: 1448

üìù Sample fonts:
   ‚Ä¢ Aclonica-Regular.ttf
   ‚Ä¢ Calligraffitti-Regular.ttf
   ‚Ä¢ CherryCreamSoda-Regular.ttf
   ‚Ä¢ Chewy-Regular.ttf
   ‚Ä¢ ComingSo

In [2]:
# Section banner for Step 2 (rule, title, rule)
print("=" * 60, "STEP 2: GENERATING TRAINING IMAGES", "=" * 60, sep="\n")

from PIL import Image, ImageDraw, ImageFont
import numpy as np
from pathlib import Path
import random
import json

class GoogleFontsDatasetGenerator:
    """Render sample-text images for the static fonts in a Google Fonts checkout.

    Images are written to ``output_dir/<font family>/<font name>_<n>.png`` so
    the folder name can be used as the class label, and a
    ``dataset_info.json`` manifest describing every image is written next to
    them.
    """

    def __init__(self, fonts_dir="google_fonts_repo", output_dir="training_data"):
        """Store the input/output locations and the rendering settings.

        Args:
            fonts_dir: Directory searched recursively for ``.ttf`` files.
            output_dir: Directory that receives the images; created if missing.
        """
        self.fonts_dir = Path(fonts_dir)
        self.output_dir = Path(output_dir)
        # parents=True so a nested output path also works on a fresh machine
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Sample texts to render
        self.sample_texts = [
            "The quick brown fox jumps over the lazy dog",
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
            "abcdefghijklmnopqrstuvwxyz",
            "0123456789",
            "Hello World! Design Systems 2025",
            "Typography & Layout Patterns",
            "Beautiful Design Tokens",
            "Color Spacing Grid",
        ]

        # Image settings
        self.img_size = (320, 320)
        self.font_sizes = [28, 36, 44, 52, 60]

    @staticmethod
    def _family_name(font_path):
        """Return the family label (the containing directory name) for a font file.

        Files stored in a ``static`` sub-directory are labelled with the
        directory above it; otherwise every such font would be merged into a
        single bogus class called "static".
        """
        parent = Path(font_path).parent
        if parent.name == "static" and parent.parent.name:
            return parent.parent.name
        return parent.name

    def get_font_files(self):
        """Get all non-variable .ttf files from the fonts directory, sorted.

        Files with "[" in their name are skipped (variable fonts are filtered
        out for now). The result is sorted because glob order depends on the
        file system, and ``generate_dataset`` takes the first ``max_fonts``
        entries.
        """
        return sorted(
            f for f in self.fonts_dir.glob("**/*.ttf") if "[" not in f.name
        )

    def generate_image(self, font_path, text, font_size):
        """Generate a single training image.

        Args:
            font_path: Path of the ``.ttf`` file to render with.
            text: Text to draw.
            font_size: Size passed to ``ImageFont.truetype``.

        Returns:
            A white RGB image of ``self.img_size`` with ``text`` drawn in
            black and centered, or ``None`` if the font could not be loaded
            or rendered.
        """
        try:
            # Create white background
            img = Image.new('RGB', self.img_size, color='white')
            draw = ImageDraw.Draw(img)

            # Load font
            font = ImageFont.truetype(str(font_path), font_size)

            # Calculate text position (centered). The bounding box of text
            # drawn at (0, 0) generally does not start at (0, 0), so its
            # left/top offset is subtracted to center the visible ink rather
            # than the drawing origin.
            left, top, right, bottom = draw.textbbox((0, 0), text, font=font)
            x = (self.img_size[0] - (right - left)) // 2 - left
            y = (self.img_size[1] - (bottom - top)) // 2 - top

            # Draw text in black
            draw.text((x, y), text, font=font, fill='black')

            return img
        except Exception:
            # Deliberate best effort: some fonts fail to load or render, and
            # the caller records those in failed_fonts.txt.
            return None

    def generate_dataset(self, samples_per_font=10, max_fonts=None, seed=None):
        """Generate the full training dataset.

        Args:
            samples_per_font: Number of images rendered per font file.
            max_fonts: If given, only the first ``max_fonts`` font files (in
                sorted order) are used.
            seed: Optional seed for the text/size sampling. When ``None`` the
                module-level ``random`` state is used, as before.

        Returns:
            A list with one dict per generated image (``image_path``,
            ``font_name``, ``font_family``, ``text``, ``font_size``). The same
            list is written to ``dataset_info.json``; fonts that produced no
            image are listed in ``failed_fonts.txt``.
        """
        font_files = self.get_font_files()

        # `is not None` so that max_fonts=0 means "no fonts" rather than "all"
        if max_fonts is not None:
            font_files = font_files[:max_fonts]

        rng = random if seed is None else random.Random(seed)

        print(f"\nüé® Generating dataset from {len(font_files)} fonts...")
        print(f"   Creating {samples_per_font} samples per font")
        print(f"   Total target: {len(font_files) * samples_per_font} images\n")

        dataset_info = []
        total_generated = 0
        failed_fonts = []

        for font_idx, font_path in enumerate(font_files):
            # Get font name
            font_name = font_path.stem  # e.g., "Roboto-Regular"

            # Family name doubles as the class label / output folder
            font_family = self._family_name(font_path)

            # Create directory for this font family
            font_output_dir = self.output_dir / font_family
            font_output_dir.mkdir(exist_ok=True)

            # Generate multiple samples
            samples_generated = 0
            for sample_idx in range(samples_per_font):
                # Random text and size
                text = rng.choice(self.sample_texts)
                font_size = rng.choice(self.font_sizes)

                # Generate image
                img = self.generate_image(font_path, text, font_size)

                if img is not None:
                    # Save image
                    img_filename = f"{font_name}_{sample_idx}.png"
                    img_path = font_output_dir / img_filename
                    img.save(img_path)

                    # Record metadata
                    dataset_info.append({
                        'image_path': str(img_path),
                        'font_name': font_name,
                        'font_family': font_family,
                        'text': text,
                        'font_size': font_size
                    })

                    total_generated += 1
                    samples_generated += 1

            if samples_generated == 0:
                failed_fonts.append(str(font_path))

            # Progress update every 100 fonts
            if (font_idx + 1) % 100 == 0:
                print(f"   ‚úì Processed {font_idx + 1}/{len(font_files)} fonts ({total_generated} images generated)")

        print(f"\n‚úÖ Dataset generation complete!")
        print(f"   Total images generated: {total_generated}")
        print(f"   Unique font families: {len(set(info['font_family'] for info in dataset_info))}")
        print(f"   Failed fonts: {len(failed_fonts)}")

        # Save dataset metadata
        with open(self.output_dir / "dataset_info.json", 'w', encoding='utf-8') as f:
            json.dump(dataset_info, f, indent=2)

        if failed_fonts:
            # Explicit encoding: font paths are not guaranteed to be ASCII
            with open(self.output_dir / "failed_fonts.txt", 'w', encoding='utf-8') as f:
                f.write("\n".join(failed_fonts))

        return dataset_info

# Build a generator with its default input/output directories
generator = GoogleFontsDatasetGenerator()

# Announce the trial run: a 200-font subset keeps the first pass short
print("\n".join([
    "üöÄ Starting dataset generation...",
    "   Starting with 200 fonts to test (about 2,000 images)",
    "   This should take 3-5 minutes...\n",
]))

# 10 samples for each of the first 200 fonts; max_fonts can later be raised
# to cover all 3,083 fonts
dataset_info = generator.generate_dataset(samples_per_font=10, max_fonts=200)

# Closing summary
print("\n".join([
    "\nüìä Dataset ready!",
    "   Location: training_data/",
    "   Ready for Step 3: Model Training",
]))

STEP 2: GENERATING TRAINING IMAGES
üöÄ Starting dataset generation...
   Starting with 200 fonts to test (about 2,000 images)
   This should take 3-5 minutes...


üé® Generating dataset from 200 fonts...
   Creating 10 samples per font
   Total target: 2000 images

   ‚úì Processed 100/200 fonts (1000 images generated)
   ‚úì Processed 200/200 fonts (2000 images generated)

‚úÖ Dataset generation complete!
   Total images generated: 2000
   Unique font families: 105
   Failed fonts: 0

üìä Dataset ready!
   Location: training_data/
   Ready for Step 3: Model Training


In [4]:
print("="*60)
print("STEP 3: TRAINING THE MODEL")
print("="*60)

# Install transformers for training
print("\nüì¶ Installing training packages...")
!pip install transformers datasets accelerate scikit-learn

import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np

print("\nüîß Preparing dataset and model...")

# Load dataset from the images we generated
print("   Loading dataset from training_data/...")
dataset = load_dataset("imagefolder", data_dir="training_data")

print(f"\nüìä Dataset loaded:")
print(f"   Total samples: {len(dataset['train'])}")
print(f"   Number of font classes: {dataset['train'].features['label'].num_classes}")

# Get label information
labels = dataset['train'].features['label'].names
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

print(f"\nüìù Sample font families (first 10):")
for i, label in enumerate(labels[:10]):
    print(f"   {i}. {label}")
print(f"   ... and {len(labels) - 10} more")

# Load base model (ResNet-18 - same as gaborcselle used)
print("\nü§ñ Loading base model (ResNet-18)...")
model_name = "microsoft/resnet-18"
processor = AutoImageProcessor.from_pretrained(model_name)

# Create model for fine-tuning
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

print("   ‚úì Base model loaded")

# Preprocessing function
def preprocess(examples):
    """Preprocess images for training"""
    images = [img.convert("RGB") for img in examples['image']]
    inputs = processor(images, return_tensors='pt')
    inputs['labels'] = examples['label']
    return inputs

# Apply preprocessing
print("\nüîÑ Preprocessing images...")
dataset = dataset.map(preprocess, batched=True, remove_columns=['image'])
dataset.set_format('torch', columns=['pixel_values', 'labels'])

# Split into train/validation (80/20)
print("   Splitting into train/validation sets...")
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

print(f"   Training samples: {len(dataset['train'])}")
print(f"   Validation samples: {len(dataset['test'])}")

# Define metrics
def compute_metrics(eval_pred):
    """Calculate accuracy during training"""
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Training configuration (optimized for quick testing)
print("="*60)
print("STEP 3: TRAINING THE MODEL (CPU MODE)")
print("="*60)

# Install transformers for training
print("\nüì¶ Installing training packages...")
!pip install transformers datasets accelerate scikit-learn

import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import numpy as np

# Force CPU usage to avoid CUDA errors
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ''

print("\nüîß Preparing dataset and model...")
print("   Running on CPU for stability")

# Load dataset from the images we generated
print("   Loading dataset from training_data/...")
dataset = load_dataset("imagefolder", data_dir="training_data")

print(f"\nüìä Dataset loaded:")
print(f"   Total samples: {len(dataset['train'])}")
print(f"   Number of font classes: {dataset['train'].features['label'].num_classes}")

# Get label information
labels = dataset['train'].features['label'].names
id2label = {i: label for i, label in enumerate(labels)}
label2id = {label: i for i, label in enumerate(labels)}

print(f"\nüìù Sample font families (first 10):")
for i, label in enumerate(labels[:10]):
    print(f"   {i}. {label}")
print(f"   ... and {len(labels) - 10} more")

# Load base model (ResNet-18)
print("\nü§ñ Loading base model (ResNet-18)...")
model_name = "microsoft/resnet-18"
processor = AutoImageProcessor.from_pretrained(model_name)

# Create model for fine-tuning
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

print("   ‚úì Base model loaded")

# Preprocessing function
def preprocess(examples):
    """Preprocess images for training"""
    images = [img.convert("RGB") for img in examples['image']]
    inputs = processor(images, return_tensors='pt')
    inputs['labels'] = examples['label']
    return inputs

# Apply preprocessing
print("\nüîÑ Preprocessing images...")
dataset = dataset.map(preprocess, batched=True, remove_columns=['image'])
dataset.set_format('torch', columns=['pixel_values', 'labels'])

# Split into train/validation (80/20)
print("   Splitting into train/validation sets...")
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

print(f"   Training samples: {len(dataset['train'])}")
print(f"   Validation samples: {len(dataset['test'])}")

# Define metrics
def compute_metrics(eval_pred):
    """Calculate accuracy during training"""
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {"accuracy": accuracy_score(labels, predictions)}

# Training configuration - REDUCED for CPU
print("\n‚öôÔ∏è Configuring training for CPU...")
training_args = TrainingArguments(
    output_dir="./google_fonts_model_checkpoint",
    num_train_epochs=10,  # Reduced from 20 for CPU
    per_device_train_batch_size=8,  # Reduced from 16 for CPU
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    logging_steps=20,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    report_to="none",
    no_cuda=True,  # Force CPU usage
)

print(f"\nüìã Training Configuration:")
print(f"   Device: CPU")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Learning rate: {training_args.learning_rate}")
print(f"   Estimated time: 20-30 minutes on CPU")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

print("\nüöÄ Starting training...")
print("   This will take 20-30 minutes on CPU")
print("   You'll see progress updates every 20 steps\n")
print("="*60)

# Train!
trainer.train()

print("\n" + "="*60)
print("‚úÖ Training complete!")
print("="*60)

# Evaluate final performance
print("\nüìä Final Evaluation:")
eval_results = trainer.evaluate()
print(f"   Final Accuracy: {eval_results['eval_accuracy']:.2%}")
print(f"   Final Loss: {eval_results['eval_loss']:.4f}")

# Save the final model
print("\nüíæ Saving model...")
model.save_pretrained("./tesserae_google_fonts_model")
processor.save_pretrained("./tesserae_google_fonts_model")

print("\n‚úÖ Model saved to: ./tesserae_google_fonts_model")
print("\nüéâ Training complete! Ready for Step 4: Testing")

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

print("\nüöÄ Starting training...")
print("   This will take 15-25 minutes depending on GPU availability")
print("   You'll see progress updates every 20 steps\n")
print("="*60)

# Train!
trainer.train()

print("\n" + "="*60)
print("‚úÖ Training complete!")
print("="*60)

# Evaluate final performance
print("\nüìä Final Evaluation:")
eval_results = trainer.evaluate()
print(f"   Final Accuracy: {eval_results['eval_accuracy']:.2%}")
print(f"   Final Loss: {eval_results['eval_loss']:.4f}")

# Save the final model
print("\nüíæ Saving model...")
model.save_pretrained("./tesserae_google_fonts_model")
processor.save_pretrained("./tesserae_google_fonts_model")

print("\n‚úÖ Model saved to: ./tesserae_google_fonts_model")
print("\nüéâ Training complete! Ready for Step 4: Testing")

STEP 3: TRAINING THE MODEL

üì¶ Installing training packages...

üîß Preparing dataset and model...
   Loading dataset from training_data/...


Resolving data files:   0%|          | 0/2000 [00:00<?, ?it/s]


üìä Dataset loaded:
   Total samples: 2000
   Number of font classes: 105

üìù Sample font families (first 10):
   0. abeezee
   1. abel
   2. abhayalibre
   3. aboreto
   4. abrilfatface
   5. abyssinicasil
   6. aclonica
   7. acme
   8. actor
   9. adamina
   ... and 95 more

ü§ñ Loading base model (ResNet-18)...


Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([105]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([105, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Base model loaded

üîÑ Preprocessing images...
   Splitting into train/validation sets...
   Training samples: 1600
   Validation samples: 400
STEP 3: TRAINING THE MODEL (CPU MODE)

üì¶ Installing training packages...

üîß Preparing dataset and model...
   Running on CPU for stability
   Loading dataset from training_data/...


Resolving data files:   0%|          | 0/2000 [00:00<?, ?it/s]


üìä Dataset loaded:
   Total samples: 2000
   Number of font classes: 105

üìù Sample font families (first 10):
   0. abeezee
   1. abel
   2. abhayalibre
   3. aboreto
   4. abrilfatface
   5. abyssinicasil
   6. aclonica
   7. acme
   8. actor
   9. adamina
   ... and 95 more

ü§ñ Loading base model (ResNet-18)...


Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([105]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([105, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Base model loaded

üîÑ Preprocessing images...
   Splitting into train/validation sets...
   Training samples: 1600
   Validation samples: 400

‚öôÔ∏è Configuring training for CPU...

üìã Training Configuration:
   Device: CPU
   Epochs: 10
   Batch size: 8
   Learning rate: 5e-05
   Estimated time: 20-30 minutes on CPU

üöÄ Starting training...
   This will take 20-30 minutes on CPU
   You'll see progress updates every 20 steps



Epoch,Training Loss,Validation Loss,Accuracy
1,3.8469,3.5937,0.22
2,2.7428,2.64065,0.39
3,1.9879,2.05756,0.48
4,1.4512,1.673101,0.585
5,1.1278,1.459094,0.6225
6,0.8535,1.315969,0.655
7,0.6204,1.186008,0.675
8,0.5486,1.175683,0.6825
9,0.4809,1.104331,0.7075
10,0.3762,1.085691,0.7075



‚úÖ Training complete!

üìä Final Evaluation:


   Final Accuracy: 70.75%
   Final Loss: 1.1043

üíæ Saving model...

‚úÖ Model saved to: ./tesserae_google_fonts_model

üéâ Training complete! Ready for Step 4: Testing

üöÄ Starting training...
   This will take 15-25 minutes depending on GPU availability
   You'll see progress updates every 20 steps



Epoch,Training Loss,Validation Loss,Accuracy
1,0.5853,1.078112,0.7175
2,0.3818,1.113507,0.7175
3,0.2292,0.966514,0.75
4,0.1108,0.96519,0.7575
5,0.067,0.871591,0.76
6,0.1037,0.896884,0.755
7,0.0778,0.890424,0.76
8,0.0484,0.92472,0.7675
9,0.0152,0.853834,0.7775
10,0.047,0.866165,0.7775



‚úÖ Training complete!

üìä Final Evaluation:


   Final Accuracy: 77.75%
   Final Loss: 0.8538

üíæ Saving model...

‚úÖ Model saved to: ./tesserae_google_fonts_model

üéâ Training complete! Ready for Step 4: Testing


In [6]:
# Section banner for Step 4 (rule, title, rule)
print("=" * 60, "STEP 4: TESTING YOUR GOOGLE FONTS MODEL", "=" * 60, sep="\n")

from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch

print("\nü§ñ Loading your trained model...")

# Processor and classifier are both restored from the directory written at
# the end of Step 3
processor = AutoImageProcessor.from_pretrained("./tesserae_google_fonts_model")
model = AutoModelForImageClassification.from_pretrained("./tesserae_google_fonts_model")

print("‚úÖ Model loaded!")
# One id2label entry per font family the classifier can output
print("   Can identify {} font families".format(len(model.config.id2label)))

def identify_google_font(image_path, top_k=5):
    """Identify font using your custom Google Fonts model.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file to classify.
        top_k: Maximum number of predictions to return (default 5). It is
            capped at the number of classes the model outputs, so a model
            with fewer than ``top_k`` classes no longer makes ``topk`` raise.

    Returns:
        A list of ``{'font_family': str, 'confidence': float}`` dicts ordered
        from most to least likely; ``confidence`` is a softmax probability in
        the range 0-1.
    """
    # The context manager closes the file handle; convert() hands back an
    # image that stays usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the top predictions, never asking for more than the classes available
    k = min(top_k, probabilities.shape[-1])
    top_prob, top_idx = torch.topk(probabilities, k)

    results = []
    for prob, idx in zip(top_prob[0], top_idx[0]):
        results.append({
            'font_family': model.config.id2label[idx.item()],
            'confidence': prob.item()
        })

    return results

# Classify the sample image and report the ranked predictions
print("\nüß™ Testing with test4.png...")
print("="*60)

results = identify_google_font("test4.png")

print("\nüéØ GOOGLE FONTS DETECTION RESULTS:")
# Rank, family name padded to 30 columns, confidence as a percentage
for rank, prediction in enumerate(results, start=1):
    print("  {}. {:<30} {:>6.1%}".format(
        rank, prediction['font_family'], prediction['confidence']))

print("\n".join([
    "\n" + "="*60,
    "‚úÖ Testing complete!",
    "\nüí° Your model is ready to use in Tesserae!",
]))


STEP 4: TESTING YOUR GOOGLE FONTS MODEL

ü§ñ Loading your trained model...
‚úÖ Model loaded!
   Can identify 105 font families

üß™ Testing with test4.png...

üéØ GOOGLE FONTS DETECTION RESULTS:
  1. rochester                       11.9%
  2. craftygirls                      7.4%
  3. homemadeapple                    7.1%
  4. kosugi                           4.8%
  5. jsmathcmr10                      4.6%

‚úÖ Testing complete!

üí° Your model is ready to use in Tesserae!


In [7]:
# Section banner (rule, title, rule)
print("=" * 60, "REGENERATING DATASET - FULL SCALE", "=" * 60, sep="\n")

from pathlib import Path
import shutil

# Start from a clean slate: drop any previously generated images
print("\nüóëÔ∏è Removing old training data...")
training_dir = Path("training_data")
if training_dir.exists():
    shutil.rmtree(training_dir)
    print("   ‚úì Old data removed")

training_dir.mkdir(exist_ok=True)

# Fresh generator bound to the default directories
generator = GoogleFontsDatasetGenerator()

# Report how many font files are available to draw from
all_fonts = generator.get_font_files()
print("\nüìä Available fonts: {}".format(len(all_fonts)))

# Lay out the size/time trade-offs and the option chosen below
print("\n".join([
    "\n‚öôÔ∏è Generation options:",
    "   Option A: 500 fonts √ó 25 samples = 12,500 images (~15 min)",
    "   Option B: 1000 fonts √ó 25 samples = 25,000 images (~30 min)",
    "   Option C: ALL 3,083 fonts √ó 25 samples = 77,075 images (~90 min)",
    "\nüìù Recommendation: Start with Option B (1000 fonts)",
    "   This gives good coverage without taking forever\n",
]))

# Option B: 1000 fonts, 25 samples each
print("\n".join([
    "üöÄ Starting generation with 1000 fonts, 25 samples each...",
    "   This will take approximately 30 minutes",
    "   Estimated total: 25,000 images\n",
]))

# samples_per_font is up from 10 and max_fonts up from 200 in the first run
dataset_info = generator.generate_dataset(samples_per_font=25, max_fonts=1000)

print("\n".join([
    "\n‚úÖ New dataset ready!",
    "   Ready to retrain with much more data",
]))

REGENERATING DATASET - FULL SCALE

üóëÔ∏è Removing old training data...
   ‚úì Old data removed

üìä Available fonts: 3083

‚öôÔ∏è Generation options:
   Option A: 500 fonts √ó 25 samples = 12,500 images (~15 min)
   Option B: 1000 fonts √ó 25 samples = 25,000 images (~30 min)
   Option C: ALL 3,083 fonts √ó 25 samples = 77,075 images (~90 min)

üìù Recommendation: Start with Option B (1000 fonts)
   This gives good coverage without taking forever

üöÄ Starting generation with 1000 fonts, 25 samples each...
   This will take approximately 30 minutes
   Estimated total: 25,000 images


üé® Generating dataset from 1000 fonts...
   Creating 25 samples per font
   Total target: 25000 images

   ‚úì Processed 100/1000 fonts (2500 images generated)
   ‚úì Processed 200/1000 fonts (5000 images generated)
   ‚úì Processed 300/1000 fonts (7500 images generated)
   ‚úì Processed 400/1000 fonts (10000 images generated)
   ‚úì Processed 500/1000 fonts (12500 images generated)
   ‚úì Processed

In [8]:
# Section banner (rule, title, rule)
print("=" * 60, "RETRAINING MODEL WITH EXPANDED DATASET", "=" * 60, sep="\n")

import torch
from transformers import AutoImageProcessor, AutoModelForImageClassification, TrainingArguments, Trainer
from datasets import load_dataset
from sklearn.metrics import accuracy_score
import os

# Hide every CUDA device so the run stays on the CPU
os.environ['CUDA_VISIBLE_DEVICES'] = ''

print("\nüîß Loading expanded dataset...")
dataset = load_dataset("imagefolder", data_dir="training_data")

# Size of the new dataset, compared with the 2,000-image first attempt
print("\nüìä New Dataset Statistics:")
print("   Total samples: {}".format(len(dataset['train'])))
print("   Number of font classes: {}".format(dataset['train'].features['label'].num_classes))
print("   That's {:.1f}x more data than before!".format(len(dataset['train']) / 2000))

# Class names plus the index <-> name lookups stored in the model config
labels = dataset['train'].features['label'].names
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

# Start again from the pretrained checkpoint rather than the v1 weights
print("\nü§ñ Loading fresh ResNet-18 model...")
model_name = "microsoft/resnet-18"
processor = AutoImageProcessor.from_pretrained(model_name)

# ignore_mismatched_sizes allows a new classification head sized to len(labels)
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

print("   ‚úì Fresh model loaded")

# Preprocessing
def preprocess(examples):
    """Turn a batch of dataset rows into model inputs.

    Each entry of ``examples['image']`` is converted to RGB, the batch is run
    through the module-level ``processor`` (PyTorch tensors requested), and
    ``examples['label']`` is attached under the ``labels`` key.
    """
    rgb_images = []
    for picture in examples['image']:
        rgb_images.append(picture.convert("RGB"))

    batch = processor(rgb_images, return_tensors='pt')
    batch['labels'] = examples['label']
    return batch

print("\nüîÑ Preprocessing images...")
dataset = dataset.map(preprocess, batched=True, remove_columns=['image'])
dataset.set_format('torch', columns=['pixel_values', 'labels'])

# Split train/validation
print("   Splitting into train/validation sets...")
dataset = dataset['train'].train_test_split(test_size=0.2, seed=42)

print(f"   Training samples: {len(dataset['train'])}")
print(f"   Validation samples: {len(dataset['test'])}")

# Metrics
def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for one evaluation pass.

    ``eval_pred`` unpacks into (scores, reference labels); the predicted class
    is the index of the highest score along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = scores.argmax(axis=1)
    return dict(accuracy=accuracy_score(references, predicted_ids))

# Training configuration - increased epochs for better learning
print("\n‚öôÔ∏è Configuring training...")
training_args = TrainingArguments(
    output_dir="./google_fonts_model_v2_checkpoint",
    num_train_epochs=15,  # Increased from 10
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=5e-5,
    warmup_ratio=0.1,
    logging_steps=50,
    # Evaluate and checkpoint once per epoch so the best epoch (by accuracy)
    # can be restored when training ends
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    save_total_limit=2,
    report_to="none",
    use_cpu=True,  # Force CPU usage (replaces the deprecated no_cuda=True)
)

print(f"\nüìã Training Configuration:")
print(f"   Device: CPU")
print(f"   Epochs: {training_args.num_train_epochs}")
print(f"   Batch size: {training_args.per_device_train_batch_size}")
print(f"   Training samples: {len(dataset['train'])}")
print(f"   Estimated time: 60-90 minutes on CPU (larger dataset)")

# Create trainer (the 'test' split serves as the validation set)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    compute_metrics=compute_metrics,
)

print("\nüöÄ Starting retraining...")
print("   This will take 60-90 minutes with the larger dataset")
print("   You'll see progress updates every 50 steps")
print("   Go grab some coffee! ‚òï\n")
print("="*60)

# Train!
trainer.train()

print("\n" + "="*60)
print("‚úÖ Retraining complete!")
print("="*60)

# Evaluate
print("\nüìä Final Evaluation:")
eval_results = trainer.evaluate()
print(f"   Final Accuracy: {eval_results['eval_accuracy']:.2%}")
print(f"   Final Loss: {eval_results['eval_loss']:.4f}")

# Save the improved model
print("\nüíæ Saving improved model...")
model.save_pretrained("./tesserae_google_fonts_model_v2")
processor.save_pretrained("./tesserae_google_fonts_model_v2")

print("\n‚úÖ Improved model saved to: ./tesserae_google_fonts_model_v2")
print("   This should perform MUCH better than v1!")
print("\nüéâ Ready to test the improved model!")

RETRAINING MODEL WITH EXPANDED DATASET

üîß Loading expanded dataset...


Resolving data files:   0%|          | 0/25000 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/25000 [00:00<?, ?files/s]

Computing checksums:  17%|#7        | 4320/25000 [00:05<00:23, 863.87it/s]

Generating train split: 0 examples [00:00, ? examples/s]


üìä New Dataset Statistics:
   Total samples: 25000
   Number of font classes: 466
   That's 12.5x more data than before!

ü§ñ Loading fresh ResNet-18 model...


Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-18 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([466]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 512]) in the checkpoint and torch.Size([466, 512]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


   ‚úì Fresh model loaded

üîÑ Preprocessing images...


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

   Splitting into train/validation sets...
   Training samples: 20000
   Validation samples: 5000

‚öôÔ∏è Configuring training...

üìã Training Configuration:
   Device: CPU
   Epochs: 15
   Batch size: 8
   Training samples: 20000
   Estimated time: 60-90 minutes on CPU (larger dataset)

üöÄ Starting retraining...
   This will take 60-90 minutes with the larger dataset
   You'll see progress updates every 50 steps
   Go grab some coffee! ‚òï





Epoch,Training Loss,Validation Loss,Accuracy
1,4.2262,3.815608,0.2484
2,2.4978,1.921287,0.5404
3,1.2789,0.982495,0.7448
4,0.8822,0.633088,0.8182
5,0.4842,0.439687,0.8682
6,0.2541,0.35487,0.8938
7,0.2097,0.298245,0.911
8,0.1519,0.253947,0.9226
9,0.0921,0.25382,0.9238
10,0.0827,0.256413,0.9302



‚úÖ Retraining complete!

üìä Final Evaluation:


   Final Accuracy: 94.28%
   Final Loss: 0.2560

üíæ Saving improved model...

‚úÖ Improved model saved to: ./tesserae_google_fonts_model_v2
   This should perform MUCH better than v1!

üéâ Ready to test the improved model!


In [9]:
print("="*60)
print("TESTING IMPROVED MODEL (v2)")
print("="*60)

from transformers import AutoImageProcessor, AutoModelForImageClassification
from PIL import Image
import torch

print("\nü§ñ Loading improved model v2...")

# Load your improved model
processor = AutoImageProcessor.from_pretrained("./tesserae_google_fonts_model_v2")
model = AutoModelForImageClassification.from_pretrained("./tesserae_google_fonts_model_v2")

print("‚úÖ Model v2 loaded!")
print(f"   Can identify {len(model.config.id2label)} font families")
# 94.28% is the figure reported by trainer.evaluate() on the held-out 20%
# split during retraining, i.e. validation accuracy, not training accuracy.
print("   Validation accuracy: 94.28%")

def identify_google_font_v2(image_path, top_k=5):
    """Identify font using improved v2 model.

    Uses the module-level ``processor`` and ``model``.

    Args:
        image_path: Path of the image file to classify.
        top_k: Maximum number of predictions to return (default 5). It is
            capped at the number of classes the model outputs, so a model
            with fewer than ``top_k`` classes no longer makes ``topk`` raise.

    Returns:
        A list of ``{'font_family': str, 'confidence': float}`` dicts ordered
        from most to least likely; ``confidence`` is a softmax probability in
        the range 0-1 (not a percentage).
    """
    # The context manager closes the file handle; convert() hands back an
    # image that stays usable after the file is closed.
    with Image.open(image_path) as source_image:
        image = source_image.convert('RGB')
    inputs = processor(images=image, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get the top predictions, never asking for more than the classes available
    k = min(top_k, probabilities.shape[-1])
    top_prob, top_idx = torch.topk(probabilities, k)

    results = []
    for prob, idx in zip(top_prob[0], top_idx[0]):
        results.append({
            'font_family': model.config.id2label[idx.item()],
            'confidence': prob.item()
        })

    return results

# Test with your test image.
# The file name lives in one variable so the message and the file that is
# actually classified cannot drift apart (the message used to say test3.png
# while test4.png was evaluated).
test_image = "test4.png"
print(f"\nüß™ Testing with {test_image}...")
print("="*60)

results = identify_google_font_v2(test_image)

print("\nüéØ MODEL v2 RESULTS:")
for i, result in enumerate(results, 1):
    print(f"  {i}. {result['font_family']:<30} {result['confidence']:>6.1%}")

print("\nüìä Comparison:")
print("   v1 (200 fonts): rochester (11.9%) ‚ùå")
# 'confidence' is a 0-1 probability, so "more than half" is > 0.5; the former
# `> 50` could never be true and the success mark could never be shown.
print(f"   v2 (1000 fonts): {results[0]['font_family']} ({results[0]['confidence']:.1%}) {'‚úÖ' if results[0]['confidence'] > 0.5 else '‚ö†Ô∏è'}")

print("\n" + "="*60)

# Let's also check what the actual fonts in test4.png were
print("\nüí° Reminder - test4.png contains:")
print("   ‚Ä¢ Arial (or similar sans-serif)")
print("   ‚Ä¢ Courier New (monospace)")
print("   ‚Ä¢ Times New Roman (serif)")
print("   ‚Ä¢ Pacifico (script)")

print("\n" + "="*60)
print("‚úÖ Testing complete!")

TESTING IMPROVED MODEL (v2)

ü§ñ Loading improved model v2...
‚úÖ Model v2 loaded!
   Can identify 466 font families
   Training accuracy: 94.28%

üß™ Testing with test3.png...

üéØ MODEL v2 RESULTS:
  1. courierprime                    11.1%
  2. areyouserious                    5.3%
  3. cutivemono                       5.0%
  4. abyssinicasil                    3.7%
  5. amethysta                        3.0%

üìä Comparison:
   v1 (200 fonts): rochester (11.9%) ‚ùå
   v2 (1000 fonts): courierprime (11.1%) ‚ö†Ô∏è


üí° Reminder - test4.png contains:
   ‚Ä¢ Arial (or similar sans-serif)
   ‚Ä¢ Courier New (monospace)
   ‚Ä¢ Times New Roman (serif)
   ‚Ä¢ Pacifico (script)

‚úÖ Testing complete!
