In [6]:
# --- Import Libraries ---
import torch
from datasets import load_dataset, Image, DatasetDict
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    TrainingArguments,
    Trainer
)
from torchvision.transforms import (
    Compose,
    Normalize,
    RandomResizedCrop,
    RandomHorizontalFlip,
    ToTensor,
    Resize
)
import numpy as np
import evaluate
import os

# ResNet

In [7]:
MODEL_CHECKPOINT = 'microsoft/resnet-50'

LEARNING_RATE = 3e-5 # ViTs often benefit from smaller LRs
BATCH_SIZE = 16 # Adjust based on your GPU memory
NUM_EPOCHS = 5 # Start with a few epochs, increase if needed
WEIGHT_DECAY = 0.01
REMOVE_UNUSED_COLUMNS = False # Important for image classification

In [8]:


# --- Configuration ---

DATASET_PATH = 'c:/users/cauchepy/Datasets/ComputerVisionImages/kaggle_terraintypes'
SAVE_NAME = MODEL_CHECKPOINT.split("/")[-1] + "/kaggle_terraintypes"
OUTPUT_DIR = f"./{SAVE_NAME}_results"




In [9]:
# --- 1. Load Data ---
print("Loading dataset...")

# Load dataset using 'imagefolder'
full_dataset = load_dataset("imagefolder", data_dir=DATASET_PATH, split="train")



Loading dataset...


Resolving data files:   0%|          | 0/3196 [00:00<?, ?it/s]

In [13]:
# --- 2. Preprocessing ---
print("Setting up preprocessing...")

image_processor = AutoImageProcessor.from_pretrained(MODEL_CHECKPOINT)

labels = full_dataset.features["label"].names
label2id = {label: i for i, label in enumerate(labels)}
id2label = {i: label for i, label in enumerate(labels)}
num_classes = len(labels)
print(f"Found classes: {labels}")
print(f"Number of classes: {num_classes}")

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)

_train_transforms = Compose(
    [
        RandomResizedCrop(image_processor.size["shortest_edge"]),
        RandomHorizontalFlip(),
        ToTensor(),
        normalize,
    ]
)

_val_transforms = Compose(
    [
        # torch.nn.Upsample(size=(image_processor.size["height"], image_processor.size["width"]), mode='bilinear', align_corners=False), 
        Resize(image_processor.size["shortest_edge"]), # Alternative resize
        ToTensor(),
        normalize,
    ]
)

def train_transforms(examples):
    # Assumes input column is named "image" by imagefolder loader
    examples['pixel_values'] = [_train_transforms(img.convert("RGB")) for img in examples['image']]
    # examples.pop("image", None) # Let Trainer handle column removal if needed
    return examples

def val_transforms(examples):
    examples['pixel_values'] = [_val_transforms(img.convert("RGB")) for img in examples['image']]
    # examples.pop("image", None)
    return examples

train_test_split = full_dataset.train_test_split(test_size=0.2, seed=42) # stratified by default
dataset_splits = DatasetDict({
    'train': train_test_split['train'],
    'test': train_test_split['test']
})

print("Applying transformations...")
dataset_splits["train"].set_transform(train_transforms)
dataset_splits["test"].set_transform(val_transforms)


def collate_fn(examples):
    # Stack tensors correctly
    pixel_values = torch.stack([example["pixel_values"] for example in examples])
    labels = torch.tensor([example["label"] for example in examples], dtype=torch.long)
    return {"pixel_values": pixel_values, "labels": labels}



Setting up preprocessing...
Found classes: ['Desert', 'Forest', 'Mountain', 'Plains']
Number of classes: 4
Applying transformations...


In [14]:
# --- 3. Model ---
print("Loading pre-trained model...")

model = AutoModelForImageClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=num_classes, 
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True, # Allow replacing the classifier head
)



Loading pre-trained model...


config.json:   0%|          | 0.00/69.6k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/102M [00:00<?, ?B/s]

Some weights of ResNetForImageClassification were not initialized from the model checkpoint at microsoft/resnet-50 and are newly initialized because the shapes did not match:
- classifier.1.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([4]) in the model instantiated
- classifier.1.weight: found shape torch.Size([1000, 2048]) in the checkpoint and torch.Size([4, 2048]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# --- 4. Metrics ---
print("Setting up metrics...")

accuracy = evaluate.load("accuracy") 
f1 = evaluate.load("f1")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
# cm = evaluate.load("confusion_matrix")


def compute_metrics(eval_pred):
    """Computes accuracy on a batch of predictions"""
    predictions = np.argmax(eval_pred.predictions, axis=1)
    labels = eval_pred.label_ids
    
    # Compute metrics
    accuracy_score = accuracy.compute(predictions=predictions, references=labels)
    f1_score = f1.compute(predictions=predictions, references=labels, average="weighted")
    precision_score = precision.compute(predictions=predictions, references=labels, average="weighted")
    recall_score = recall.compute(predictions=predictions, references=labels, average="weighted")
    # confusion_matrix = cm.compute(predictions=predictions, references=labels, labels=list(range(num_classes)))
    
    return {
        "accuracy": accuracy_score["accuracy"],
        "f1": f1_score["f1"],
        "precision": precision_score["precision"],
        "recall": recall_score["recall"],
        # "confusion_matrix": confusion_matrix,
    }


Setting up metrics...


In [20]:
# --- 5. Training ---
print("Configuring training...")

# Define Training Arguments
args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Training Hyperparameters
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE * 2, # Usually can use larger batch for eval
    num_train_epochs=NUM_EPOCHS,
    weight_decay=WEIGHT_DECAY,
    # Evaluation and Saving
    evaluation_strategy="epoch", # Evaluate at the end of each epoch
    save_strategy="epoch",      # Save model checkpoint at the end of each epoch
    load_best_model_at_end=True, # Load the best model found during training
    metric_for_best_model="accuracy", # Use accuracy to determine the best model
    # Technical settings
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=50, # Log training loss every N steps
    remove_unused_columns=REMOVE_UNUSED_COLUMNS, # Keep necessary columns like 'label'
    push_to_hub=False, # Set to True to upload model to Hugging Face Hub
    report_to="tensorboard", # Or "wandb" if you use Weights & Biases
    seed=42, # For reproducibility
)

# Instantiate Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=dataset_splits["train"],
    eval_dataset=dataset_splits["test"], # Using test split as validation here
    compute_metrics=compute_metrics,
    tokenizer=image_processor, # Pass processor for consistent saving
    data_collator=collate_fn,
)

# Start Training
print("Starting training...")
train_results = trainer.train()

# Save the final best model and processor
trainer.save_model()
trainer.save_state()
image_processor.save_pretrained(OUTPUT_DIR) # Save processor alongside model

print("Training finished.")

model.save_pretrained(f"./{SAVE_NAME}_model")
image_processor.save_pretrained(f"./{SAVE_NAME}_processor")



Configuring training...
Starting training...


  trainer = Trainer(


  0%|          | 0/800 [00:00<?, ?it/s]

{'loss': 1.0749, 'grad_norm': 10.472182273864746, 'learning_rate': 2.8125e-05, 'epoch': 0.31}
{'loss': 1.025, 'grad_norm': 9.202753067016602, 'learning_rate': 2.625e-05, 'epoch': 0.62}
{'loss': 0.959, 'grad_norm': 4.189481735229492, 'learning_rate': 2.4375e-05, 'epoch': 0.94}


  0%|          | 0/20 [00:00<?, ?it/s]

Trainer is attempting to log a value of "{'confusion_matrix': array([[138,   2,  15,   1],
       [ 10, 112,   0,  19],
       [  5,   3, 155,   0],
       [ 14,   4,   0, 162]], dtype=int64)}" of type <class 'dict'> for key "eval/confusion_matrix" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.9195715188980103, 'eval_accuracy': 0.8859375, 'eval_f1': 0.8854635031805722, 'eval_precision': 0.8879062556461518, 'eval_recall': 0.8859375, 'eval_confusion_matrix': {'confusion_matrix': array([[138,   2,  15,   1],
       [ 10, 112,   0,  19],
       [  5,   3, 155,   0],
       [ 14,   4,   0, 162]], dtype=int64)}, 'eval_runtime': 25.5722, 'eval_samples_per_second': 25.027, 'eval_steps_per_second': 0.782, 'epoch': 1.0}


TypeError: Object of type ndarray is not JSON serializable

In [21]:
# --- 6. Evaluation ---
print("Evaluating model on the test set...")
eval_results = trainer.evaluate(dataset_splits["test"])
print(f"Evaluation results: {eval_results}")

Evaluating model on the test set...


  0%|          | 0/20 [00:00<?, ?it/s]

Evaluation results: {'eval_loss': 0.019092431291937828, 'eval_accuracy': 0.9953125, 'eval_runtime': 51.2003, 'eval_samples_per_second': 12.5, 'eval_steps_per_second': 0.391, 'epoch': 5.0}


In [None]:
# --- 7. Prediction ---

print("\n--- Example Prediction ---")
from PIL import Image as PILImage
import requests

# Example: Load an image (replace with your own image path/URL)
# url = "https://wallpapercave.com/wp/JZQsFFO.jpg"
# image_to_predict = PILImage.open(requests.get(url, stream=True).raw)

image_to_predict = dataset_splits["test"][0]['image'].filename

print(f"Loading example image: {image_to_predict}")
image = PILImage.open(image_to_predict)

inputs = image_processor(images=image, return_tensors="pt")

inputs = {k: v.to(trainer.model.device) for k, v in inputs.items()}

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

predicted_class_idx = logits.argmax(-1).item()

predicted_class_label = model.config.id2label[predicted_class_idx]

print(f"Predicted class: {predicted_class_label} (Index: {predicted_class_idx})")


--- Example Prediction ---
Loading example image: <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1024x768 at 0x25189B4C8B0>


AttributeError: read

In [None]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
model_repo_id = "yanncauchepin/kaggle_terraintypes_resnet_model"
model.push_to_hub(model_repo_id)

processor_repo_id = "yanncauchepin/kaggle_terraintypes_resnet_processor"
image_processor.push_to_hub(processor_repo_id)

CommitInfo(commit_url='https://huggingface.co/yanncauchepin/kaggle_terraintypes_vit_processor/commit/639b623b7cc560313573b7e0d8949d7657cc7ef6', commit_message='Upload processor', commit_description='', oid='639b623b7cc560313573b7e0d8949d7657cc7ef6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/yanncauchepin/kaggle_terraintypes_vit_processor', endpoint='https://huggingface.co', repo_type='model', repo_id='yanncauchepin/kaggle_terraintypes_vit_processor'), pr_revision=None, pr_num=None)