# Setup and Installation
Install necessary libraries and dependencies, including the Florence 2 model and any required frameworks like PyTorch.

In [None]:
# Ensure pip is up-to-date
!pip install --upgrade pip

# Install PyTorch
!pip install torch torchvision torchaudio

# Install Hugging Face's transformers library for accessing Florence models
!pip install transformers

# Install additional libraries for data manipulation and visualization
!pip install numpy pandas matplotlib seaborn

# Verify installation by importing the libraries
import torch
from transformers import AutoModelForImageClassification, AutoTokenizer

# Check if CUDA is available for GPU acceleration
print("CUDA available: ", torch.cuda.is_available())

# Load and Prepare Dataset
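Load the dataset for classification (this notebook uses folder-per-class labels rather than object-detection annotations). Preprocess the images (resize, tensor conversion, normalization) to fit the model's input requirements, then split them into training and validation sets.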

In [None]:
import os

from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder

# Root directory of the dataset (replace the placeholder with a real path).
dataset_path = "path/to/your/dataset"

# Per-image preprocessing, applied in order: resize -> tensor -> normalize.
resize_step = transforms.Resize((224, 224))  # fixed 224x224 input size
to_tensor_step = transforms.ToTensor()  # PIL image -> PyTorch tensor
normalize_step = transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)  # channel-wise mean/std normalization
transformations = transforms.Compose([resize_step, to_tensor_step, normalize_step])

# ImageFolder derives the label of each image from the name of its parent
# folder, so the images are assumed to be organized in one folder per class.
dataset = ImageFolder(root=dataset_path, transform=transformations)

# 80% of the samples go to training, the remainder to validation.
sample_count = len(dataset)
train_size = int(0.8 * sample_count)
val_size = sample_count - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size]
)

# Batch the two subsets; only the training batches are shuffled.
batch_size = 32  # adjust to the available memory
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Summary of the split sizes
print(f"Total dataset size: {len(dataset)}")
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

# Initialize Florence 2 Model
Load the pre-trained Florence 2 model and prepare it for fine-tuning by setting up the appropriate configurations.

In [None]:
# Load the pre-trained Florence 2 model
# NOTE(review): confirm that this checkpoint id exists on the Hugging Face Hub
# and can be loaded through AutoModelForImageClassification. The published
# Florence-2 checkpoints (e.g. "microsoft/Florence-2-base") are, per their
# model cards, loaded with AutoModelForCausalLM / AutoProcessor and
# trust_remote_code=True, so this id may need to be replaced by a checkpoint
# that actually provides an image-classification head.
model_name = "microsoft/florence-base-1"

# Size the classification head to the dataset. Without this the head keeps the
# label count stored in the checkpoint, and the logits would not line up with
# the ImageFolder class indices used by the loss, the metrics and the plots.
# ignore_mismatched_sizes lets the head be re-initialized when the checkpoint
# was trained with a different number of labels.
num_classes = len(dataset.classes)
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    ignore_mismatched_sizes=True,
)

# Prepare the model for fine-tuning
model.train()

# Check if a GPU is available and move the model to GPU if it is
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Print the model's device to confirm it's set correctly
print(f"Model is using device: {device}")

# Fine-Tuning the Model
Fine-tune the Florence 2 model on the prepared dataset, adjusting hyperparameters as necessary for optimal performance.

In [None]:
# Loss and optimizer used for fine-tuning
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # tune the learning rate as needed

# How many full passes over the training set to run (adjustable)
num_epochs = 10

for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    running_loss = 0.0
    for batch_images, batch_labels in train_loader:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        optimizer.zero_grad()  # clear gradients left over from the previous step
        logits = model(batch_images).logits
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below yields a per-sample average.
        running_loss += loss.item() * batch_images.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

    # ---- validation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    val_running_loss = 0.0
    with torch.no_grad():
        for batch_images, batch_labels in val_loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)
            logits = model(batch_images).logits
            loss = criterion(logits, batch_labels)
            val_running_loss += loss.item() * batch_images.size(0)

    val_epoch_loss = val_running_loss / len(val_loader.dataset)
    print(f"Validation Loss: {val_epoch_loss:.4f}")

# Model Evaluation
Evaluate the fine-tuned model's classification performance on the validation set: overall accuracy, a confusion matrix, and per-class precision, recall, and F1-score.

In [None]:
# Model Evaluation
#
# Runs the validation set through the model once, then derives accuracy, the
# confusion matrix and the precision/recall/F1 report from the collected
# labels and predictions.
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Make sure layers such as dropout are in inference mode even when this cell
# is run without the training cell having just finished.
model.eval()

# Collect ground-truth labels and predictions in a single pass
all_labels = []
all_preds = []
with torch.no_grad():  # Inference mode, gradients not needed
    for images, labels in val_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        preds = outputs.logits.argmax(dim=1)
        all_labels.extend(labels.cpu().tolist())
        all_preds.extend(preds.cpu().tolist())

# Calculate accuracy on validation set
total = len(all_labels)
correct = sum(pred == label for pred, label in zip(all_preds, all_labels))
# Guard against an empty validation split instead of dividing by zero
accuracy = 100 * correct / total if total else 0.0
print(f'Accuracy of the model on the validation images: {accuracy:.2f}%')

# Pass the full list of class indices explicitly so the matrix and the report
# always have one row per dataset class, even when a class is missing from the
# validation split or is never predicted.
class_indices = list(range(len(dataset.classes)))

# Calculate and display confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=class_indices)
plt.figure(figsize=(10,10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap='Blues',
    xticklabels=dataset.classes,
    yticklabels=dataset.classes,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

# Calculate precision, recall, and F1-score
# zero_division=0 reports 0 (without warnings) for classes with no samples
print(
    classification_report(
        all_labels,
        all_preds,
        labels=class_indices,
        target_names=dataset.classes,
        zero_division=0,
    )
)

# Visualize Predictions
Visualize the model's predictions on sample images from the validation set to demonstrate its classification results.

In [None]:
# Visualize Predictions
import matplotlib.pyplot as plt
import numpy as np
from torchvision.utils import make_grid

# Select a few images from the validation set; showing only a handful keeps
# the image grid and its title readable.
num_to_show = 8
images, labels = next(iter(val_loader))
images = images[:num_to_show].to(device)
labels = labels[:num_to_show].to(device)

# Get predictions from the model
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(images)
    _, preds = torch.max(outputs.logits, 1)


# Function to display images
def imshow(inp, title=None):
    """Display a normalized image tensor with matplotlib.

    Args:
        inp: Image tensor in (channels, height, width) layout with three
            channels, normalized with the mean/std used in the dataset
            transforms.
        title: Optional text shown above the image.
    """
    # Move to host memory and reorder to (height, width, channels) for imshow
    inp = inp.detach().cpu().numpy().transpose((1, 2, 0))
    # Undo the normalization so colors are displayed in the [0, 1] range
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    inp = std * inp + mean
    inp = np.clip(inp, 0, 1)
    plt.figure(figsize=(15, 15))
    plt.imshow(inp)
    if title is not None:
        plt.title(title)
    plt.pause(0.001)  # pause a bit so that plots are updated


# Display images along with predicted labels, laid out in a single row so the
# comma-separated names in the title follow the images from left to right.
out = make_grid(images.cpu(), nrow=num_to_show)
predicted_names = [dataset.classes[idx] for idx in preds.tolist()]
imshow(out, title=", ".join(predicted_names))