# **Data Analysis**

In [None]:
import pandas as pd

# Read the complete metadata table; later cells partition it on the `split` column.
metadata_csv_path = "metadata.csv"
df = pd.read_csv(metadata_csv_path)

In [None]:
# Print column dtypes and non-null counts for the full metadata table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15209 entries, 0 to 15208
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     15209 non-null  int64 
 1   identity     13074 non-null  object
 2   path         15209 non-null  object
 3   date         11302 non-null  object
 4   orientation  14506 non-null  object
 5   species      13821 non-null  object
 6   split        15209 non-null  object
 7   dataset      15209 non-null  object
dtypes: int64(1), object(7)
memory usage: 950.7+ KB


In [None]:
# Partition the metadata into two frames by the value of the `split` column.
is_database_row = df['split'].eq('database')
is_query_row = df['split'].eq('query')

database_df = df.loc[is_database_row]
query_df = df.loc[is_query_row]

In [None]:
# Print column dtypes and non-null counts for the 'database' partition.
database_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13074 entries, 0 to 14708
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     13074 non-null  int64 
 1   identity     13074 non-null  object
 2   path         13074 non-null  object
 3   date         10113 non-null  object
 4   orientation  12871 non-null  object
 5   species      11686 non-null  object
 6   split        13074 non-null  object
 7   dataset      13074 non-null  object
dtypes: int64(1), object(7)
memory usage: 919.3+ KB


In [None]:
# Print the dtype and non-null count of the `identity` column of the 'database' partition.
database_df["identity"].info()

<class 'pandas.core.series.Series'>
Index: 13074 entries, 0 to 14708
Series name: identity
Non-Null Count  Dtype 
--------------  ----- 
13074 non-null  object
dtypes: object(1)
memory usage: 204.3+ KB


In [None]:
# `database_df` is already in memory from the split cell above, so it is used
# directly. Re-reading 'database_metadata.csv' here would fail on a fresh
# "Restart & Run All" because that file is only written further down in the
# notebook.
identity_counts = database_df['identity'].value_counts()

# Identities that occur exactly once in the database partition.
single_occurrence_identities = identity_counts[identity_counts == 1].index

print(single_occurrence_identities)
print()
print(len(single_occurrence_identities))

Index(['SalamanderID2025_250', 'SalamanderID2025_216', 'SalamanderID2025_217',
       'SalamanderID2025_221', 'SalamanderID2025_222', 'SalamanderID2025_223',
       'SalamanderID2025_224', 'SalamanderID2025_225', 'SalamanderID2025_229',
       'SeaTurtleID2022_t103',
       ...
       'SalamanderID2025_126', 'SalamanderID2025_125', 'SalamanderID2025_124',
       'SalamanderID2025_167', 'SalamanderID2025_166', 'SalamanderID2025_163',
       'SalamanderID2025_159', 'SalamanderID2025_158', 'SalamanderID2025_154',
       'SalamanderID2025_153'],
      dtype='object', name='identity', length=317)

317


In [None]:
# Print column dtypes and non-null counts for the 'query' partition.
query_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2135 entries, 3 to 15208
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     2135 non-null   int64 
 1   identity     0 non-null      object
 2   path         2135 non-null   object
 3   date         1189 non-null   object
 4   orientation  1635 non-null   object
 5   species      2135 non-null   object
 6   split        2135 non-null   object
 7   dataset      2135 non-null   object
dtypes: int64(1), object(7)
memory usage: 150.1+ KB


In [None]:
# Persist both partitions (without the pandas index) so later cells and the
# generated scripts can load them from disk.
for frame, csv_name in (
    (database_df, "database_metadata.csv"),
    (query_df, "query_metadata.csv"),
):
    frame.to_csv(csv_name, index=False)

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# 1. Load the database_metadata.csv
database_df = pd.read_csv('database_metadata.csv')

# 2. Find identities that appear only once.
#    train_test_split(stratify=...) rejects classes with a single member, so
#    these rows are kept out of the stratified split and added to the
#    training set in step 5.
identity_counts = database_df['identity'].value_counts()
single_occurrence_identities = identity_counts[identity_counts == 1].index

# 3. Split the data with one membership mask, computed once and reused.
is_single_occurrence = database_df['identity'].isin(single_occurrence_identities)

# - Images with identities appearing only once
single_occurrence_df = database_df[is_single_occurrence]
print("single_occurrence_df:")
single_occurrence_df.info()
print()

# - Images with identities appearing at least twice
normal_df = database_df[~is_single_occurrence]
print("normal_df:")
normal_df.info()
print()

# 4. Perform stratified train/validation split on normal identities
train_normal_df, test_normal_df = train_test_split(
    normal_df,
    test_size=0.2,
    stratify=normal_df['identity'],
    random_state=42
)
print("train_normal.df:")
train_normal_df.info()
print()
print("test_normal.df:")
test_normal_df.info()
print()

# 5. Combine all single-occurrence identities + 80% of normal identities into the training set
train_df = pd.concat([train_normal_df, single_occurrence_df]).reset_index(drop=True)

# Every database row must land in exactly one of the two output files.
assert len(train_df) + len(test_normal_df) == len(database_df), \
    "train/test split lost or duplicated rows"

# 6. Save the CSV files
train_df.to_csv('train_database_metadata.csv', index=False)
test_normal_df.to_csv('test_database_metadata.csv', index=False)

print(f"Train set: {len(train_df)} images")
print(f"Test set: {len(test_normal_df)} images")

# `split` keeps its original meaning (test-to-train ratio) but is now labelled
# as such; the held-out fraction of the whole database is reported separately
# because the ratio alone is easy to misread as that fraction.
split = len(test_normal_df) / len(train_df)
test_fraction = len(test_normal_df) / (len(train_df) + len(test_normal_df))
print(f"Split (test/train ratio): {split}")
print(f"Test fraction of all database images: {test_fraction}")

single_occurrence_df:
<class 'pandas.core.frame.DataFrame'>
Index: 317 entries, 678 to 6309
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     317 non-null    int64 
 1   identity     317 non-null    object
 2   path         317 non-null    object
 3   date         308 non-null    object
 4   orientation  317 non-null    object
 5   species      7 non-null      object
 6   split        317 non-null    object
 7   dataset      317 non-null    object
dtypes: int64(1), object(7)
memory usage: 22.3+ KB

normal_df:
<class 'pandas.core.frame.DataFrame'>
Index: 12757 entries, 0 to 13073
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   image_id     12757 non-null  int64 
 1   identity     12757 non-null  object
 2   path         12757 non-null  object
 3   date         9805 non-null   object
 4   orientation  12554 non-null  object
 5   species 

In [None]:
import pandas as pd

# Reload both splits from disk so the check covers exactly what was saved.
train_df = pd.read_csv('train_database_metadata.csv')
test_df = pd.read_csv('test_database_metadata.csv')

train_identities = set(train_df['identity'].unique())
test_identities = set(test_df['identity'].unique())

# Identities present in the test file but absent from the train file.
missing_identities = test_identities.difference(train_identities)

if missing_identities:
    print(f"❌ {len(missing_identities)} identities in the test set are missing from the train set!")
    print("Missing identities:")
    print(missing_identities)
else:
    print("✅ All identities in the test set are present in the train set.")

✅ All identities in the test set are present in the train set.


# **Create Dataloaders**

In [None]:
%%writefile create_dataloaders.py
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import torchvision.transforms as T
import torch

class AnimalDataset(Dataset):
    """Image dataset driven by a metadata CSV.

    Each item is ``(image, label, img_path, identity)``:
    ``image`` is the RGB image (after ``transform`` when one is given),
    ``label`` is ``label_encoder[identity]``, ``img_path`` is ``root_dir``
    joined with the row's ``path`` value, and ``identity`` is the raw value
    of the row's ``identity`` column.
    """

    def __init__(self, csv_file, root_dir, label_encoder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_encoder = label_encoder
        # Inverse lookup: encoded label -> identity.
        self.label_decoder = dict((code, name) for name, code in label_encoder.items())

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Fetch the metadata row once and read both fields from it.
        row = self.data.iloc[idx]
        img_path = os.path.join(self.root_dir, row['path'])
        identity = row['identity']

        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)

        # Extra fields (path, raw identity) are returned alongside the label.
        return image, self.label_encoder[identity], img_path, identity

# Root directory where images are located, and the two metadata splits.
root = './animal-clef-2025'
train_csv = "train_database_metadata.csv"
test_csv = "test_database_metadata.csv"

# Simple resize for displaying images
transform_display = T.Compose([T.Resize([384, 384])])

# Model-input pipeline: the display resize, then tensor conversion and
# per-channel normalisation.
transform = T.Compose(
    list(transform_display.transforms)
    + [
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

# Class indices follow the sorted order of the identities in the training CSV.
train_identities = pd.read_csv(train_csv)['identity'].unique()
label_encoder = dict(zip(sorted(train_identities), range(len(train_identities))))

# Datasets: both splits share everything except the CSV they read.
shared_dataset_args = dict(root_dir=root, label_encoder=label_encoder, transform=transform)
train_dataset = AnimalDataset(csv_file=train_csv, **shared_dataset_args)
test_dataset = AnimalDataset(csv_file=test_csv, **shared_dataset_args)

batch_size = 2

def collate_fn(batch):
    """Merge a list of ``(image, label, path, identity)`` samples into one batch.

    Images are stacked into a single tensor and labels are converted to a
    tensor; paths and identities are returned as tuples in sample order.
    """
    image_seq, label_seq, path_seq, identity_seq = zip(*batch)
    stacked_images = torch.stack(image_seq)
    label_tensor = torch.tensor(label_seq)
    return stacked_images, label_tensor, path_seq, identity_seq

# Both loaders share batch size, worker count and the custom collate function;
# only the training loader shuffles.
loader_options = dict(batch_size=batch_size, num_workers=2, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"Number of training batches: {len(train_loader)}")
print(f"Number of testing batches: {len(test_loader)}")

# Pull one batch from each loader and print its contents as a smoke test.
images, labels, paths, identities = next(iter(train_loader))

print(f"\nTrain batch - images shape: {images.shape}")
print(f"Train batch - labels shape: {labels.shape}")
print(f"Train batch - labels example: {labels[:5]}")

for path, label, identity in zip(paths, labels, identities):
    print(f"[Train] Image Path: {path} | Label ID: {label.item()} | Identity: {identity}")

images_test, labels_test, paths_test, identities_test = next(iter(test_loader))

print(f"\nTest batch - images shape: {images_test.shape}")
print(f"Test batch - labels shape: {labels_test.shape}")
print(f"Test batch - labels example: {labels_test[:5]}")

for path, label, identity in zip(paths_test, labels_test, identities_test):
    print(f"[Test] Image Path: {path} | Label ID: {label.item()} | Identity: {identity}")


Overwriting create_dataloaders.py


In [None]:
%%writefile create_dataloaders.py
from torch.utils.data import Dataset, DataLoader
from PIL import Image
import pandas as pd
import os
import torchvision.transforms as T

class AnimalDataset(Dataset):
    """Image dataset driven by a metadata CSV.

    Each item is ``(image, label)``: the RGB image found at ``root_dir``
    joined with the row's ``path`` value (after ``transform`` when one is
    given), and ``label_encoder`` applied to the row's ``identity`` value.
    """

    def __init__(self, csv_file, root_dir, label_encoder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_encoder = label_encoder

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Fetch the metadata row once and read both fields from it.
        row = self.data.iloc[idx]

        # Load the image from its full path and force three channels.
        image = Image.open(os.path.join(self.root_dir, row['path'])).convert('RGB')

        # Apply transformations (if any)
        if self.transform:
            image = self.transform(image)

        # The label is the encoded value of the "identity" column.
        return image, self.label_encoder[row['identity']]

# Root directory where images are located, and the two metadata splits.
root = './animal-clef-2025'
train_csv = "train_database_metadata.csv"
test_csv = "test_database_metadata.csv"

# Simple resize for displaying images
transform_display = T.Compose([T.Resize([384, 384])])

# Transform for training / evaluation: the display resize, then tensor
# conversion and per-channel normalisation.
transform = T.Compose(
    list(transform_display.transforms)
    + [
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ]
)

# Class indices follow the sorted order of the identities in the training CSV.
train_identities = pd.read_csv(train_csv)['identity'].unique()
label_encoder = dict(zip(sorted(train_identities), range(len(train_identities))))

# Datasets: both splits share everything except the CSV they read.
shared_dataset_args = dict(root_dir=root, label_encoder=label_encoder, transform=transform)
train_dataset = AnimalDataset(csv_file=train_csv, **shared_dataset_args)
test_dataset = AnimalDataset(csv_file=test_csv, **shared_dataset_args)

batch_size = 2

# Only the training loader shuffles; the other options are shared.
loader_options = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

print(f"Number of training batches: {len(train_loader)}")
print(f"Number of testing batches: {len(test_loader)}")

# Pull one batch from each loader and print its shapes as a smoke test.
images, labels = next(iter(train_loader))

print(f"\nTrain batch - images shape: {images.shape}")
print(f"Train batch - labels shape: {labels.shape}")
print(f"Train batch - labels example: {labels[:5]}")

images_test, labels_test = next(iter(test_loader))

print(f"\nTest batch - images shape: {images_test.shape}")
print(f"Test batch - labels shape: {labels_test.shape}")
print(f"Test batch - labels example: {labels_test[:5]}")

Writing create_dataloaders.py


# **Fine-Tune**

In [None]:
%%writefile fine_tune_animal_clef.py
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from transformers import get_scheduler
import timm
import csv
import torch.nn as nn
import torch
import torch.optim as optim
import pandas as pd
import os
import torchvision.transforms as T
import matplotlib.pyplot as plt

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

class AnimalDataset(Dataset):
    """Image dataset driven by a metadata CSV.

    Each item is ``(image, label)``: the RGB image found at ``root_dir``
    joined with the row's ``path`` value (after ``transform`` when one is
    given), and ``label_encoder`` applied to the row's ``identity`` value.
    """

    def __init__(self, csv_file, root_dir, label_encoder, transform=None):
        self.data = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        self.label_encoder = label_encoder

    def __len__(self):
        return self.data.shape[0]

    def __getitem__(self, idx):
        # Fetch the metadata row once and read both fields from it.
        row = self.data.iloc[idx]

        # Load the image from its full path and force three channels.
        image = Image.open(os.path.join(self.root_dir, row['path'])).convert('RGB')

        # Apply transformations (if any)
        if self.transform:
            image = self.transform(image)

        # The label is the encoded value of the "identity" column.
        return image, self.label_encoder[row['identity']]

def train_one_epoch(model, dataloader, optimizer, scheduler, criterion, device, epoch):
    """Run one optimisation pass over ``dataloader``.

    The scheduler is stepped right after every optimiser step, i.e. once per
    batch rather than once per epoch. ``epoch`` is only used for the progress
    bar label.

    Returns:
        The sum of the per-batch loss values divided by the number of batches.
    """
    model.train()
    loss_total = 0.0

    progress = tqdm(dataloader, desc=f"Training (Epoch {epoch+1})")
    for images, labels in progress:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        loss = criterion(model(images), labels)
        loss.backward()

        optimizer.step()
        scheduler.step()

        loss_total += loss.item()

    return loss_total / len(dataloader)

def validate(model, dataloader, criterion, device, epoch):
    """Evaluate ``model`` on ``dataloader`` without gradient tracking.

    Args:
        model: network to evaluate; switched to eval mode here.
        dataloader: iterable yielding ``(images, labels)`` batches.
        criterion: loss callable applied to ``(outputs, labels)``.
        device: device the batches are moved to.
        epoch: zero-based epoch index, used only for the progress bar label.

    Returns:
        ``(mean_loss, accuracy)``. ``mean_loss`` is averaged over samples, so
        a short final batch no longer carries the same weight as a full one.
        ``accuracy`` is the fraction of samples whose arg-max prediction
        equals the label.

    Raises:
        ValueError: if the dataloader yields no samples (previously this
            surfaced as an unexplained ZeroDivisionError).
    """
    model.eval()
    loss_sum = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in tqdm(dataloader, desc=f"Validating (Epoch {epoch+1})"):
            images, labels = images.to(device), labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            batch_count = labels.size(0)
            # NOTE(review): assumes `criterion` returns the batch mean, which
            # is the default reduction of the nn.CrossEntropyLoss() this
            # script passes in. Scaling by the batch size turns it back into
            # a per-sample sum.
            loss_sum += loss.item() * batch_count

            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += batch_count

    if total == 0:
        raise ValueError("validate() received no samples: the dataloader is empty")

    return loss_sum / total, correct / total

# Root directory where images are located, and the two metadata splits.
root = './animal-clef-2025'
train_csv = "train_database_metadata.csv"
test_csv = "test_database_metadata.csv"

# Per-channel normalisation statistics shared by both pipelines.
norm_mean = (0.485, 0.456, 0.406)
norm_std = (0.229, 0.224, 0.225)

# Training pipeline: PIL-level augmentations first, then tensor conversion
# and normalisation, and finally random erasing on the tensor.
train_augmentations = [
    T.RandomResizedCrop(384, scale=(0.8, 1.0)),
    T.RandomHorizontalFlip(p=0.5),
    T.RandomRotation(degrees=10),
    T.ColorJitter(0.4, 0.4, 0.4, 0.1),
    T.AutoAugment(policy=T.AutoAugmentPolicy.IMAGENET),
]
train_transform = T.Compose(
    train_augmentations
    + [
        T.ToTensor(),
        T.Normalize(mean=norm_mean, std=norm_std),
        T.RandomErasing(p=0.25, scale=(0.02, 0.33)),
    ]
)

# Validation pipeline: deterministic resize, tensor conversion, normalisation.
test_transform = T.Compose([
    T.Resize([384, 384]),
    T.ToTensor(),
    T.Normalize(mean=norm_mean, std=norm_std),
])

# Class indices follow the sorted order of the identities in the training CSV.
train_identities = pd.read_csv(train_csv)['identity'].unique()
label_encoder = dict(zip(sorted(train_identities), range(len(train_identities))))

# Datasets: same root and encoder, split-specific CSV and transform.
shared_dataset_args = dict(root_dir=root, label_encoder=label_encoder)
train_dataset = AnimalDataset(csv_file=train_csv, transform=train_transform, **shared_dataset_args)
test_dataset = AnimalDataset(csv_file=test_csv, transform=test_transform, **shared_dataset_args)

batch_size = 8

# Only the training loader shuffles; the other options are shared.
loader_options = dict(batch_size=batch_size, num_workers=2)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_options)

# The architecture name lives in one variable so that it can also be written
# into the checkpoint; a loader can then rebuild the matching network instead
# of hard-coding (and possibly mismatching) the name.
# Other candidates previously listed here:
#   'vit_large_patch16_384'
#   'convnext_xlarge.fb_in22k_ft_in1k_512'
model_name = 'swin_large_patch4_window12_384'

model = timm.create_model(
    model_name,
    pretrained=True,
    num_classes=len(label_encoder)
)
model = model.to(device)

num_epochs = 10
lr = 8e-5
weight_decay = 0.01

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

# The scheduler is stepped once per batch, so its horizon is counted in
# batches: 10% linear warm-up, then linear decay over the remaining steps.
num_training_steps = len(train_loader) * num_epochs
num_warmup_steps = int(0.1 * num_training_steps)

scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

metrics_csv = "epoch_metrics.csv"

# The header is written only when the file does not exist yet; rows from
# later runs are appended to the same file.
if not os.path.exists(metrics_csv):
    with open(metrics_csv, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["epoch", "train_loss", "val_loss", "val_acc", "learning_rate"])

train_losses = []
val_losses = []
val_accuracies = []
learning_rates = []

best_val_loss = float('inf')
# Start below any reachable accuracy so that an epoch with 0.0 accuracy can
# still produce a checkpoint; with a 0.0 start and the strict `>` test below,
# such an epoch would never be saved and inference would find no model file.
best_val_acc = float('-inf')
best_model_path = "best_model.pth"

for epoch in range(num_epochs):
    train_loss = train_one_epoch(model, train_loader, optimizer, scheduler, criterion, device, epoch)
    val_loss, val_acc = validate(model, test_loader, criterion, device, epoch)

    # Learning rate in effect after the last scheduler step of this epoch.
    current_lr = scheduler.get_last_lr()[0]
    learning_rates.append(current_lr)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    val_accuracies.append(val_acc)

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Val Loss: {val_loss:.4f} | "
          f"Val Acc: {val_acc:.4f} | "
          f"LR: {current_lr:.8f}")

    with open(metrics_csv, mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow([epoch+1, train_loss, val_loss, val_acc, current_lr])

    # Save best model if val_loss decreased and val_acc increased
    if val_loss < best_val_loss and val_acc > best_val_acc:
        best_val_loss = val_loss
        best_val_acc = val_acc
        torch.save({
            'model_state_dict': model.state_dict(),
            'label_encoder': label_encoder,
            # Extra key; loaders that only read the two keys above are unaffected.
            'model_name': model_name,
        }, best_model_path)
        print(f"✅ Best model saved at epoch {epoch+1} with Val Loss {val_loss:.4f} and Val Acc {val_acc:.4f}")

# One figure per metric. Each entry is
# (output file, title, y-axis label, [(series, legend label), ...]).
plot_specs = [
    # Train and validation loss share one figure.
    ("loss_plot.png", "Training and Validation Loss", "Loss",
     [(train_losses, "Train Loss"), (val_losses, "Validation Loss")]),
    # Validation accuracy.
    ("accuracy_plot.png", "Validation Accuracy Over Epochs", "Accuracy",
     [(val_accuracies, "Validation Accuracy")]),
    # Learning rate recorded at the end of each epoch.
    ("learning_rate_plot.png", "Learning Rate Over Epochs", "Learning Rate",
     [(learning_rates, "Learning Rate")]),
]

for out_file, plot_title, y_label, series_list in plot_specs:
    for values, legend_label in series_list:
        plt.plot(values, label=legend_label)
    plt.legend()
    plt.title(plot_title)
    plt.xlabel("Epoch")
    plt.ylabel(y_label)
    plt.grid()
    plt.savefig(out_file)
    # Close the figure so the next metric starts from an empty canvas.
    plt.close()

Overwriting fine_tune_animal_clef.py


# **Inference**

In [None]:
%%writefile inference_query.py
import torch
import timm
from torchvision import transforms
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_path = "best_model.pth"
checkpoint = torch.load(model_path, map_location=device)

# The checkpoint carries the identity -> class-index mapping used in training;
# invert it to turn predicted indices back into identities.
label_encoder = checkpoint['label_encoder']
idx_to_label = {v: k for k, v in label_encoder.items()}

# The network built here must be the architecture the checkpoint was trained
# with, otherwise load_state_dict() fails on mismatched parameter names.
# The fine-tuning script in this notebook trains
# 'swin_large_patch4_window12_384' (not the ConvNeXt model previously
# hard-coded here). A checkpoint that records its own 'model_name' takes
# precedence over this default.
model_name = checkpoint.get('model_name', 'swin_large_patch4_window12_384')

model = timm.create_model(
    model_name,
    pretrained=False,  # weights come from the checkpoint, not from the hub
    num_classes=len(label_encoder)
)
model.load_state_dict(checkpoint['model_state_dict'])
model = model.to(device)
model.eval()

# Deterministic preprocessing: resize, tensor conversion, normalisation.
transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
])

def predict(image_path, threshold=0.6):
    """Classify one image and apply a confidence threshold.

    Args:
        image_path: path of the image file to classify.
        threshold: minimum top-1 softmax probability required to accept the
            predicted identity.

    Returns:
        ``(label, confidence)`` where ``label`` is the predicted identity when
        ``confidence >= threshold`` and the string ``"new_individual"``
        otherwise; ``confidence`` is the top-1 softmax probability.
    """
    rgb_image = Image.open(image_path).convert('RGB')
    # Add a leading batch dimension of size one before moving to the device.
    batch = transform(rgb_image).unsqueeze(0)
    batch = batch.to(device)

    with torch.no_grad():
        probs = torch.softmax(model(batch), dim=1)
        conf, preds = probs.max(dim=1)
        confidence = conf.item()
        predicted_idx = preds.item()

    if confidence >= threshold:
        return idx_to_label[predicted_idx], confidence
    return "new_individual", confidence

root = "./animal-clef-2025"

query_csv = "query_metadata.csv"
query_data = pd.read_csv(query_csv)

predictions = []
confidences = []

# Walk the `path` column directly; one prediction per query image.
for img_rel_path in tqdm(query_data['path'], total=len(query_data), desc="Predicting"):
    img_full_path = os.path.join(root, img_rel_path)

    try:
        pred_label, conf = predict(img_full_path)
    except Exception as e:
        # A failing image is reported and recorded with the placeholder
        # label "error" and zero confidence; the loop continues.
        print(f"❌ Eroare la imaginea {img_full_path}: {e}")
        pred_label, conf = "error", 0.0

    predictions.append(pred_label)
    confidences.append(conf)

# Attach the results to the query table and write it out.
query_data['identity'] = predictions
query_data['confidence'] = confidences

output_csv = "query_with_predictions.csv"
query_data.to_csv(output_csv, index=False)

print(f"✅ File saved at {output_csv}")

Writing inference_query.py


# **Sample Submission**

In [None]:
import pandas as pd

query_predictions = pd.read_csv('query_with_predictions.csv')

# Keep only the two columns written to the submission file.
submission_columns = ['image_id', 'identity']
submission = query_predictions.loc[:, submission_columns]

submission.to_csv('sample_submission.csv', index=False)

print("✅ sample_submission.csv done!")

✅ sample_submission.csv done!
