# Bài 1: Bird Species Classification (40 pts)

Cho tập data Bird Species gồm ~84k ảnh của 525 nhãn, mỗi nhãn là tên khoa học của 1 loại chim tại Kaggle: https://www.kaggle.com/datasets/gpiosenka/100-bird-species. Yêu cầu:

1. Sử dụng các kiến thức đã học (CNN, Transfer Learning), tạo 1 cấu trúc mô hình phù hợp để phân loại 525 nhãn.


## 1. Load Data

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from sklearn.model_selection import train_test_split

In [3]:
# Thiết lập các tham số
batch_size = 32
learning_rate = 0.001
num_epochs = 10

# Đường dẫn đến tập dữ liệu
data_dir = "/kaggle/input/100-bird-species/"

# Áp dụng các biến đổi cho dữ liệu huấn luyện và kiểm thử
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

In [4]:
# Tạo DataLoader cho dữ liệu huấn luyện và kiểm thử 
# ImageFolder => label encoder already included
train_dataset = datasets.ImageFolder(root=data_dir + 'train', transform=transform_train)
test_dataset = datasets.ImageFolder(root=data_dir + 'test', transform=transform_test)
valid_dataset = datasets.ImageFolder(root=data_dir + 'valid', transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [5]:
# Số lượng mẫu
len(train_dataset)

84635

In [6]:
# Print all the classes
classes = train_dataset.classes
print("Len of train Classes:",len(classes))
print("Len of test Classes:",len(test_dataset.classes))

# Get the class-to-index mapping
class_to_idx = train_dataset.class_to_idx

# Print classes with their indices
for class_name, class_idx in class_to_idx.items():
    print(f"Class: {class_name}, Index: {class_idx}")

Len of train Classes: 525
Len of test Classes: 525
Class: ABBOTTS BABBLER, Index: 0
Class: ABBOTTS BOOBY, Index: 1
Class: ABYSSINIAN GROUND HORNBILL, Index: 2
Class: AFRICAN CROWNED CRANE, Index: 3
Class: AFRICAN EMERALD CUCKOO, Index: 4
Class: AFRICAN FIREFINCH, Index: 5
Class: AFRICAN OYSTER CATCHER, Index: 6
Class: AFRICAN PIED HORNBILL, Index: 7
Class: AFRICAN PYGMY GOOSE, Index: 8
Class: ALBATROSS, Index: 9
Class: ALBERTS TOWHEE, Index: 10
Class: ALEXANDRINE PARAKEET, Index: 11
Class: ALPINE CHOUGH, Index: 12
Class: ALTAMIRA YELLOWTHROAT, Index: 13
Class: AMERICAN AVOCET, Index: 14
Class: AMERICAN BITTERN, Index: 15
Class: AMERICAN COOT, Index: 16
Class: AMERICAN DIPPER, Index: 17
Class: AMERICAN FLAMINGO, Index: 18
Class: AMERICAN GOLDFINCH, Index: 19
Class: AMERICAN KESTREL, Index: 20
Class: AMERICAN PIPIT, Index: 21
Class: AMERICAN REDSTART, Index: 22
Class: AMERICAN ROBIN, Index: 23
Class: AMERICAN WIGEON, Index: 24
Class: AMETHYST WOODSTAR, Index: 25
Class: ANDEAN GOOSE, Inde

In [7]:
# In input và labels của batch đầu tiên
for batch_idx, (inputs, labels) in enumerate(train_loader):
    # Print the current batch
    print(f"Batch {batch_idx + 1} - Inputs: {inputs}, Labels: {labels}")

    # Print the size of the current batch (number of samples in the batch)
    print(f"Batch {batch_idx + 1} - Batch Size: {len(inputs)}")

    # Break after printing the first 5 batches
    if batch_idx == 0:
        break

Batch 1 - Inputs: tensor([[[[0.4510, 0.4471, 0.4510,  ..., 0.5490, 0.5569, 0.5647],
          [0.4510, 0.4510, 0.4549,  ..., 0.5451, 0.5529, 0.5608],
          [0.4471, 0.4471, 0.4510,  ..., 0.5451, 0.5529, 0.5569],
          ...,
          [0.4980, 0.5020, 0.4980,  ..., 0.3725, 0.3412, 0.3216],
          [0.4941, 0.5020, 0.5020,  ..., 0.3686, 0.3373, 0.3176],
          [0.4784, 0.4902, 0.5020,  ..., 0.3686, 0.3412, 0.3176]],

         [[0.4549, 0.4510, 0.4549,  ..., 0.5451, 0.5529, 0.5569],
          [0.4549, 0.4549, 0.4588,  ..., 0.5412, 0.5490, 0.5529],
          [0.4510, 0.4510, 0.4549,  ..., 0.5412, 0.5451, 0.5490],
          ...,
          [0.4784, 0.4784, 0.4784,  ..., 0.4235, 0.4000, 0.3882],
          [0.4745, 0.4824, 0.4863,  ..., 0.4196, 0.3961, 0.3843],
          [0.4667, 0.4784, 0.4902,  ..., 0.4235, 0.4000, 0.3843]],

         [[0.4000, 0.3961, 0.4000,  ..., 0.4745, 0.4863, 0.4980],
          [0.4000, 0.4000, 0.4039,  ..., 0.4706, 0.4824, 0.4941],
          [0.3961, 0.396

 # 2. Metrics

In [8]:
from sklearn import metrics as skmetrics
import numpy
class Metrics:
    def __init__(self, metric_names):
        self.metric_names = metric_names
        # initialize a metric dictionary
        self.metric_dict = {metric_name: [0] for metric_name in self.metric_names}

    def step(self, labels, preds):
        for metric in self.metric_names:
            # get the metric function
            do_metric = getattr(
                skmetrics, metric, "The metric {} is not implemented".format(metric)
            )
            # check if metric require average method, if yes set to 'micro' or 'macro' or 'None'
            try:
                self.metric_dict[metric].append(
                    do_metric(labels, preds, average="macro")
                )
            except:
                self.metric_dict[metric].append(do_metric(labels, preds))

    def epoch(self):
        # calculate metrics for an entire epoch
        avg = [sum(metric) / (len(metric) - 1) for metric in self.metric_dict.values()]
        metric_as_dict = dict(zip(self.metric_names, avg))
        return metric_as_dict

    def last_step_metrics(self):
        # return metrics of last steps
        values = [self.metric_dict[metric][-1] for metric in self.metric_names]
        metric_as_dict = dict(zip(self.metric_names, values))
        return metric_as_dict
    
train_metrics = Metrics(["accuracy_score","f1_score"])
valid_metrics = Metrics(["accuracy_score","f1_score"])

In [9]:
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from torch.optim import lr_scheduler

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.dim = dim

    def forward(self, pred, target):
        target = F.one_hot(target, num_classes=pred.size(-1))
        target = target.float()
        target = (1 - self.smoothing) * target + self.smoothing / pred.size(-1)
        log_pred = F.log_softmax(pred, dim=self.dim)
        loss = nn.KLDivLoss(reduction='batchmean')(log_pred, target)
        return loss

# 3. Model EfficientNet-B4 

In [10]:
# Chuyển mô hình và dữ liệu sang GPU nếu có sẵn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print (device)

cpu


In [11]:
# EfficientNet-B4 architecture from the torchvision library and modifying its classifier for transfer learning. 
# freezing the pre-trained weights of the base model and replacing the classifier with custom classifier.

# Download the weights manually
state_dict = torch.hub.load_state_dict_from_url("https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth")

# Create the model
model = models.efficientnet_b4(pretrained=True).to(device)

# Load the weights
model.load_state_dict(state_dict)

for param in model.parameters():
    param.requires_grad = False
    # freeze tất cả trọng số của model gốc
classifier = nn.Sequential(
    nn.Linear(in_features=model.classifier[1].in_features, out_features=256,bias=True),
    nn.Linear(in_features=256, out_features=525,bias=True)
)
model.classifier  = classifier


# Định nghĩa hàm loss và optimizer
#criterion = nn.CrossEntropyLoss()
criterion = LabelSmoothingLoss(smoothing=0.12)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Apply learning rate scheduling
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)



In [12]:
# Chuyển mô hình và dữ liệu sang GPU nếu có sẵn
import torch
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp

try:
    # Connect to TPU
    device = xm.xla_device()
    print("Running on TPU")
except ImportError:
    print("TPU not available. Running on CPU/GPU.")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


model.to(device)

model.load_state_dict(state_dict)

device

Running on TPU


RuntimeError: Error(s) in loading state_dict for EfficientNet:
	Missing key(s) in state_dict: "classifier.0.weight", "classifier.0.bias". 
	size mismatch for classifier.1.weight: copying a param with shape torch.Size([1000, 1792]) from checkpoint, the shape in current model is torch.Size([525, 256]).
	size mismatch for classifier.1.bias: copying a param with shape torch.Size([1000]) from checkpoint, the shape in current model is torch.Size([525]).

# 4. Def Training Epoch

In [None]:
def train_one_epoch(
    model,
    train_loader,
    test_loader,
    device,
    optimizer,
    criterion,
    train_metrics,
    val_metrics,
):

    # training-the-model
    train_loss = 0
    valid_loss = 0
    all_labels = []
    all_preds = []
    model.train()
    for data, target in train_loader:
        # move-tensors-to-GPU
        data = data.type(torch.FloatTensor).to(device)
        # target=torch.Tensor(target)
        target = target.float().to(device)
        # clear-the-gradients-of-all-optimized-variables
        optimizer.zero_grad()
        # forward-pass: compute-predicted-outputs-by-passing-inputs-to-the-model
        output = model(data)
        # get the prediction label and target label
        output = model(data)
        preds = torch.argmax(output, axis=1).cpu().detach().numpy()
        labels = target.cpu().numpy()
        # calculate-the-batch-loss
        loss = criterion(output.type(torch.FloatTensor), target.type(torch.LongTensor))
        # backward-pass: compute-gradient-of-the-loss-wrt-model-parameters
        loss.backward()
        # perform-a-ingle-optimization-step (parameter-update)
        optimizer.step()
        # update-training-loss
        train_loss += loss.item() * data.size(0)
        # calculate training metrics
        all_labels.extend(labels)
        all_preds.extend(preds)
    
    train_metrics.step(all_labels, all_preds)

    # validate-the-model
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for data, target in test_loader:
            data = data.type(torch.FloatTensor).to(device)
            target = target.to(device)
            output = model(data)
            preds = torch.argmax(output, axis=1).tolist()
            labels = target.tolist()
            all_labels.extend(labels)
            all_preds.extend(preds)
            loss = criterion(output, target)

            # update-average-validation-loss
            valid_loss += loss.item() * data.size(0)

    val_metrics.step(all_labels, all_preds)
    train_loss = train_loss / len(train_loader.sampler)
    valid_loss = valid_loss / len(test_loader.sampler)

    return (
        train_loss,
        valid_loss,
        train_metrics.last_step_metrics(),
        val_metrics.last_step_metrics(),
    )

# 5. Train model

In [None]:
import time
import logging
import numpy as np
from tqdm import tqdm

## 5.1 First 10 epochs

In [None]:
start_time = time.time()
best_val_acc = 0
print("begin training process")
for i in tqdm(range(0, num_epochs)):
    loss, val_loss, train_result, val_result = train_one_epoch(
        model,
        train_loader,
        valid_loader,
        device,
        optimizer,
        criterion,
        train_metrics,
        valid_metrics,
    )

    scheduler.step(val_loss)
    
    print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(i + 1, num_epochs, loss))
    print(train_result)
    print(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
    print(val_result)
    print("\n")
    
    # saving epoch with best validation accuracy
    if best_val_acc < float(val_result["accuracy_score"]):
        print("Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> Save best epoch")
        best_val_acc = val_result["accuracy_score"]
        torch.save(model.state_dict(),"./" +  "best.pt",)
    else:
        print("Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> No saving")
        continue

end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total runtime first 10 epoch : {elapsed_time} seconds")

## 5.2 Next 10 epochs

In [None]:
for param in model.parameters():
    param.requires_grad = True
    
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

start_time = time.time()
model = model.to(device)
best_val_acc = 0.8528
import logging
import numpy as np
print("begin training process")
for i in tqdm(range(0, num_epochs)):
    loss, val_loss, train_result, val_result = train_one_epoch(
        model,
        train_loader,
        valid_loader,
        device,
        optimizer,
        criterion,
        train_metrics,
        valid_metrics,
    )

    scheduler.step(val_loss)
    print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(i + 1, num_epochs, loss))
    print(train_result)
    print(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
    print(val_result)
    print("\n")
    
    # saving epoch with best validation accuracy
    if best_val_acc < float(val_result["accuracy_score"]):
        print("Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> Save best epoch")
        best_val_acc = val_result["accuracy_score"]
        torch.save(model, "./" +  "best.pt")
    else:
        print("Validation accuracy= "+ str(val_result["accuracy_score"])+ "===> No saving")
        continue
        
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Total runtime next 10 epoch : {elapsed_time} seconds")

## 2. Tuning mô hình sử dụng các kỹ thuật đã học. Report lại performance trên tập Val và Test. Nhận xét kết quả thu được

In summary, **lr_scheduler.StepLR** is suitable for a fixed schedule of learning rate reductions, while **torch.optim.lr_scheduler.ReduceLROnPlateau** is more adaptive and changes the learning rate based on the model's performance.

=> can try this scheduler for further tunning.


In [None]:
test_model = torch.load("/kaggle/working/best.pt")
test_model = test_model.to(device)

In [None]:
def test_result(model, test_loader, device):
    model.eval()  # Set the model to evaluation mode
    preds, labels = [], []

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)

            # Forward pass: compute predicted outputs by passing inputs to the model
            output = model(data)

            # Applying Softmax to the results
            probs = nn.Softmax(dim=1)(output)

            labels.extend(target.tolist())
            preds.extend(torch.argmax(probs, axis=1).tolist())

    return labels, preds

In [None]:
# Display confusion matrix and classification report for the test set
labels_test, preds_test = test_result(model, test_loader, device)
report_test = classification_report(labels_test, preds_test, digits=4, target_names=train_dataset.classes)

cm_test = confusion_matrix(labels_test, preds_test)
print(report_test)

In [None]:
# Plotting the training and validation losses
plt.plot(train_losses, label='Training Loss')
plt.plot(valid_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Display confusion matrix
disp_test = ConfusionMatrixDisplay(confusion_matrix=cm_test, display_labels=train_dataset.classes)
disp_test.plot()

In [None]:
# Evaluate the model on the test set
model.eval()
test_correct = 0
test_total = 0
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        test_total += labels.size(0)
        test_correct += (predicted == labels).sum().item()

test_accuracy = test_correct / test_total
print(f"Test Accuracy: {test_accuracy:.4f}")

In [13]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from sklearn.model_selection import train_test_split

# Thiết lập các tham số
batch_size = 32
learning_rate = 0.001
num_epochs = 10

# Đường dẫn đến tập dữ liệu
data_dir = "/kaggle/input/100-bird-species/"

# Áp dụng các biến đổi cho dữ liệu huấn luyện và kiểm thử
transform_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
])

transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
])

# Tạo DataLoader cho dữ liệu huấn luyện và kiểm thử 
# ImageFolder => label encoder already included
train_dataset = datasets.ImageFolder(root=data_dir + 'train', transform=transform_train)
test_dataset = datasets.ImageFolder(root=data_dir + 'test', transform=transform_test)
valid_dataset = datasets.ImageFolder(root=data_dir + 'valid', transform=transform_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Số lượng mẫu
len(train_dataset)

# Print all the classes
classes = train_dataset.classes
print("Len of train Classes:",len(classes))
print("Len of test Classes:",len(test_dataset.classes))

# Get the class-to-index mapping
class_to_idx = train_dataset.class_to_idx

# Print classes with their indices
for class_name, class_idx in class_to_idx.items():
    print(f"Class: {class_name}, Index: {class_idx}")
    
# In input và labels của batch đầu tiên
for batch_idx, (inputs, labels) in enumerate(train_loader):
    # Print the current batch
    print(f"Batch {batch_idx + 1} - Inputs: {inputs}, Labels: {labels}")

    # Print the size of the current batch (number of samples in the batch)
    print(f"Batch {batch_idx + 1} - Batch Size: {len(inputs)}")

    # Break after printing the first 5 batches
    if batch_idx == 0:
        break
        
from sklearn import metrics as skmetrics
import numpy
class Metrics:
    def __init__(self, metric_names):
        self.metric_names = metric_names
        # initialize a metric dictionary
        self.metric_dict = {metric_name: [0] for metric_name in self.metric_names}

    def step(self, labels, preds):
        for metric in self.metric_names:
            # get the metric function
            do_metric = getattr(
                skmetrics, metric, "The metric {} is not implemented".format(metric)
            )
            # check if metric require average method, if yes set to 'micro' or 'macro' or 'None'
            try:
                self.metric_dict[metric].append(
                    do_metric(labels, preds, average="macro")
                )
            except:
                self.metric_dict[metric].append(do_metric(labels, preds))

    def epoch(self):
        # calculate metrics for an entire epoch
        avg = [sum(metric) / (len(metric) - 1) for metric in self.metric_dict.values()]
        metric_as_dict = dict(zip(self.metric_names, avg))
        return metric_as_dict

    def last_step_metrics(self):
        # return metrics of last steps
        values = [self.metric_dict[metric][-1] for metric in self.metric_names]
        metric_as_dict = dict(zip(self.metric_names, values))
        return metric_as_dict
    
train_metrics = Metrics(["accuracy_score","f1_score"])
valid_metrics = Metrics(["accuracy_score","f1_score"])

import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from torch.optim import lr_scheduler

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1, dim=-1):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.dim = dim

    def forward(self, pred, target):
        target = F.one_hot(target, num_classes=pred.size(-1))
        target = target.float()
        target = (1 - self.smoothing) * target + self.smoothing / pred.size(-1)
        log_pred = F.log_softmax(pred, dim=self.dim)
        loss = nn.KLDivLoss(reduction='batchmean')(log_pred, target)
        return loss
        
# Chuyển mô hình và dữ liệu sang GPU nếu có sẵn
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print (device)

# EfficientNet-B4 architecture from the torchvision library and modifying its classifier for transfer learning. 
# freezing the pre-trained weights of the base model and replacing the classifier with custom classifier.

# Download the weights manually
state_dict = torch.hub.load_state_dict_from_url("https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth")

# Create the model
model = models.efficientnet_b4(pretrained=True).to(device)

# Load the weights
model.load_state_dict(state_dict)

for param in model.parameters():
    param.requires_grad = False
    # freeze tất cả trọng số của model gốc
classifier = nn.Sequential(
    nn.Linear(in_features=model.classifier[1].in_features, out_features=256,bias=True),
    nn.Linear(in_features=256, out_features=525,bias=True)
)
model.classifier  = classifier


# Định nghĩa hàm loss và optimizer
#criterion = nn.CrossEntropyLoss()
criterion = LabelSmoothingLoss(smoothing=0.12)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Apply learning rate scheduling
scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)


Len of train Classes: 525
Len of test Classes: 525
Class: ABBOTTS BABBLER, Index: 0
Class: ABBOTTS BOOBY, Index: 1
Class: ABYSSINIAN GROUND HORNBILL, Index: 2
Class: AFRICAN CROWNED CRANE, Index: 3
Class: AFRICAN EMERALD CUCKOO, Index: 4
Class: AFRICAN FIREFINCH, Index: 5
Class: AFRICAN OYSTER CATCHER, Index: 6
Class: AFRICAN PIED HORNBILL, Index: 7
Class: AFRICAN PYGMY GOOSE, Index: 8
Class: ALBATROSS, Index: 9
Class: ALBERTS TOWHEE, Index: 10
Class: ALEXANDRINE PARAKEET, Index: 11
Class: ALPINE CHOUGH, Index: 12
Class: ALTAMIRA YELLOWTHROAT, Index: 13
Class: AMERICAN AVOCET, Index: 14
Class: AMERICAN BITTERN, Index: 15
Class: AMERICAN COOT, Index: 16
Class: AMERICAN DIPPER, Index: 17
Class: AMERICAN FLAMINGO, Index: 18
Class: AMERICAN GOLDFINCH, Index: 19
Class: AMERICAN KESTREL, Index: 20
Class: AMERICAN PIPIT, Index: 21
Class: AMERICAN REDSTART, Index: 22
Class: AMERICAN ROBIN, Index: 23
Class: AMERICAN WIGEON, Index: 24
Class: AMETHYST WOODSTAR, Index: 25
Class: ANDEAN GOOSE, Inde

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms, datasets, models
from sklearn.model_selection import train_test_split
from sklearn import metrics as skmetrics
import torch_xla
import torch_xla.core.xla_model as xm
import torch_xla.distributed.xla_multiprocessing as xmp
from torch.optim import lr_scheduler
from tqdm import tqdm
import time

# ... (Assuming you have some code here for data loading and preprocessing)

# Connect to TPU
device = xm.xla_device()
print("Running on TPU")

# Define the model, optimizer, and scheduler inside the _MP_fn to ensure proper TPU initialization
def _MP_fn(rank):
    # Load the EfficientNet-B4 architecture
    # Download the weights manually
    state_dict = torch.hub.load_state_dict_from_url("https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth")

    # Create the model
    model = models.efficientnet_b4(pretrained=True).to(device)

    # Load the weights
    model.load_state_dict(state_dict)
    
    # Freeze pre-trained weights
    for param in model.parameters():
        param.requires_grad = False

    # Modify the classifier
    classifier = nn.Sequential(
        nn.Linear(in_features=model.classifier[1].in_features, out_features=256, bias=True),
        nn.Linear(in_features=256, out_features=525, bias=True)
    )
    model.classifier = classifier
    
    # Move the model to TPU
    model.to(device)
    
    # Define the optimizer and scheduler
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    scheduler = lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)

    # Training loop
    start_time = time.time()
    best_val_acc = 0
    print("Begin training process")
    
    for i in tqdm(range(0, num_epochs)):
        loss, val_loss, train_result, val_result = train_one_epoch(
            model,
            train_loader,
            valid_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            valid_metrics,
        )

        scheduler.step(val_loss)

        print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(i + 1, num_epochs, loss))
        print(train_result)
        print(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
        print(val_result)
        print("\n")

        # saving epoch with the best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            print("Validation accuracy= " + str(val_result["accuracy_score"]) + "===> Save best epoch")
            best_val_acc = val_result["accuracy_score"]
            xm.save(model.state_dict(), "best.pt")
        else:
            print("Validation accuracy= " + str(val_result["accuracy_score"]) + "===> No saving")
            continue

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total runtime first 10 epochs: {elapsed_time} seconds")

    for param in model.parameters():
        param.requires_grad = True

    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.999), eps=1e-08, weight_decay=0)

    start_time = time.time()
    best_val_acc = 0.8528
    print("Begin training process")
    
    for i in tqdm(range(0, num_epochs)):
        loss, val_loss, train_result, val_result = train_one_epoch(
            model,
            train_loader,
            valid_loader,
            device,
            optimizer,
            criterion,
            train_metrics,
            valid_metrics,
        )

        scheduler.step(val_loss)
        print("Epoch {} / {} \n Training loss: {} - Other training metrics: ".format(i + 1, num_epochs, loss))
        print(train_result)
        print(" \n Validation loss : {} - Other validation metrics:".format(val_loss))
        print(val_result)
        print("\n")

        # saving epoch with best validation accuracy
        if best_val_acc < float(val_result["accuracy_score"]):
            print("Validation accuracy= " + str(val_result["accuracy_score"]) + "===> Save best epoch")
            best_val_acc = val_result["accuracy_score"]
            xm.save(model.state_dict(), "best.pt")
        else:
            print("Validation accuracy= " + str(val_result["accuracy_score"]) + "===> No saving")
            continue

    end_time = time.time()
    elapsed_time = end_time - start_time
    print(f"Total runtime next 10 epochs: {elapsed_time} seconds")


Running on TPU


In [None]:
xmp.spawn(_MP_fn, args=(), nprocs=8, start_method='fork')

