# Dog breed classification using PyTorch

In [None]:
# import the tools we need
import os 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.preprocessing import LabelEncoder

In [None]:
# labels.csv supplies the `id` and `breed` columns used to build the training set below
train_df = pd.read_csv('../input/dog-breed-identification/labels.csv')
# sample_submission.csv is read for its `id` column, which lists the test images
test_df = pd.read_csv('../input/dog-breed-identification/sample_submission.csv')

# Prepare for training set and test set

In [None]:
# Training df

# Turn every image id into the full path of its .jpg file
train_dir = '../input/dog-breed-identification/train'
train_path = [os.path.join(train_dir, image_id + '.jpg')
              for image_id in train_df.id.tolist()]
train_df['path'] = train_path
train_df.drop(columns='id', inplace=True)

# Encode each breed name as an integer class label
train_df['breed_label'] = LabelEncoder().fit_transform(train_df['breed'])

# Keep a separate breed <-> breed_label table for decoding predictions later
match_id_df = train_df.drop(columns='path')

# The model only needs the path and the integer label
train_df.drop(columns='breed', inplace=True)

In [None]:
# Testing df
test_dir = '../input/dog-breed-identification/test'

# Keep only the id column; a copy is stored to label the predictions later
test_df = test_df[['id']]
answer_df = test_df.copy()

# Replace the ids with the full path of each test image
test_path = [os.path.join(test_dir, name + '.jpg')
             for name in test_df.id.tolist()]
test_df = pd.DataFrame({'path': test_path})

In [None]:
train_df.head(3)

In [None]:
test_df.head(3)

# Create custom dataset module

In [None]:
# import the tools we need
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision import models

In [None]:
# Custom Data class
class Dog_data(torch.utils.data.Dataset):
    """Map-style dataset that loads images listed in a DataFrame.

    The DataFrame's first column must hold the image path. Unless ``test`` is
    true, its second column must hold the label.

    Args:
        csv_file: DataFrame describing the samples (despite the name, this is
            an already-loaded DataFrame, not a file path).
        transform: Optional callable applied to the PIL image.
        test: When true, items are returned without a label.
    """

    def __init__(self, csv_file, transform=None, test=False):
        # This is a dataset, not a network layer: inheriting from nn.Module
        # without initialising it made any nn.Module transform fail on
        # assignment.
        super().__init__()
        self.transform = transform
        self.data = csv_file
        self.len = self.data.shape[0]
        self.test = test

    def __len__(self):
        """Return the number of rows (samples) in the DataFrame."""
        return self.len

    def __getitem__(self, index):
        """Return the transformed image, plus its label unless in test mode."""
        img_path = self.data.iloc[index, 0]

        # Close the file handle promptly, and force 3 channels so grayscale or
        # RGBA files still match a 3-channel normalisation.
        with Image.open(img_path) as raw_image:
            open_image = raw_image.convert('RGB')

        if self.transform is not None:
            open_image = self.transform(open_image)

        if self.test:
            return open_image
        return open_image, self.data.iloc[index, 1]

#### Split into training set and validation set

In [None]:
# Hold out 30% of the rows for validation
val_size = round(len(train_df) * 0.3)

# Randomise the row order before slicing
train_df = train_df.sample(frac=1)

# First val_size rows -> validation, everything after -> training
validation_df, training_df = train_df.iloc[:val_size], train_df.iloc[val_size:]

#### Create training set and validation set
- Since I will use the pretrained model, the image must be normalized in the same way.
- Size: 224 x 224
- Then normalized using mean = [0.485, 0.456, 0.406] and std = [0.229, 0.224, 0.225] 

In [None]:
# Per-channel statistics used to normalise the images
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]

# Resize to 256x256, crop the central 224x224, convert to a tensor, normalise
compose = transforms.Compose([
    transforms.Resize(size=(256, 256)),
    transforms.CenterCrop(size=224),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean, std=std),
])

# All three datasets share the same preprocessing; only the test set has no labels
train_set = Dog_data(csv_file=training_df, transform=compose)
val_set = Dog_data(csv_file=validation_df, transform=compose)
test_set = Dog_data(csv_file=test_df, transform=compose, test=True)

In [None]:
# Sanity check: show the first test image and the first training image.
# The tensors are channel-first, so move channels last for matplotlib.
first_test_image = test_set[0]
first_train_image = train_set[0][0]

plt.subplots()
plt.imshow(first_test_image.permute(1, 2, 0))
plt.subplots()
plt.imshow(first_train_image.permute(1, 2, 0))

In [None]:
# Now the dataset is ready, time to create the model and other stuff

# Prepare all the stuff
- GPU
- Data loader for training set ,validation set, test set
- Model - using ResNet50
- Optimizer - SGD with momentum
- Cost function - CrossEntropy
- Training function

In [None]:
# GPU
print(torch.cuda.is_available())
# Use the first CUDA device when one is present; otherwise fall back to the CPU
# so the notebook still runs (slowly) on machines without a GPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

#### Model

In [None]:
# Start from a ResNet50 with pretrained weights
model = models.resnet50(pretrained=True)

# Freeze every pretrained weight so only the new head is trained
for param in model.parameters():
    param.requires_grad_(False)

# Swap the classifier head for a fresh 2048 -> 120 linear layer
# (new layers are trainable by default)
model.fc = nn.Linear(2048, 120, bias=True)

# Move the whole network to the selected device
model = model.to(device)

#### Dataset loader

In [None]:
# Reshuffle the training samples every epoch so SGD does not see the exact same
# mini-batches each time; evaluation loaders keep a fixed order.
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_set, batch_size=64)
# The test order must stay fixed so predictions line up with the submission ids
test_loader = DataLoader(dataset=test_set,batch_size=1)

#### Optimizer and Cost function

In [None]:
learning_rate = 0.0003

# Multi-class classification loss
cost_function = nn.CrossEntropyLoss()

# Hand the optimizer only the parameters that are still trainable
params = list(filter(lambda p: p.requires_grad, model.parameters()))
optimizer = torch.optim.SGD(params=params, lr=learning_rate, momentum=0.9)

#### Training function

In [None]:
# Number of samples in the validation set
n_val = len(val_set)

def train_model(model, train, validation, optimizer, cost_function, epochs=10, patience=5):
    """Train ``model`` and evaluate it on the validation data after every epoch.

    Args:
        model: Network to train; batches are moved to the device of its
            parameters.
        train: Iterable of ``(inputs, labels)`` training batches.
        validation: Iterable of ``(inputs, labels)`` validation batches.
        optimizer: Optimizer that updates the trainable parameters.
        cost_function: Loss taking ``(model_output, labels)``.
        epochs: Maximum number of epochs to run.
        patience: Number of previous epochs the latest validation accuracy is
            compared against when deciding to stop early.

    Returns:
        Tuple ``(train_cost_list, val_cost_list, accuracy_list)`` with one
        entry per completed epoch: mean training loss, mean validation loss
        and validation accuracy.
    """
    min_epochs = 10     # early stopping is only considered from this epoch on
    min_growth = 0.001  # accuracy gain that counts as real progress

    # Follow the model's own device instead of depending on a global variable.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    train_cost_list = []  # Mean training loss per epoch
    val_cost_list = []    # Mean validation loss per epoch
    accuracy_list = []    # Validation accuracy per epoch

    for epoch in range(epochs):
        print('Start training....{}/{} epochs'.format(epoch+1, epochs))

        # ---- training pass ----
        model.train()  # Activate training mode once per epoch
        train_cost_sublist = []
        for x, y in train:
            x, y = x.to(run_device), y.to(run_device)
            optimizer.zero_grad()
            loss = cost_function(model(x), y)
            train_cost_sublist.append(loss.item())
            loss.backward()
            optimizer.step()
        train_cost = np.mean(train_cost_sublist)
        train_cost_list.append(train_cost)

        # ---- validation pass ----
        model.eval()  # Activate evaluation mode
        correct = 0
        n_seen = 0
        val_cost_sublist = []
        # No gradients are needed for evaluation; skipping them saves memory.
        with torch.no_grad():
            for x_val, y_val in validation:
                x_val, y_val = x_val.to(run_device), y_val.to(run_device)
                z = model(x_val)
                val_cost_sublist.append(cost_function(z, y_val).item())
                yhat = z.argmax(dim=1)
                correct += (yhat == y_val).sum().item()
                n_seen += y_val.size(0)
        val_cost = np.mean(val_cost_sublist)
        val_cost_list.append(val_cost)

        # Accuracy over the samples actually evaluated in this pass
        accuracy = correct / n_seen if n_seen else 0.0
        accuracy_list.append(accuracy)

        print('Training loss: {:.3f}, Validation loss: {:.3f}, Accuracy: {:.3f}'.format(train_cost,
                                                                                        val_cost,
                                                                                        accuracy))

        # Early stopping: stop when the latest accuracy beats none of the
        # previous `patience` epochs by more than `min_growth`. The history
        # length check avoids indexing past the start of the list when
        # `patience` is large.
        if epoch + 1 >= min_epochs and len(accuracy_list) > patience:
            previous = accuracy_list[-1 - patience:-1]
            improved = any(accuracy - past > min_growth for past in previous)
            if not improved:
                print('Early stopped.')
                break

    print('Finish training.')
    return (train_cost_list, val_cost_list, accuracy_list)

# Start training

In [None]:
# Train for at most 50 epochs with patience=3 for early stopping; keep the
# per-epoch training loss, validation loss and validation accuracy histories
train_cost,val_cost, acc = train_model(model,train_loader,val_loader,optimizer, cost_function, epochs=50,patience=3)

In [None]:
# plot the cost
def plot_cost(train, validation):
    """Plot the training and validation loss curves on one figure.

    Args:
        train: Sequence of training-loss values, one per epoch.
        validation: Sequence of validation-loss values, one per epoch.
    """
    plt.plot(train, label='Training Loss',color='blue')
    plt.plot(validation, label='validation Loss',color='orange')
    plt.title("Training Loss vs Validation Loss")
    # Each point is one epoch's mean loss, not a single optimizer step
    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend()
    plt.grid(True)
    # The original referenced plt.show without calling it, which does nothing
    plt.show()
    
plot_cost(train_cost,val_cost)

# Prediction on test data

In [None]:
answer_id = answer_df.id.tolist()
answer_prediction = []

# Switch to evaluation mode once; it does not need repeating per batch
model.eval()

# Inference only: disabling autograd avoids building a graph for every image
with torch.no_grad():
    for image in test_loader:
        y_pred = model(image.to(device))
        # Keep one row of raw class scores per image (no softmax applied here).
        # extend() stores every row of the batch, so this stays correct if the
        # loader's batch size is ever raised above 1.
        answer_prediction.extend(y_pred.cpu().tolist())

In [None]:
# Collect the predictions in a dataframe indexed by the test image id
answer_prediction = (
    pd.DataFrame(answer_prediction)
    .assign(id=answer_id)
    .set_index('id')
)

In [None]:
# Build the breed_label -> breed name lookup: one row per label, keyed by label
match_id_df = (
    match_id_df
    .drop_duplicates(subset='breed_label')
    .set_index('breed_label')
)
id_dict = match_id_df['breed'].to_dict()

In [None]:
# Relabel the prediction columns: integer class index -> breed name
breed_columns = answer_prediction.columns.map(id_dict)
prediction = answer_prediction.copy()
prediction.columns = breed_columns

In [None]:
# Save the prediction
prediction.to_csv('answer.csv')

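# Fix the problem
- The predictions saved above are the raw outputs of the model's final linear layer, not probabilities.
- Apply softmax() to each row of the output so that it becomes a probability distribution over the breeds.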

In [None]:
from torch.nn.functional import softmax

In [None]:
# Reload the saved raw predictions and work on a separate frame indexed by id
answer_csv = pd.read_csv('./answer.csv')
final_prediction = answer_csv.set_index('id')

In [None]:
# Convert every row of raw scores into probabilities with one batched call
# instead of a Python loop over rows.
logits = torch.tensor(final_prediction.to_numpy(dtype='float32'))
# dim=1 normalises across the breed columns so each row sums to 1; passing it
# explicitly avoids the deprecated implicit-dimension behaviour of softmax.
probabilities = softmax(logits, dim=1).numpy().astype('float64')
final_prediction = pd.DataFrame(probabilities,
                                index=final_prediction.index,
                                columns=final_prediction.columns)

In [None]:
final_prediction.to_csv('answer02.csv')

Thank you