In [1]:
import torch
import torchvision
import pandas as pd

from __future__ import print_function, division
import os
from skimage import io
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
from PIL import Image

In [3]:
class PicturesDataset(Dataset):
    """Pictures dataset.

    Each CSV row describes one picture: column 0 holds the image file name
    without its ``.jpg`` extension and column 1 holds a whitespace-separated
    string of integer tag indices.
    """

    # Width of the multi-hot tag vector returned with every sample.
    NUM_TAGS = 17
    # Target (height, width) used by the built-in resize step.
    IMAGE_SIZE = (32, 32)

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable or bool, optional): If callable, it is applied
                to the loaded PIL image. Any other truthy value (e.g. ``True``)
                selects the built-in ``resize`` step. A falsy value returns the
                PIL image untouched.
        """
        self.pic_frame = pd.read_csv(csv_file)
        self.root_dir = root_dir
        self.transform = transform
        # Built on first use by resize() so the pipeline is created only once.
        self._resizer = None

    def resize(self, image):
        """Resize a PIL image to ``IMAGE_SIZE`` and convert it with ``ToTensor``."""
        resizer = getattr(self, '_resizer', None)
        if resizer is None:
            resizer = transforms.Compose([
                transforms.Resize(size=self.IMAGE_SIZE),
                transforms.ToTensor()])
            self._resizer = resizer
        return resizer(image)

    def __len__(self):
        """
        Return: how many pictures are in the dataset.
        """
        return len(self.pic_frame)

    def __getitem__(self, idx):
        """
        Load one sample.

        Return: an ``(image, tags)`` pair, where ``tags`` is the multi-hot
        encoding of the row's tag indices produced by ``multi_hot``.
        """
        img_name = os.path.join(self.root_dir,
                                self.pic_frame.iloc[idx, 0] + '.jpg')
        image = Image.open(img_name)

        raw_tags = self.pic_frame.iloc[idx, 1]
        # pandas reads an empty tag cell as NaN; treat that as "no tags"
        # instead of failing on .split().
        if pd.isna(raw_tags):
            tag_ids = []
        else:
            tag_ids = [int(t) for t in str(raw_tags).split()]
        tags = multi_hot(tag_ids, self.NUM_TAGS)

        if callable(self.transform):
            image = self.transform(image)
        elif self.transform:
            # Legacy flag usage (transform=True): apply the built-in resize.
            image = self.resize(image)

        return image, tags

In [4]:
def multi_hot(labels, n):
    """
    Multi-hot encode a set of label indices into a single row vector.

    @labels : a 1D sequence (list, numpy array or tensor) of integer label
              indices, each in the range [0, n). Duplicates are encoded once.
    @n: number of labels available
    Output: a 1 by n float32 tensor holding 1.0 at every index listed in
            `labels` and 0.0 everywhere else.
    Raises: ValueError if any index lies outside [0, n).
    """
    idx = torch.as_tensor(labels, dtype=torch.long).reshape(-1)

    # Check the range explicitly: plain tensor indexing would let negative
    # indices wrap around to the end of the row.
    if idx.numel() > 0 and (int(idx.min()) < 0 or int(idx.max()) >= n):
        raise ValueError(
            "label indices must be in [0, {}), got {}".format(n, idx.tolist()))

    out = torch.zeros(1, n, dtype=torch.float32)
    # Assignment (not accumulation) keeps a repeated label at 1.0.
    out[0, idx] = 1.0
    return out

In [None]:
def precision(out, labels):
    """
    A function to calculate top1 and top3 precision of predictions.

    @out: the output of the final layer of the network. n by c tensor, where
          n is the batch size and c the number of classes. A 1D tensor is
          treated as a batch of one.
    @labels: the original multi-hot labels, with the same number of elements
             as `out`; any nonzero entry counts as a true label.
    Output: (top1_precision, top3_precision) as Python floats. Precision@k is
            the fraction of the k highest-scoring classes per row that are
            true labels. If there are fewer than k classes, all classes are
            used. An empty batch yields 0.0 for both values.
    """
    # Detach and align devices so the metric works on grad-tracking or CUDA
    # tensors without touching the autograd graph.
    scores = out.detach()
    if scores.dim() == 1:
        scores = scores.unsqueeze(0)
    truth = labels.detach().to(scores.device).reshape(scores.shape)

    def precision_at_k(k):
        if scores.numel() == 0:
            return 0.0
        k = min(k, scores.shape[1])
        top_ids = torch.topk(scores, k, dim=1)[1]
        # Look up the label at each predicted class index; nonzero means a hit.
        hits = truth.gather(1, top_ids) != 0
        return hits.sum().item() / hits.numel()

    return precision_at_k(1), precision_at_k(3)

In [5]:
# Full training set: the annotation CSV plus the directory holding the JPEGs.
# A truthy `transform` switches on the dataset's resize step.
complete_dataset = PicturesDataset(
    csv_file='kaggleamazon/train.csv',
    root_dir='kaggleamazon/train-jpg/',
    transform=True,
)

In [6]:
# Serve the dataset in shuffled mini-batches of 250 samples.
train_loader = DataLoader(complete_dataset, batch_size=250, shuffle=True)

In [7]:
import torch.nn as nn

In [8]:
# Convolutional neural network (two convolutional layers)
class ConvNet(nn.Module):
    """
    Small CNN for multi-label classification: two conv/pool/ReLU stages
    followed by two fully connected layers with a sigmoid output per class.

    Expects 3-channel 32x32 inputs. Spatial size goes 32 -> 30 (3x3 conv)
    -> 15 (pool) -> 13 (3x3 conv) -> 6 (pool), so the flattened feature
    vector has 64 * 6 * 6 = 2304 entries.
    """

    def __init__(self, num_classes):
        """
        Args:
            num_classes (int): Number of output units, one sigmoid score each.
        """
        super(ConvNet, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU())
        self.conv2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=3),
            nn.Dropout2d(),
            nn.MaxPool2d(kernel_size=2),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Linear(2304, 256),
            nn.Dropout(),
            nn.ReLU())
        # Size the output layer from the argument instead of a fixed 17.
        self.fc2 = nn.Sequential(
            nn.Linear(256, num_classes),
            nn.Sigmoid())

    def forward(self, x):
        """Map a (batch, 3, 32, 32) tensor to (batch, num_classes) scores in [0, 1]."""
        out = self.conv1(x)
        out = self.conv2(out)
        # Keep the batch dimension fixed while flattening, so a wrong input
        # size fails in fc1 instead of being folded into the batch.
        out = out.view(out.size(0), -1)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [9]:
# Training hyperparameters.
num_epochs = 1
num_classes = 17
learning_rate = 0.01

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# The training loop moves every batch to `device`, so the model's weights
# must live there too. Do this before the optimizer is created.
model = ConvNet(num_classes).to(device)

In [12]:
# SGD with momentum updates the weights; binary cross-entropy scores the
# model's per-class sigmoid outputs against the multi-hot targets.
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=learning_rate,
    momentum=0.9,
)
criterion = nn.BCELoss()

In [151]:
# Train the model
total_step = len(train_loader)
model.train()  # keep the dropout layers active while training
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        images = images[:, :3, :, :]  # drop the 4th channel
        # multi_hot yields a (1, n) row per sample, so a collated batch is
        # (batch, 1, n). Squeeze only dim 1 so a batch of one keeps its
        # batch dimension.
        labels = labels.squeeze(1)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 10 == 0:
            # Only compute the metric when it is reported. Pass detached CPU
            # copies so it never touches the autograd graph or the GPU.
            precision1, precision3 = precision(outputs.detach().cpu(), labels.cpu())
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Precision@1: {:.4f}, Precision@3: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item(), precision1, precision3))

Epoch [1/1], Step [10/120], Loss: 0.6151, Precision@1: 0.8400, Precision@3: 0.6053
Epoch [1/1], Step [20/120], Loss: 0.3729, Precision@1: 0.9200, Precision@3: 0.6267
Epoch [1/1], Step [30/120], Loss: 0.2969, Precision@1: 0.9120, Precision@3: 0.6227
Epoch [1/1], Step [40/120], Loss: 0.2965, Precision@1: 0.9520, Precision@3: 0.6413
Epoch [1/1], Step [50/120], Loss: 0.2701, Precision@1: 0.9120, Precision@3: 0.6227
Epoch [1/1], Step [60/120], Loss: 0.2628, Precision@1: 0.9080, Precision@3: 0.6293
Epoch [1/1], Step [70/120], Loss: 0.2582, Precision@1: 0.9400, Precision@3: 0.6613
Epoch [1/1], Step [80/120], Loss: 0.2544, Precision@1: 0.9200, Precision@3: 0.6333
Epoch [1/1], Step [90/120], Loss: 0.2632, Precision@1: 0.9160, Precision@3: 0.6400
Epoch [1/1], Step [100/120], Loss: 0.2816, Precision@1: 0.8960, Precision@3: 0.6213
Epoch [1/1], Step [110/120], Loss: 0.2559, Precision@1: 0.9320, Precision@3: 0.6467
Epoch [1/1], Step [120/120], Loss: 0.2601, Precision@1: 0.9160, Precision@3: 0.6333
