<a href="https://colab.research.google.com/github/victor7246/fashion-iq/blob/master/baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Colab helper; later cells call files.upload() to bring local files into the runtime.
from google.colab import files

# Install the Kaggle command-line client (quiet mode) used below to download the dataset.
!pip install -q kaggle

In [2]:
# Upload the Kaggle API token (kaggle.json) from the local machine into the working directory.
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!mkdir /root/.kaggle
!cp kaggle.json /root/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

In [4]:
# Download the Fashion-IQ archive (fashion-iq.zip) from Kaggle into the working directory.
!kaggle datasets download -d datamafia7/fashion-iq

Downloading fashion-iq.zip to /content
 98% 633M/644M [00:07<00:00, 97.4MB/s]
100% 644M/644M [00:07<00:00, 92.7MB/s]


In [5]:
!unzip ./fashion-iq.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: resized_images/toptee/B00C10SE6O.jpg  
  inflating: resized_images/toptee/B00C1180OE.jpg  
  inflating: resized_images/toptee/B00C1186SO.jpg  
  inflating: resized_images/toptee/B00C1187T2.jpg  
  inflating: resized_images/toptee/B00C11SHNS.jpg  
  inflating: resized_images/toptee/B00C11UJZM.jpg  
  inflating: resized_images/toptee/B00C11UL28.jpg  
  inflating: resized_images/toptee/B00C11UPY2.jpg  
  inflating: resized_images/toptee/B00C11UTWK.jpg  
  inflating: resized_images/toptee/B00C11UWZE.jpg  
  inflating: resized_images/toptee/B00C11UYUM.jpg  
  inflating: resized_images/toptee/B00C11V9JM.jpg  
  inflating: resized_images/toptee/B00C11VBB8.jpg  
  inflating: resized_images/toptee/B00C11VGGI.jpg  
  inflating: resized_images/toptee/B00C11VOII.jpg  
  inflating: resized_images/toptee/B00C11VQMC.jpg  
  inflating: resized_images/toptee/B00C11VSAW.jpg  
  inflating: resized_images/toptee/B00C124H2C.jpg  

In [6]:
# Upload the project's helper scripts (data_loader.py, build_vocab.py, utils.py, ...)
# so that the import cell below can resolve them.
uploaded = files.upload()

Saving build_vocab.py to build_vocab.py
Saving data_loader.py to data_loader.py
Saving eval.py to eval.py
Saving image_downloader.py to image_downloader.py
Saving models.py to models.py
Saving resize_images.py to resize_images.py
Saving train.py to train.py
Saving utils.py to utils.py


In [0]:
import argparse
import time
import json
import os
import torch
import torch.nn as nn
from torchvision import transforms
from data_loader import get_loader
from build_vocab import Vocabulary
#from models import DummyImageEncoder, DummyCaptionEncoder
from utils import create_exp_dir, Ranker

In [15]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

# Path templates, relative to the working directory, filled in with str.format:
#   IMAGE_ROOT / DICT -> (category)
#   CAPT / SPLIT      -> (category, split)
IMAGE_ROOT = './resized_images/{}/'
CAPT = './captions/cap.{}.{}.json'
DICT = './captions/dict.{}.json'
SPLIT = './data_splits/split.{}.{}.json'

cuda


In [0]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence

class DummyImageEncoder(nn.Module):
    """Frozen pretrained ResNet-50 followed by a trainable BatchNorm1d + Linear head.

    Only ``bn`` and ``linear`` are exposed as trainable parameters; the backbone
    is always run under ``torch.no_grad()``. The backbone can be detached with
    ``delete_resnet`` (so that ``state_dict`` only holds the head) and
    re-attached with ``load_resnet``.
    """

    def __init__(self, embed_size):
        """Load the pretrained ResNet-50 and replace top fc layer."""
        super(DummyImageEncoder, self).__init__()
        self.resnet, in_features = self._build_backbone()
        self.resnet_in_features = in_features
        # The backbone is frozen, so keep it in inference mode from the start.
        self.resnet.eval()
        self.linear = nn.Linear(in_features, embed_size)
        self.bn = nn.BatchNorm1d(in_features, momentum=0.01)

    @staticmethod
    def _build_backbone():
        """Return (ResNet-50 without its final fc layer, width of its pooled features)."""
        resnet = models.resnet50(pretrained=True)
        modules = list(resnet.children())[:-1]  # delete the last fc layer.
        return nn.Sequential(*modules), resnet.fc.in_features

    def train(self, mode=True):
        """Switch the head between train/eval mode while keeping the backbone in eval mode.

        The backbone is never optimised and is left out of saved checkpoints
        (see ``delete_resnet``). If it followed ``train()``, its internal
        BatchNorm layers would keep updating their running statistics during
        training, and that drift would be lost whenever a checkpoint is
        reloaded next to a freshly built backbone.
        """
        super(DummyImageEncoder, self).train(mode)
        if self.resnet is not None:
            self.resnet.eval()
        return self

    def get_trainable_parameters(self):
        """Return the parameters of the head (``bn`` and ``linear``) as a list."""
        return list(self.bn.parameters()) + list(self.linear.parameters())

    def load_resnet(self, resnet=None):
        """Attach a backbone: the given module, or a fresh pretrained ResNet-50 if None."""
        if resnet is None:
            self.resnet, self.resnet_in_features = self._build_backbone()
        else:
            self.resnet = resnet
        # Whatever was attached, the backbone is used as a frozen feature extractor.
        self.resnet.eval()
        return

    def delete_resnet(self):
        """Detach and return the backbone, leaving ``self.resnet`` set to None."""
        resnet = self.resnet
        self.resnet = None
        return resnet

    def forward(self, image):
        """Embed a batch of images; returns a (batch, embed_size) tensor."""
        # No gradients are ever needed through the frozen backbone.
        with torch.no_grad():
            img_ft = self.resnet(image)
        flat_ft = img_ft.reshape(img_ft.size(0), -1)

        # BatchNorm1d cannot estimate batch statistics from fewer than two samples,
        # so the normalisation is skipped only in that training-time corner case.
        # In eval mode it uses its running statistics and works for any batch size,
        # which keeps single-sample batches on the same scale as all the others.
        if self.training and flat_ft.size(0) <= 1:
            return self.linear(flat_ft)
        return self.linear(self.bn(flat_ft))


class DummyCaptionEncoder(nn.Module):
    """Embedding -> GRU -> bias-free linear projection of the last valid GRU output."""

    def __init__(self, vocab_size, vocab_embed_size, embed_size):
        super(DummyCaptionEncoder, self).__init__()
        self.out_linear = nn.Linear(embed_size, embed_size, bias=False)
        self.rnn = nn.GRU(vocab_embed_size, embed_size)
        self.embed = nn.Embedding(vocab_size, vocab_embed_size)

    def forward(self, input, lengths):
        """Encode padded token ids.

        Args:
            input: batch-first tensor of token ids (it is packed with ``batch_first=True``).
            lengths: number of valid tokens per row of ``input``.

        Returns:
            One row per caption, in the same order as ``input``.
        """
        embedded = self.embed(input)
        lengths = torch.LongTensor(lengths)

        # Sort by decreasing length before packing, and remember how to undo the sort.
        order = torch.sort(lengths, descending=True)[1]
        inverse = order.clone()
        inverse[order] = torch.arange(order.size(0))

        packed = pack_padded_sequence(embedded[order], lengths[order], batch_first=True)
        packed_out, _ = self.rnn(packed)
        # pad_packed_sequence defaults to time-major output: (time, batch, hidden).
        padded, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_out)

        # Output at the last valid time step of every (length-sorted) sequence.
        last_step = padded[out_lengths - 1, torch.arange(out_lengths.size(0))]
        # Back to the caller's ordering, then project.
        return self.out_linear(last_step[inverse])

    def get_trainable_parameters(self):
        """Return every parameter of the module as a list."""
        return list(self.parameters())

In [0]:
# Triplet margin loss (margin 1) averaged over the batch; used for both training and validation.
# 'mean' is the supported spelling of the deprecated 'elementwise_mean' reduction.
triplet_avg = nn.TripletMarginLoss(reduction='mean', margin=1)


def eval_batch(data_loader, image_encoder, caption_encoder, ranker):
    """Evaluate the encoders on every batch of ``data_loader``.

    The query for each sample is ``candidate embedding + caption embedding``.
    ``ranker`` first refreshes its gallery embeddings with ``image_encoder`` and
    is then asked to rank each query's target.

    Returns:
        dict with
        ``'score'``: ``1 - mean(rank) / ranker.data_emb.size(0)``;
        ``'loss'``: mean over batches of the triplet loss, where the negative of
        each sample is the target of the sample at the mirrored batch position.
    """
    ranker.update_emb(image_encoder)

    batch_ranks = []
    batch_losses = []
    for target_images, candidate_images, captions, lengths, meta_info in data_loader:
        with torch.no_grad():
            target_images = target_images.to(device)
            target_ft = image_encoder.forward(target_images)
            candidate_images = candidate_images.to(device)
            candidate_ft = image_encoder.forward(candidate_images)
            captions = captions.to(device)
            caption_ft = caption_encoder(captions, lengths)

            # Rank each query against the gallery held by the ranker.
            target_asins = [meta_info[k]['target'] for k in range(len(meta_info))]
            query_ft = candidate_ft + caption_ft
            batch_ranks.append(ranker.compute_rank(query_ft, target_asins))

            # Negatives: the batch's targets in reverse order.
            batch_len = target_images.size(0)
            mirrored = torch.arange(batch_len - 1, -1, -1)
            negative_ft = target_ft[mirrored]
            # The anchor is recomputed so the loss never depends on what
            # compute_rank may have done to the tensor it was handed.
            anchor_ft = candidate_ft + caption_ft
            batch_losses.append(triplet_avg(anchor=anchor_ft,
                                            positive=target_ft, negative=negative_ft))

    all_ranks = torch.cat(batch_ranks, dim=0)
    return {
        'score': 1 - all_ranks.mean().item() / ranker.data_emb.size(0),
        'loss': torch.stack(batch_losses, dim=0).mean().item(),
    }

In [0]:
def train(args):
    """Train the image and caption encoders on one category, keeping the best checkpoint.

    Args:
        args: namespace providing ``data_set``, ``crop_size``, ``batch_size``,
            ``num_workers``, ``save``, ``embed_size``, ``learning_rate``,
            ``epochs``, ``log_step`` and ``patient``.

    Side effects:
        Creates ``<save>/<data_set>`` via ``create_exp_dir``, appends every log
        line to ``log.txt`` in that folder, and writes ``image-<embed_size>.th``
        and ``cap-<embed_size>.th`` there whenever the validation score improves.
    """
    # Image preprocessing, normalization for the pretrained resnet
    normalize = transforms.Normalize((0.485, 0.456, 0.406),
                                     (0.229, 0.224, 0.225))
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize])

    transform_dev = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        normalize])

    vocab = Vocabulary()
    vocab.load(DICT.format(args.data_set))

    # Build data loader
    image_root = IMAGE_ROOT.format(args.data_set)
    data_loader = get_loader(image_root,
                             CAPT.format(args.data_set, 'train'),
                             vocab, transform,
                             args.batch_size, shuffle=True, return_target=True, num_workers=args.num_workers)

    data_loader_dev = get_loader(image_root,
                                 CAPT.format(args.data_set, 'val'),
                                 vocab, transform_dev,
                                 args.batch_size, shuffle=False, return_target=True, num_workers=args.num_workers)

    ranker = Ranker(root=image_root, image_split_file=SPLIT.format(args.data_set, 'val'),
                    transform=transform_dev, num_workers=args.num_workers)

    save_folder = '{}/{}'.format(args.save, args.data_set)
    create_exp_dir(save_folder, scripts_to_save=None)

    def logging(s, print_=True, log_=True):
        """Print ``s`` and/or append it as one line to log.txt in the save folder."""
        if print_:
            print(s)
        if log_:
            with open(os.path.join(save_folder, 'log.txt'), 'a+') as f_log:
                f_log.write(s + '\n')

    logging(str(args))
    # Build the dummy models
    image_encoder = DummyImageEncoder(args.embed_size).to(device)
    caption_encoder = DummyCaptionEncoder(vocab_size=len(vocab), vocab_embed_size=args.embed_size * 2,
                                          embed_size=args.embed_size).to(device)

    image_encoder.train()
    caption_encoder.train()
    params = image_encoder.get_trainable_parameters() + caption_encoder.get_trainable_parameters()

    current_lr = args.learning_rate
    optimizer = torch.optim.Adam(params, lr=current_lr)

    cur_patient = 0
    best_score = float('-inf')
    total_step = len(data_loader)
    for epoch in range(args.epochs):

        for i, (target_images, candidate_images, captions, lengths, meta_info) in enumerate(data_loader):

            target_images = target_images.to(device)
            target_ft = image_encoder.forward(target_images)

            candidate_images = candidate_images.to(device)
            candidate_ft = image_encoder.forward(candidate_images)

            captions = captions.to(device)
            caption_ft = caption_encoder(captions, lengths)

            # Negative examples: the batch's own targets in reverse order.
            # NOTE(review): this is deterministic, not random, and for an odd batch
            # size the middle sample ends up with its own target as negative.
            m = target_images.size(0)
            random_index = torch.arange(m - 1, -1, -1)
            negative_ft = target_ft[random_index]

            loss = triplet_avg(anchor=(candidate_ft + caption_ft),
                               positive=target_ft, negative=negative_ft)

            # Every parameter that receives a gradient is owned by the optimizer.
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if i % args.log_step == 0:
                logging(
                    '| epoch {:3d} | step {:6d}/{:6d} | lr {:06.6f} | train loss {:8.3f}'.format(epoch, i, total_step,
                                                                                                 current_lr,
                                                                                                 loss.item()))

        # End of epoch: validate in eval mode, then switch back to training mode.
        image_encoder.eval()
        caption_encoder.eval()
        logging('-' * 77)
        metrics = eval_batch(data_loader_dev, image_encoder, caption_encoder, ranker)
        logging('| eval loss: {:8.3f} | score {:8.5f} / {:8.5f} '.format(
            metrics['loss'], metrics['score'], best_score))
        logging('-' * 77)

        image_encoder.train()
        caption_encoder.train()

        dev_score = metrics['score']
        if dev_score > best_score:
            best_score = dev_score
            # save best model
            # The backbone is detached so that only the trainable head is stored;
            # try/finally guarantees it is re-attached even if saving fails.
            resnet = image_encoder.delete_resnet()
            try:
                torch.save(image_encoder.state_dict(), os.path.join(
                    save_folder,
                    'image-{}.th'.format(args.embed_size)))
            finally:
                image_encoder.load_resnet(resnet)

            torch.save(caption_encoder.state_dict(), os.path.join(
                save_folder,
                'cap-{}.th'.format(args.embed_size)))

            cur_patient = 0
        else:
            cur_patient += 1
            if cur_patient >= args.patient:
                # Patience exhausted: halve the learning rate. cur_patient is not
                # reset here, so every further non-improving epoch halves it again.
                current_lr /= 2
                for param_group in optimizer.param_groups:
                    param_group['lr'] = current_lr
                # Stop once the rate has decayed below 1/1000 of its initial value.
                if current_lr < args.learning_rate * 1e-3:
                    break
    logging('best_dev_score: {}'.format(best_score))

In [0]:
def main(dataset='dress',epochs=20,embed_size=512):
    """Assemble the training options and run ``train``.

    The keyword arguments only provide the defaults of ``--data_set``,
    ``--epochs`` and ``--embed_size``. Options are parsed with
    ``parse_known_args`` so that unrecognised command-line arguments of the
    hosting process are ignored instead of aborting.
    """
    option_table = [
        ('--save', dict(type=str, default='./',
                        help='path for saving trained models')),
        ('--crop_size', dict(type=int, default=224,
                             help='size for randomly cropping images')),
        ('--data_set', dict(type=str, default=dataset)),
        ('--log_step', dict(type=int, default=50,
                            help='step size for printing log info')),
        ('--patient', dict(type=int, default=3,
                           help='patient for reducing learning rate')),
        # Model parameters
        ('--embed_size', dict(type=int, default=embed_size,
                              help='dimension of word embedding vectors')),
        # Learning parameters
        ('--epochs', dict(type=int, default=epochs)),
        ('--batch_size', dict(type=int, default=8)),
        ('--num_workers', dict(type=int, default=1)),
        ('--learning_rate', dict(type=float, default=0.0005)),
    ]

    parser = argparse.ArgumentParser()
    for flag, spec in option_table:
        parser.add_argument(flag, **spec)

    parsed = parser.parse_known_args()
    args = parsed[0]

    train(args)

In [17]:
# Fetch NLTK's 'punkt' tokenizer data before training.
# NOTE(review): presumably required by the caption tokenisation in the uploaded
# helper scripts (data_loader.py / build_vocab.py) — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [22]:
# Train on the 'dress' category for up to 30 epochs; the log and best checkpoints go to .//dress.
main('dress',epochs=30)

Experiment dir : .//dress
Namespace(batch_size=8, crop_size=224, data_set='dress', embed_size=512, epochs=30, learning_rate=0.0005, log_step=50, num_workers=1, patient=3, save='./')




| epoch   0 | step      0/   749 | lr 0.000500 | train loss    0.875
| epoch   0 | step     50/   749 | lr 0.000500 | train loss    1.285
| epoch   0 | step    100/   749 | lr 0.000500 | train loss    1.862
| epoch   0 | step    150/   749 | lr 0.000500 | train loss    0.149
| epoch   0 | step    200/   749 | lr 0.000500 | train loss    1.245
| epoch   0 | step    250/   749 | lr 0.000500 | train loss    0.938
| epoch   0 | step    300/   749 | lr 0.000500 | train loss    0.936
| epoch   0 | step    350/   749 | lr 0.000500 | train loss    1.400
| epoch   0 | step    400/   749 | lr 0.000500 | train loss    2.419
| epoch   0 | step    450/   749 | lr 0.000500 | train loss    2.082
| epoch   0 | step    500/   749 | lr 0.000500 | train loss    2.799
| epoch   0 | step    550/   749 | lr 0.000500 | train loss    0.062
| epoch   0 | step    600/   749 | lr 0.000500 | train loss    1.377
| epoch   0 | step    650/   749 | lr 0.000500 | train loss    0.000
| epoch   0 | step    700/   749 |

In [23]:
# Train on the 'shirt' category for up to 30 epochs; the log and best checkpoints go to .//shirt.
main('shirt',epochs=30)

Experiment dir : .//shirt
Namespace(batch_size=8, crop_size=224, data_set='shirt', embed_size=512, epochs=30, learning_rate=0.0005, log_step=50, num_workers=1, patient=3, save='./')




| epoch   0 | step      0/   749 | lr 0.000500 | train loss    0.218
| epoch   0 | step     50/   749 | lr 0.000500 | train loss    1.610
| epoch   0 | step    100/   749 | lr 0.000500 | train loss    2.642
| epoch   0 | step    150/   749 | lr 0.000500 | train loss    0.110
| epoch   0 | step    200/   749 | lr 0.000500 | train loss    0.013
| epoch   0 | step    250/   749 | lr 0.000500 | train loss    1.913
| epoch   0 | step    300/   749 | lr 0.000500 | train loss    3.282
| epoch   0 | step    350/   749 | lr 0.000500 | train loss    3.101
| epoch   0 | step    400/   749 | lr 0.000500 | train loss    1.065
| epoch   0 | step    450/   749 | lr 0.000500 | train loss    2.157
| epoch   0 | step    500/   749 | lr 0.000500 | train loss    2.610
| epoch   0 | step    550/   749 | lr 0.000500 | train loss    0.441
| epoch   0 | step    600/   749 | lr 0.000500 | train loss    2.298
| epoch   0 | step    650/   749 | lr 0.000500 | train loss    1.437
| epoch   0 | step    700/   749 |

In [24]:
# Train on the 'toptee' category for up to 30 epochs; the log and best checkpoints go to .//toptee.
main('toptee',epochs=30)

Experiment dir : .//toptee
Namespace(batch_size=8, crop_size=224, data_set='toptee', embed_size=512, epochs=30, learning_rate=0.0005, log_step=50, num_workers=1, patient=3, save='./')




| epoch   0 | step      0/   754 | lr 0.000500 | train loss    0.726
| epoch   0 | step     50/   754 | lr 0.000500 | train loss    1.129
| epoch   0 | step    100/   754 | lr 0.000500 | train loss    1.704
| epoch   0 | step    150/   754 | lr 0.000500 | train loss    1.076
| epoch   0 | step    200/   754 | lr 0.000500 | train loss    1.801
| epoch   0 | step    250/   754 | lr 0.000500 | train loss    1.432
| epoch   0 | step    300/   754 | lr 0.000500 | train loss    1.777
| epoch   0 | step    350/   754 | lr 0.000500 | train loss    1.061
| epoch   0 | step    400/   754 | lr 0.000500 | train loss    1.481
| epoch   0 | step    450/   754 | lr 0.000500 | train loss    2.579
| epoch   0 | step    500/   754 | lr 0.000500 | train loss    5.239
| epoch   0 | step    550/   754 | lr 0.000500 | train loss    2.740
| epoch   0 | step    600/   754 | lr 0.000500 | train loss    0.752
| epoch   0 | step    650/   754 | lr 0.000500 | train loss    1.781
| epoch   0 | step    700/   754 |

In [0]:
def evaluate(args):
    """Rank the gallery for every query of one split and write the predictions to JSON.

    Args:
        args: namespace providing ``data_set``, ``data_split``, ``crop_size``,
            ``batch_size``, ``num_workers``, ``embed_size`` and ``model_folder``
            (the folder holding ``image-<embed_size>.th`` and ``cap-<embed_size>.th``).

    Side effects:
        Writes ``<data_set>.<data_split>.pred.json`` in the working directory:
        the caption file's entries, each extended with a ``'ranking'`` list.
    """
    # Image pre-processing, normalization for the pre-trained resnet
    transform_test = transforms.Compose([
        transforms.CenterCrop(args.crop_size),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406),
                             (0.229, 0.224, 0.225))])
    vocab = Vocabulary()
    vocab.load(DICT.format(args.data_set))
    # Build data loader
    caption_file = CAPT.format(args.data_set, args.data_split)
    data_loader_test = get_loader(IMAGE_ROOT.format(args.data_set),
                                 caption_file,
                                 vocab, transform_test,
                                 args.batch_size, shuffle=False, return_target=False, num_workers=args.num_workers)
    ranker = Ranker(root=IMAGE_ROOT.format(args.data_set), image_split_file=SPLIT.format(args.data_set, args.data_split),
                    transform=transform_test, num_workers=args.num_workers)

    # Build the dummy models
    image_encoder = DummyImageEncoder(args.embed_size).to(device)
    caption_encoder = DummyCaptionEncoder(vocab_size=len(vocab), vocab_embed_size=args.embed_size * 2,
                                          embed_size=args.embed_size).to(device)
    # load trained models
    # The image checkpoint was saved without the backbone, so the backbone is
    # detached while the head's weights are loaded and re-attached afterwards.
    image_model = os.path.join(args.model_folder,
                               "image-{}.th".format(args.embed_size))
    resnet = image_encoder.delete_resnet()
    image_encoder.load_state_dict(torch.load(image_model, map_location=device))
    image_encoder.load_resnet(resnet)

    cap_model = os.path.join(args.model_folder,
                               "cap-{}.th".format(args.embed_size))
    caption_encoder.load_state_dict(torch.load(cap_model, map_location=device))

    # Switch to inference mode BEFORE the gallery is embedded, so gallery and
    # query embeddings come from the same (eval-mode) network.
    # NOTE(review): assumes Ranker.update_emb does not change the encoder's mode itself — confirm.
    image_encoder.eval()
    caption_encoder.eval()
    ranker.update_emb(image_encoder)

    with open(caption_file) as f_in:
        output = json.load(f_in)

    # The loader is not shuffled, so predictions are written back to `output` in order.
    index = 0
    for _, candidate_images, captions, lengths, meta_info in data_loader_test:
        with torch.no_grad():
            candidate_images = candidate_images.to(device)
            candidate_ft = image_encoder.forward(candidate_images)
            captions = captions.to(device)
            caption_ft = caption_encoder(captions, lengths)
            rankings = ranker.get_nearest_neighbors(candidate_ft + caption_ft)
            for j in range(rankings.size(0)):
                # Translate gallery indices into the ranker's asin identifiers.
                output[index]['ranking'] = [ranker.data_asin[k] for k in rankings[j].tolist()]
                index += 1

    pred_file = "{}.{}.pred.json".format(args.data_set, args.data_split)
    with open(pred_file, 'w') as f_out:
        json.dump(output, f_out, indent=4)
    print('eval completed. Output file: {}'.format(pred_file))


In [0]:
def eval_main(modelpath='./dress/',dataset='dress',embed_size=512):
    """Assemble the evaluation options and run ``evaluate``.

    The keyword arguments only provide the defaults of ``--model_folder``,
    ``--data_set`` and ``--embed_size``. Options are parsed with
    ``parse_known_args`` so that unrecognised command-line arguments of the
    hosting process are ignored instead of aborting.

    Data locations are NOT configured here: ``evaluate`` reads the module-level
    ``IMAGE_ROOT`` / ``CAPT`` / ``DICT`` / ``SPLIT`` templates. (Local variables
    with those names used to be assigned in this function; they were never read
    by anything and have been removed.)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_folder', type=str, default=modelpath,
                        help='path for trained models')
    parser.add_argument('--crop_size', type=int, default=224,
                        help='size for randomly cropping images')
    parser.add_argument('--data_set', type=str, default=dataset)
    parser.add_argument('--data_split', type=str, default='test')
    # Model parameters
    parser.add_argument('--embed_size', type=int, default=embed_size,
                        help='dimension of word embedding vectors')
    # Learning parameters
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--num_workers', type=int, default=1)

    args, _ = parser.parse_known_args()

    evaluate(args)

In [35]:
# Rank the 'dress' test split with the checkpoints in ./dress/; writes dress.test.pred.json.
eval_main(modelpath='./dress/',dataset='dress')

updating emb
emb updated
eval completed. Output file: dress.test.pred.json


In [36]:
# Rank the 'shirt' test split with the checkpoints in ./shirt/; writes shirt.test.pred.json.
eval_main(modelpath='./shirt/',dataset='shirt')

updating emb
emb updated
eval completed. Output file: shirt.test.pred.json


In [37]:
# Rank the 'toptee' test split with the checkpoints in ./toptee/; writes toptee.test.pred.json.
eval_main(modelpath='./toptee/',dataset='toptee')

updating emb
emb updated
eval completed. Output file: toptee.test.pred.json


In [0]:
# Rename each prediction file from <category>.test.pred.json to <category>.predict.json.
# NOTE(review): presumably the file name expected for submission — confirm.
!mv dress.test.pred.json dress.predict.json
!mv shirt.test.pred.json shirt.predict.json
!mv toptee.test.pred.json toptee.predict.json

In [39]:
# List the JSON files in the working directory to check that the renamed prediction files exist.
!ls *.json

dress.predict.json  kaggle.json  shirt.predict.json  toptee.predict.json
