#### The Chinese artworks dataset must first be preprocessed with the `chinese_artworks_convert.ipynb` notebook.

In [1]:
import torchvision
import torch
import json
import cv2
import os.path as osp
from PIL import Image
from tqdm import tqdm_notebook
import numpy as np
import nltk
from collections import Counter

In [2]:
# Directory name of the dataset to preprocess; every path below is built
# relative to it. Set this to 'chinese_artworks' to process that dataset instead.
dataset_name = 'artworks'

Use https://github.com/violetteshev/bottom-up-features to extract image features with Faster R-CNN; the pretrained models need to be downloaded before use.

```
python bottom-up-features/extract_features.py --image_dir artworks/test --out_dir artworks/features \
    --cfg bottom-up-features/cfgs/faster_rcnn_resnet101.yml \
    --model bottom-up-features/models/bottomup_pretrained_10_100.pth
```

The extraction process above supports only CUDA and only PyTorch 0.4.

Once the features have been extracted, the training and testing processes of SCAN are independent of the bottom-up attention image feature extraction.

## Image Feature

Combine the features extracted in the steps above.

In [3]:
# Load the caption annotations for the selected dataset. The context manager
# closes the file handle as soon as parsing finishes instead of leaving an
# open handle behind for the garbage collector to clean up.
with open(osp.join(dataset_name, 'caption_data.json')) as caption_file:
    content = json.load(caption_file)

In [4]:
# Group the annotated images by dataset split. Each entry's 'filepath' field
# is used as the split key and must be one of 'train', 'test' or 'val'.
images = {split: [] for split in ('train', 'test', 'val')}
for entry in content['images']:
    split = entry['filepath']
    # Keep only entries that have at least one caption.
    if len(entry['sentences']) > 0:
        images[split].append(entry)

In [5]:
# For every split, load the per-image feature arrays written by the extraction
# step, keep the first 10 rows of each, and save them as one stacked array.
feature_dir = osp.join(dataset_name, 'features')
for phase, phase_images in images.items():
    features = np.stack([
        # Feature files are named after the image filename up to its first '.'.
        np.load(osp.join(feature_dir, entry['filename'].split('.')[0] + '.npy'))[:10, :]
        for entry in tqdm_notebook(phase_images)
    ])
    # np.save appends the '.npy' extension, giving <dataset_name>/<phase>.npy.
    np.save(osp.join(dataset_name, phase), features)

HBox(children=(IntProgress(value=0, max=14351), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1793), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1795), HTML(value='')))




## Vocab

In [6]:
from vocab import Vocabulary, serialize_vocab

In [7]:
# Minimum number of occurrences a token needs to be kept in the vocabulary.
threshold = 4

# Count lowercased tokens over the first caption of every image in every split.
counter = Counter()
for phase_images in images.values():
    first_captions = [entry['sentences'][0]['raw'] for entry in phase_images]
    for text in first_captions:
        counter.update(nltk.tokenize.word_tokenize(text.lower()))

# Drop tokens that occur fewer than `threshold` times.
words = [token for token, freq in counter.items() if freq >= threshold]

# Create the vocabulary wrapper; the special tokens are added first, in this
# order, followed by the retained words.
vocab = Vocabulary()
for special_token in ('<pad>', '<start>', '<end>', '<unk>'):
    vocab.add_word(special_token)
for word in words:
    vocab.add_word(word)

vocab_path = osp.join(dataset_name, 'vocab.json')
serialize_vocab(vocab, vocab_path)

In [8]:
# Write the split -> image-entries mapping to <dataset_name>/data.json.
data_path = osp.join(dataset_name, 'data.json')
with open(data_path, 'w') as data_file:
    json.dump(images, data_file)