In [1]:
import os
import json
import argparse
from random import shuffle, seed
import string
# non-standard dependencies:
import h5py
import numpy as np
import torch
import torchvision.models as models
import skimage.io
from PIL import Image



  from ._conv import register_converters as _register_converters


In [2]:
def build_vocab(imgs, count_thr=5):
    """Build the caption vocabulary and attach UNK-substituted captions to each image.

    Prints corpus statistics (most frequent words, share of rare words and a
    sentence-length histogram) as a side effect.

    Args:
        imgs: list of dicts; each needs a 'sentences' list whose items carry a
            'tokens' list of words. Every dict is mutated in place: a
            'final_captions' key is added, holding one token list per
            sentence with rare words replaced by 'UNK'.
        count_thr: a word is kept only when it occurs strictly more than this
            many times over all captions (default 5).

    Returns:
        List of kept words in first-seen order, with 'UNK' appended when at
        least one word occurrence was dropped. Empty input yields [].
    """
    # Count how often every word occurs across all captions.
    counts = {}
    for img in imgs:
        for sent in img['sentences']:
            for w in sent['tokens']:
                counts[w] = counts.get(w, 0) + 1
    cw = sorted(((count, w) for w, count in counts.items()), reverse=True)  # most frequent first
    print('top words and their counts:')
    print('\n'.join(map(str, cw[:20])))  # show the 20 most frequent words

    # Word-frequency statistics.
    total_words = sum(counts.values())  # total number of word occurrences
    print('total words:', total_words)
    bad_words = [w for w, n in counts.items() if n <= count_thr]  # words at or below the threshold
    vocab = [w for w, n in counts.items() if n > count_thr]       # words kept in the vocabulary
    bad_count = sum(counts[w] for w in bad_words)                 # occurrences that will become UNK

    # `or 1` keeps the percentages defined (0.00%) when the corpus is empty.
    print('number of bad words: %d/%d = %.2f%%' % (len(bad_words), len(counts), len(bad_words)*100.0/(len(counts) or 1)))
    print('number of words in vocab would be %d' % (len(vocab), ))
    print('number of UNKs: %d/%d = %.2f%%' % (bad_count, total_words, bad_count*100.0/(total_words or 1)))

    # Sentence-length distribution.
    sent_lengths = {}
    for img in imgs:
        for sent in img['sentences']:
            nw = len(sent['tokens'])
            sent_lengths[nw] = sent_lengths.get(nw, 0) + 1
    max_len = max(sent_lengths, default=0)  # default avoids ValueError when there are no sentences
    print('max length sentence in raw data: ', max_len)
    print('sentence length distribution (count, number of words):')
    sum_len = sum(sent_lengths.values()) or 1  # guard the division below for empty input
    for i in range(max_len+1):
        print('%2d: %10d   %f%%' % (i, sent_lengths.get(i,0), sent_lengths.get(i,0)*100.0/sum_len))

    # Rare words are all mapped onto a single unknown-word token.
    if bad_count > 0:
        print('inserting the special UNK token')
        vocab.append('UNK')

    for img in imgs:
        img['final_captions'] = [
            [w if counts.get(w, 0) > count_thr else 'UNK' for w in sent['tokens']]
            for sent in img['sentences']
        ]

    return vocab

In [3]:
def encode_captions(imgs, wtoi, max_length=16):
    """Encode every caption as a fixed-width, zero-padded row of word indices.

    Args:
        imgs: list of dicts, each with a non-empty 'final_captions' list of
            token lists.
        wtoi: dict mapping each token to its integer index. 0 is used here as
            the padding value, so word indices are expected to start at 1.
        max_length: number of columns per caption; longer captions are
            truncated and shorter ones are padded with zeros (default 16).

    Returns:
        Tuple (L, label_start_ix, label_end_ix, label_length):
        L is a uint32 array of shape (M, max_length), M being the total
        number of captions; label_start_ix and label_end_ix are uint32 arrays
        with one entry per image giving the 1-indexed, inclusive row range of
        that image's captions in L; label_length is a uint32 array of length
        M with the stored (possibly truncated) length of each caption.

    Raises:
        AssertionError: if an image has no captions or a caption has no words.
        KeyError: if a token within the first max_length positions is missing
            from wtoi.
    """
    N = len(imgs)
    M = sum(len(img['final_captions']) for img in imgs)  # total number of captions

    label_arrays = []
    label_start_ix = np.zeros(N, dtype='uint32')  # note: these will be one-indexed
    label_end_ix = np.zeros(N, dtype='uint32')
    label_length = np.zeros(M, dtype='uint32')
    caption_counter = 0
    counter = 1
    for i, img in enumerate(imgs):
        n = len(img['final_captions'])
        assert n > 0, 'error: some image has no captions'

        Li = np.zeros((n, max_length), dtype='uint32')
        for j, s in enumerate(img['final_captions']):
            # Only the first max_length tokens are looked up; the rest is truncated.
            ids = [wtoi[w] for w in s[:max_length]]
            label_length[caption_counter] = len(ids)
            caption_counter += 1
            if ids:
                Li[j, :len(ids)] = ids

        # note: word indices are 1-indexed, and captions are padded with zeros
        label_arrays.append(Li)
        label_start_ix[i] = counter          # row number of this image's first caption
        label_end_ix[i] = counter + n - 1    # row number of this image's last caption

        counter += n

    if label_arrays:
        L = np.concatenate(label_arrays, axis=0)  # stack all captions: (M, max_length)
    else:
        # np.concatenate rejects an empty list; return a well-formed empty table instead.
        L = np.zeros((0, max_length), dtype='uint32')
    assert L.shape[0] == M, 'lengths don\'t match? that\'s weird'
    assert np.all(label_length > 0), 'error: some caption had no words?'

    print('encoded captions to array of size ', L.shape)
    return L, label_start_ix, label_end_ix, label_length



In [4]:
def main():
    """Preprocess the caption dataset and write the label HDF5 and metadata JSON.

    Reads the notebook-level globals `input_json`, `output_json`, `output_h5`
    and `images_root`, which must be defined before this is called, plus the
    two topic files under ./data/ (keyed by the image id as a string).

    Writes:
        <output_h5>_label.h5 with the datasets 'labels', 'label_start_ix',
        'label_end_ix' and 'label_length' returned by encode_captions, and
        <output_json> holding 'ix_to_word' (1-indexed vocabulary) and
        'images', one record per image with its split, file path / id when
        available, topic class, encoded topic words and, when `images_root`
        is non-empty, the image width and height.
    """
    with open(input_json, 'r') as f:
        imgs = json.load(f)['images']

    with open("./data/top_1_topic_class.json", 'r') as j:
        image_topclass = json.load(j)

    with open("./data/top_1_topic_word.json", 'r') as j:
        image_topword = json.load(j)

    seed(123)  # make reproducible
    vocab = build_vocab(imgs)

    wtoi = {w: i+1 for i, w in enumerate(vocab)}  # word  -> index
    itow = {i+1: w for i, w in enumerate(vocab)}  # index -> word
    # Topic words missing from the caption vocabulary are encoded with the
    # vocabulary's own UNK index; 0 (the padding value) is used if no UNK
    # token was inserted.
    unk_ix = wtoi.get('UNK', 0)

    L, label_start_ix, label_end_ix, label_length = encode_captions(imgs, wtoi)

    # The context manager closes the HDF5 file even if a write fails.
    with h5py.File(output_h5+'_label.h5', "w") as f_lb:
        f_lb.create_dataset("labels", dtype='uint32', data=L)
        f_lb.create_dataset("label_start_ix", dtype='uint32', data=label_start_ix)
        f_lb.create_dataset("label_end_ix", dtype='uint32', data=label_end_ix)
        f_lb.create_dataset("label_length", dtype='uint32', data=label_length)

    out = {}
    out['ix_to_word'] = itow  # encode the (1-indexed) vocab
    out['images'] = []
    for img in imgs:
        jimg = {}
        jimg['split'] = img['split']
        if 'filename' in img: jimg['file_path'] = os.path.join(img['filepath'], img['filename'])  # copy it over, might need
        if 'cocoid' in img: jimg['id'] = img['cocoid']  # copy over & maintain an id, if present (e.g. coco ids, useful)

        # The numeric id is the third '_'-separated field of the file name,
        # e.g. COCO_val2014_000000191381.jpg; int() copes with the zero padding.
        image_id = int(img['filename'].split('_')[2].split('.')[0])

        topic_words = image_topword[str(image_id)].split(' ')
        jimg['topic_class'] = image_topclass[str(image_id)]
        jimg['topic_word'] = [wtoi.get(topic_word, unk_ix) for topic_word in topic_words]

        if images_root != '':
            with Image.open(os.path.join(images_root, img['filepath'], img['filename'])) as _img:
                jimg['width'], jimg['height'] = _img.size

        out['images'].append(jimg)

    # Close the handle explicitly so the JSON is fully flushed before it is read back.
    with open(output_json, 'w') as f:
        json.dump(out, f)
    print('wrote ', output_json)



In [5]:
# --- inputs read by main() ---
input_json = '/home/why/data/caption/dataset_coco.json'  # caption dataset; must contain an 'images' list
images_root = '/home/sdb1/why/COCO'                       # image root; '' skips the width/height lookup

# --- outputs written by main() ---
output_h5 = '/home/sdb1/why/self-critical/data/data'     # prefix; main() appends '_label.h5'
output_json = '/home/sdb1/why/self-critical/data/data.json'

In [1]:
import json
# NOTE(review): this points at the len_16/ directory, whereas the configuration
# cell used by main() writes to .../self-critical/data/data.json — confirm which
# file is meant to be inspected here.
output_json ='/home/sdb1/why/self-critical/len_16/data.json'

In [3]:
# Load the metadata JSON; the with-block closes the file handle as soon as it is parsed.
with open(output_json, 'r') as f:
    data = json.load(f)

In [4]:
# List of per-image records stored under the 'images' key of the loaded JSON.
a = data['images']

In [6]:
# Print the topic class and the encoded topic words of the record whose id is 177015.
for record in a:
    if record['id'] != 177015:
        continue
    print(record['topic_class'])
    print(record['topic_word'])

8
[261, 74, 99, 98, 75, 102, 350, 80, 835, 93, 2835, 89, 905, 347, 254, 213, 157, 1794, 1732, 111]


In [7]:
# Index -> word table. JSON object keys are strings, so lookups must use str(index).
w = data['ix_to_word']

In [10]:
# Encoded topic words copied from the output printed above for image id 177015.
t = [261, 74, 99, 98, 75, 102, 350, 80, 835, 93, 2835, 89, 905, 347, 254, 213, 157, 1794, 1732, 111]

In [12]:
# Decode every index back to its word; the table is keyed by the index as a string.
for word_ix in t:
    token = w[str(word_ix)]
    print(token)

laptop
computer
desk
sitting
keyboard
table
mouse
monitor
office
computers
laptops
using
screen
lap
open
wooden
sits
desktop
monitors
working


In [6]:
# Run the whole preprocessing pipeline. main() reads the globals input_json,
# output_json, output_h5 and images_root, so the configuration cell must run first.
main()

top words and their counts:
(1019785, 'a')
(224758, 'on')
(212689, 'of')
(206178, 'the')
(191793, 'in')
(161216, 'with')
(146755, 'and')
(102390, 'is')
(75957, 'man')
(71183, 'to')
(55190, 'sitting')
(51987, 'an')
(50467, 'two')
(44506, 'at')
(44297, 'standing')
(43707, 'people')
(42776, 'are')
(38867, 'next')
(37898, 'white')
(35372, 'woman')
total words: 6454115
number of bad words: 18443/27929 = 66.04%
number of words in vocab would be 9486
number of UNKs: 32382/6454115 = 0.50%
max length sentence in raw data:  49
sentence length distribution (count, number of words):
 0:          0   0.000000%
 1:          0   0.000000%
 2:          0   0.000000%
 3:          0   0.000000%
 4:          0   0.000000%
 5:          1   0.000162%
 6:         14   0.002270%
 7:       4851   0.786521%
 8:     101387   16.438461%
 9:     134531   21.812289%
10:     132558   21.492395%
11:      95206   15.436299%
12:      60590   9.823807%
13:      35233   5.712530%
14:      20016   3.245310%
15:      1147

In [2]:
from PIL import Image
import matplotlib.pyplot as plt

In [3]:
# Reload the metadata JSON; the with-block closes the file handle once parsed.
# Needs `json` imported and `output_json` defined by earlier cells in this kernel
# (the recorded run failed with a NameError because `json` was not imported).
with open(output_json, 'r') as f:
    aa = json.load(f)

NameError: name 'json' is not defined

In [77]:
# Path of record 55 relative to the image root (e.g. 'val2014/COCO_val2014_000000191381.jpg').
iii = aa['images'][55]['file_path']

In [78]:
# Display the full metadata record of image 55 to sanity-check its fields.
aa['images'][55]

{'split': 'restval',
 'file_path': 'val2014/COCO_val2014_000000191381.jpg',
 'id': 191381,
 'topic_class': 41,
 'topic_word': [432,
  219,
  163,
  46,
  438,
  454,
  7,
  482,
  544,
  123,
  489,
  480,
  497,
  98,
  120,
  218,
  194,
  457,
  733,
  527],
 'width': 640,
 'height': 480}

In [79]:
# Build the absolute path from the record itself instead of from `iii`, so that
# re-running this cell cannot prepend images_root a second time.
iii = images_root + '/' + aa['images'][55]['file_path']

In [80]:
# Echo the resolved image path.
iii

'/home/sdb1/why/COCO/val2014/COCO_val2014_000000191381.jpg'

In [1]:
# Display the selected image. Requires `Image` (PIL), `plt` (matplotlib.pyplot)
# and `iii` from earlier cells in the same kernel session.
image = Image.open(iii)
plt.imshow(image)
plt.show()  # must be called; a bare `plt.show` only references the function

NameError: name 'Image' is not defined