In [1]:
from os import listdir
import numpy as np
from keras.preprocessing import image
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.applications.vgg16 import preprocess_input
from keras.applications.vgg16 import VGG16
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.layers import Input, Embedding, Dropout, Dense, LSTM, add
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint
from pickle import dump, load
import string

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
def extract_features(data_path):
    #我已经下载好VGG16预训练模型，无需在下载，注意将模型放置正确路径
    model = VGG16()
    model.layers.pop()
    model = Model(inputs=model.inputs, outputs=model.layers[-1].output)
    features = {}
    for i, f in enumerate(listdir(data_path)):
        filename = data_path + "/" + f
        image = load_img(filename, target_size=(224, 224))
        image = img_to_array(image)  # this is (224, 224, 3)
        image = image.reshape(-1, image.shape[0], image.shape[1], image.shape[2])  # this is (1, 224, 224, 3)
        image = preprocess_input(image)
        feature = model.predict(image)
        image_id = f.split(".")[0]
        features[image_id] = feature
        if i%100 == 0:
            print(i)
        model.summary()
    return features

In [3]:
data_path = "D:/attention模型/Flicker8k_Dataset" 
#features = extract_features(data_path) 
#这一步耗时，耗显存，重复运行时注意释放显存
#dump(features, open('D:/attention模型/master/features.pkl', 'wb')) 
#print("extracted features for %d photos" % len(features))

In [4]:
def load_doc(filepath):
    with open(filepath, 'r') as ifp:
        text = ifp.read()
    return  text

def load_descriptions(doc):
    mapping = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        if len(line) < 2:
            continue
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        if image_id not in mapping:
            mapping[image_id] = []
        mapping[image_id].append(image_desc)
    return mapping

In [5]:
filepath = 'D:/attention模型/Flickr8k.token.txt'
doc = load_doc(filepath)
descriptions = load_descriptions(doc)
print("loaded %d descriptions" % len(descriptions))

loaded 8092 descriptions


In [6]:
def clean_descriptions(descriptions):
    table = str.maketrans('', '', string.punctuation)
    for key, desc_list in descriptions.items():
        for i, desc in enumerate(desc_list):
            desc = desc.split()
            desc = [w.lower() for w in desc]
            desc = [w.translate(table) for w in desc]
            desc = [w for w in desc if len(w) > 1]
            desc = [w for w in desc if w.isalpha()]
            descriptions[key][i] = ' '.join(desc)

def to_vocabulary(descriptions):
    words = set()
    for key in descriptions.keys():
        for d in descriptions[key]:
            words.update(d.split())
    return words

def save_descriptions(descriptions, output_filepath):
    with open(output_filepath, 'w') as ofp:
        for key,desc_list in descriptions.items():
            for d in desc_list:
                ofp.write(key + ' ' + d + '\n')

In [7]:
clean_descriptions(descriptions)
vocabulary = to_vocabulary(descriptions)
print("vocabulary size : %d" % len(vocabulary))
save_descriptions(descriptions, 'D:/attention模型/master/cleaned_descriptions.txt')

vocabulary size : 8763


In [8]:
def load_identifiers(filepath):
    doc = load_doc(filepath)
    ids = set()
    for line in doc.split('\n'):
        if len(line) < 1:
            continue
        ids.add(line.split('.')[0])
    return ids

def load_clean_descriptions(filepath, ids):
    doc = load_doc(filepath)
    descriptions = {}
    for line in doc.split('\n'):
        tokens = line.split()
        if len(tokens) < 1:
            continue
        image_id, image_desc = tokens[0], tokens[1:]
        if image_id in ids:
            if image_id not in descriptions:
                descriptions[image_id] = []
            desc = 'startseq ' + ' '.join(image_desc) + ' endseq'
            descriptions[image_id].append(desc)
    return descriptions

def load_photo_features(filepath, ids):
    features = load(open(filepath, 'rb'))
    return {k:features[k] for k in ids}

In [9]:
filepath = 'D:/attention模型/Flickr_8k.trainImages.txt'
train_ids = load_identifiers(filepath)
print('Dataset: %d' % len(train_ids))
train_descriptions = load_clean_descriptions('D:/attention模型/master/cleaned_descriptions.txt', train_ids)
print('Descriptions: train=%d' % len(train_descriptions))
train_features = load_photo_features('D:/attention模型/master/features.pkl', train_ids)
print('Photos: train=%d' % len(train_features))

Dataset: 6000
Descriptions: train=6000
Photos: train=6000


In [10]:
def to_lines(descriptions):
    desc_list = []
    for key in descriptions:
        for d in descriptions[key]:
            desc_list.append(d)
    return desc_list

def create_tokenizer(descriptions):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(to_lines(descriptions))
    return tokenizer

In [11]:
tokenizer = create_tokenizer(train_descriptions)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary size based on tokenizer from train data: %d' % vocab_size)
#print(tokenizer.word_index)
new_dict = {v : k for k, v in tokenizer.word_index.items()}
#print(new_dict)


Vocabulary size based on tokenizer from train data: 7579


In [12]:
def create_sequences(tokenizer, max_length, desc_list, photo):
    X1, X2, y = [], [], []
    vocab_size = len(tokenizer.word_index) + 1
    for desc in desc_list:
        seq = tokenizer.texts_to_sequences([desc])[0]
        for i in range(1, len(seq)):
            in_seq, out_seq = seq[:i], seq[i]
            in_seq = pad_sequences([in_seq], maxlen=max_length)[0]
            out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
            X1.append(photo)
            X2.append(in_seq)
            y.append(out_seq)
    return np.array(X1), np.array(X2), np.array(y)

def max_length(descriptions):
    lines = to_lines(descriptions)
    return max(len(d.split()) for d in lines)

In [13]:
max_length = max_length(train_descriptions)
print('Max description Length: %d' % max_length)
X1train, X2train, ytrain = create_sequences(tokenizer, max_length, train_descriptions, train_features)

Max description Length: 34


In [14]:
# dev data

filepath = 'D:/attention模型/Flickr_8k.devImages.txt'
test_ids = load_identifiers(filepath)
print("Dataset: %d" %  len(test_ids))
test_descriptions = load_clean_descriptions('D:/attention模型/master/cleaned_descriptions.txt', test_ids)
print("Descriptions: test = %d" % len(test_descriptions))
test_features = load_photo_features('D:/attention模型/master/features.pkl', test_ids)
print('Photos: test=%d' % len(test_features))
X1test, X2test, ytest = create_sequences(tokenizer, max_length, test_descriptions, test_features)

Dataset: 1000
Descriptions: test = 1000
Photos: test=1000


In [15]:
def define_model(vobab_size, max_length):
    inputs1 = Input(shape = (4096, ))
    fe1 = Dropout(0.5)(inputs1)
    fe2 = Dense(256, activation='relu')(fe1)
    inputs2 = Input(shape = (max_length, ))
    se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
    se2 = Dropout(0.5)(se1)
    se3 = LSTM(256)(se2)
    decoder1 = add([fe2, se3])
    decoder2 = Dense(256, activation='relu')(decoder1)
    outputs = Dense(vobab_size, activation='softmax')(decoder2)
    model = Model(inputs = [inputs1, inputs2], outputs = outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    print(model.summary())
    #plot_model(model, to_file = 'res/model.png', show_shapes=True)
    return model


def data_generator(tokenizer, max_length, descriptions, photos):
    while True:
       for key, desc_list in descriptions.items():
           photo = photos[key][0]
           in_img, in_seq, out_word = create_sequences(tokenizer, max_length, desc_list, photo)
           yield [[in_img, in_seq], out_word]

In [16]:
# fit model

model = define_model(vocab_size, max_length)
filepath = 'model-ep{epoch:03d}-loss{loss:.3f}-val_loss{val_loss:.3f}.h5'
checkpoint = ModelCheckpoint(filepath, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
generator = data_generator(tokenizer, max_length, train_descriptions, train_features)
generator_eval = data_generator(tokenizer, max_length, test_descriptions, test_features)
num_epochs = 1
for i in range(num_epochs):
    model.fit_generator(generator, epochs=1, steps_per_epoch = len(train_descriptions), verbose=1)
    model.save('D:/attention模型/master/model_' + str(i) + '.h5')

Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 34)           0                                            
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 4096)         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 34, 256)      1940224     input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 4096)         0        

In [20]:
#载入训练好的模型
model = load_model('D:/attention模型/master/model_0.h5')
#评估训练好的模型
print('测试集准确率为：',model.evaluate_generator(generator_eval,steps=1000))

#给出图片的绝对路径
file_path = 'D:\\attention模型\\Flicker8k_Dataset\\667626_18933d713e.jpg'
image = load_img(file_path, target_size=(224, 224))
image = img_to_array(image)  # this is (224, 224, 3)
image = image.reshape(-1, image.shape[0], image.shape[1], image.shape[2])  # this is (1, 224, 224, 3)
image = preprocess_input(image)
premodel = VGG16()
premodel.layers.pop()
premodel = Model(inputs=premodel.inputs, outputs=premodel.layers[-1].output)
feature = premodel.predict(image)
res=np.reshape(model.predict([feature,np.random.rand(1,34)]),(7579,)).tolist()
#print(res)
a=[]
for i in res:
    if i > 0.1:
        a.append(res.index(i))
print(a)
for i in a:
    print(new_dict[i],end=' ')

测试集准确率为： 4.147578559970888
[2, 3, 5, 7, 9, 11, 73]
endseq in on and with of from 