In [None]:
# 计算并存储每张图片的score
import numpy as np

def get_score(zan_num, cai_num, clk_num):
    """Combine an image's like, dislike and click counts into one score.

    The like count is reduced by one and floored at zero, the dislike count
    is subtracted in full, and clicks contribute the rounded natural
    logarithm of ``clk_num + 1``.

    Args:
        zan_num: number of likes ("zan").
        cai_num: number of dislikes ("cai").
        clk_num: number of clicks.

    Returns:
        The score; a NumPy float, because the click term comes from
        ``np.round``.
    """
    effective_likes = max(zan_num - 1, 0)
    click_bonus = np.round(np.log(clk_num + 1))
    return effective_likes - cai_num + click_bonus

# Maps a date string to the list of (imgid, score) tuples seen for that date.
img_score_by_date = {}
with open("../data/img_attr.csv", 'r') as fin:
    for line in fin:
        fields = line.strip().split("\t")
        # Rows that do not have exactly 7 tab-separated columns are skipped.
        if len(fields) != 7:
            continue
        imgid = fields[0]
        zan_num, cai_num, clk_num = map(int, fields[1:4])
        score = get_score(zan_num, cai_num, clk_num)
        date = fields[5]
        # Plain string comparison: keeps dates from "2015/01" up to, but not
        # including, "2017/01".
        if "2015/01" <= date < "2017/01":
            img_score_by_date.setdefault(date, []).append((imgid, score))

print("#count of images: %d" % sum(len(imgs) for imgs in img_score_by_date.values()))

In [None]:
# 构造用于模型训练和验证的pair数据对
import random

def split_dataset(dataset, p):
    """Shuffle ``dataset`` in place and split it into train and validation parts.

    Args:
        dataset: list of samples. It is shuffled in place, so the caller's
            list is reordered.
        p: fraction of the samples to hold out for validation.

    Returns:
        A ``(train_dataset, valid_dataset)`` tuple of new lists. The
        validation part holds ``int(len(dataset) * p)`` samples (clamped to
        the range 0..len(dataset)) taken from the tail of the shuffled list.
    """
    random.shuffle(dataset)
    valid_size = min(max(int(len(dataset) * p), 0), len(dataset))
    # Split on a positive index. Slicing with ``-valid_size`` breaks when
    # valid_size is 0: ``dataset[:-0]`` is empty and ``dataset[-0:]`` is the
    # whole list, which would put every sample into the validation part.
    split_at = len(dataset) - valid_size
    return dataset[:split_at], dataset[split_at:]

train_list = []
valid1_list = []
valid2_list = []
p = 0.1 # share of each date's images held out for validation
k = 15 # every image is compared with at most k sampled training images
delta = 0.5 # pairs whose scores differ by less than delta are discarded
for date, imgs in img_score_by_date.items():
    train_imgs, valid_imgs = split_dataset(imgs, p)
    cmp_k = min(k, len(train_imgs))

    # train_list:  training image   vs. sampled training images of that date
    # valid1_list: validation image vs. sampled training images of that date
    for anchors, target in ((train_imgs, train_list), (valid_imgs, valid1_list)):
        for img_a, s_a in anchors:
            for img_b, s_b in random.sample(train_imgs, cmp_k):
                if abs(s_a-s_b) < delta:
                    continue
                cmp_ret = 1 if s_a > s_b else 0
                target.append((img_a, s_a, img_b, s_b, cmp_ret))

    # valid2_list: validation image vs. one random validation image
    for img_a, s_a in valid_imgs:
        img_b, s_b = random.choice(valid_imgs)
        if abs(s_a-s_b) < delta:
            continue
        cmp_ret = 1 if s_a > s_b else 0
        valid2_list.append((img_a, s_a, img_b, s_b, cmp_ret))

print("Length of Train List: %d" % len(train_list))
print("Length of Valid1 List: %d" % len(valid1_list))
print("Length of Valid2 List: %d" % len(valid2_list))

In [None]:
# 构造用于模型训练和验证的pair数据对--V2
# 将训练数据集分成两个part
# 对于partB中的每个图片，依次从partA中取01等量的图片用于构造训练数据集
# 这样可以避免partB中数据01分类不均带来的误拟合问题
# 验证集则是将每张图片同partB里的图片进行比较，partA里的图片不出现在验证集合里
import random
import cPickle as pickle

# NOTE(review): the original comment promised at most 10x2 training samples
# per image, but img_cmp now emits both orderings for every sampled partner
# (higher- and lower-scored alike), so the real ceiling is higher - confirm.
max_pair_num = 10 # cap on how many higher-scored and how many lower-scored partners img_cmp samples per image
min_imgs_num = 50 # a date with fewer than 50 images is treated as having too little data and is dropped
valid_p = 0.1 # fraction of each date's images held out for validation

def split_dataset(dataset, valid_ratio=None, min_count=None):
    """Shuffle one date's images in place and split them three ways.

    This redefinition replaces the earlier two-way ``split_dataset(dataset, p)``
    once its cell has run.

    Args:
        dataset: list of samples. It is shuffled in place, so the caller's
            list is reordered (only when it is large enough to be split).
        valid_ratio: fraction of samples held out for validation; defaults to
            the module-level ``valid_p``.
        min_count: datasets with fewer samples than this are rejected;
            defaults to the module-level ``min_imgs_num``.

    Returns:
        ``(trainsetA, trainsetB, validset)``. The validation part is the tail
        of the shuffled list; the remaining samples are cut in half, with
        part B receiving the extra sample when the count is odd. Three empty
        lists are returned when the dataset is too small.
    """
    if valid_ratio is None:
        valid_ratio = valid_p
    if min_count is None:
        min_count = min_imgs_num
    if len(dataset) < min_count:
        return ([], [], [])
    random.shuffle(dataset)
    valid_size = min(max(int(len(dataset) * valid_ratio), 0), len(dataset))
    # Split on a positive index: with valid_size == 0 the slices
    # ``dataset[:-0]`` / ``dataset[-0:]`` would hand everything to validation.
    split_at = len(dataset) - valid_size
    trainset, validset = dataset[:split_at], dataset[split_at:]
    # Floor division keeps the slice index an integer on every Python version.
    half = len(trainset) // 2
    return trainset[:half], trainset[half:], validset

def img_cmp(trainsetA, img, score):
    """Build label-balanced comparison pairs between one image and part A.

    Part-A images are bucketed into those scoring strictly higher and those
    scoring strictly lower than ``score`` (ties are ignored). The same number
    of partners, at most ``max_pair_num``, is sampled from each bucket, and
    every sampled partner yields two tuples: one per ordering.

    Args:
        trainsetA: iterable of ``(imgid, score)`` candidates.
        img: id of the image being compared.
        score: score of that image.

    Returns:
        List of ``(first_img, first_score, second_img, second_score, label)``
        where ``label`` is 1 when the first score is the higher one, else 0.
        Empty when either bucket is empty.
    """
    high_imgs, low_imgs = [], []
    for imgA, scoreA in trainsetA:
        if scoreA > score:
            bucket = high_imgs
        elif scoreA < score:
            bucket = low_imgs
        else:
            continue
        bucket.append((imgA, scoreA))

    pair_num = min(max_pair_num, len(low_imgs), len(high_imgs))

    img_pairs = []
    # partner_first_label: label of the tuple that lists the partner first.
    for partners, partner_first_label in ((high_imgs, 1), (low_imgs, 0)):
        for imgA, scoreA in random.sample(partners, pair_num):
            img_pairs.append((imgA, scoreA, img, score, partner_first_label))
            img_pairs.append((img, score, imgA, scoreA, 1 - partner_first_label))
    return img_pairs

train_list = []
valid_list = []

for date, imgs in img_score_by_date.items():
    trainsetA, trainsetB, validset = split_dataset(imgs)
    # Part-B images that produced at least one training pair; validation
    # images are compared against these only.
    valid_trainsetB = []
    for imgB, scoreB in trainsetB:
        img_pairs = img_cmp(trainsetA, imgB, scoreB)
        if img_pairs:
            valid_trainsetB.append((imgB, scoreB))
            train_list.extend(img_pairs)

    # Same for every validation image of this date, so compute it once.
    pair_num = min(len(valid_trainsetB), max_pair_num)
    for imgV, scoreV in validset:
        for imgB, scoreB in random.sample(valid_trainsetB, pair_num):
            # Tied scores produce no validation pair.
            if scoreV > scoreB:
                valid_list.append((imgV, scoreV, imgB, scoreB, 1))
            elif scoreV < scoreB:
                valid_list.append((imgV, scoreV, imgB, scoreB, 0))

print("Length of Train List: %d" % len(train_list))
print("Length of Valid List: %d" % len(valid_list))

# Write inside ``with`` so each file is flushed and closed even if pickling
# fails; the bare open() calls used before were never closed.
with open("./data/train.list", 'wb') as fout:
    pickle.dump(train_list, fout)
with open("./data/valid.list", 'wb') as fout:
    pickle.dump(valid_list, fout)

In [None]:
# 测试：训练数据集的合理性
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import os

# Directory the images are read from; files are addressed as <imgid>.jpg.
img_path = "../data/img/"
def cmpPlot(imgid_a, s_a, imgid_b, s_b):
    """Show two images side by side, each titled "<imgid>_<score>".

    Args:
        imgid_a: id of the image drawn in the left panel.
        s_a: score shown in the left panel's title.
        imgid_b: id of the image drawn in the right panel.
        s_b: score shown in the right panel's title.

    Both files are opened from ``img_path`` as ``<imgid>.jpg``.
    """
    plt.figure()

    panels = ((imgid_a, s_a), (imgid_b, s_b))
    for position, (imgid, s) in enumerate(panels, 1):
        plt.subplot(1, 2, position)
        plt.title("%s_%s"%(imgid, s))
        plt.imshow(Image.open(img_path+"%s.jpg"%imgid))
        plt.axis('off')

    plt.show()
    
# Spot-check up to 30 random training pairs. The sample size is clamped
# because random.sample raises ValueError when asked for more items than the
# list holds.
for imgA, sA, imgB, sB, cmpret in random.sample(train_list, min(30, len(train_list))):
    # Pairs with a missing image file are skipped. cmpret (the pair label) is
    # not displayed; only ids and scores go into the titles.
    if all(os.path.exists(img_path+"%s.jpg"%imgid) for imgid in (imgA, imgB)):
        cmpPlot(imgA, sA, imgB, sB)

In [None]:
# Define the InceptionV3 preprocessing model (feature extractor)
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
import numpy as np

# InceptionV3 built with weights='imagenet' and include_top=False, i.e.
# without its top classification layers.
base_model = InceptionV3(weights='imagenet', include_top=False)
x = base_model.output
x = GlobalAveragePooling2D()(x) # add a global spatial average pooling layer
# `model` maps the base model's input to the pooled output; a later cell calls
# model.predict_on_batch on preprocessed images to obtain their features.
model = Model(inputs=base_model.input, outputs=x)

In [None]:
# 对每张照片预处理并存储在lmdb里
import lmdb

batch_size = 128
img_path = "../data/img/"

# Every image id that appears on either side of a training or validation pair
# (tuple positions 0 and 2 hold the two image ids).
imgid_set = set()
for pairs in (train_list, valid_list):
    for column in (0, 2):
        imgid_set.update(pair[column] for pair in pairs)
# 8192*3 bytes of LMDB map space are reserved per image.
# NOTE(review): 8192 presumably is one 2048-float32 feature vector - confirm.
env = lmdb.open("./data/features", map_size=8192*3*len(imgid_set))

def preprocess(imgid):
    """Load one image and prepare it as model input.

    Args:
        imgid: image id; the file read is ``img_path + "<imgid>.jpg"``.

    Returns:
        The array returned by ``preprocess_input`` for the image loaded at
        299x299, or ``None`` if anything fails while loading or converting
        it (the error text is printed so the caller can skip the image).
    """
    try:
        filename = img_path+"%s.jpg"%imgid
        img = image.load_img(filename, target_size=(299, 299))
        x = image.img_to_array(img)
        return preprocess_input(x)
    # ``except ... as`` is accepted by Python 2.6+ and Python 3, unlike the
    # comma form. The broad catch is deliberate best-effort behavior: one bad
    # image must not abort the whole extraction run.
    except Exception as e:
        print(str(e))
        return None

def _store_batch(batch_imgids, batch_inputs):
    """Run the model on one batch and write each feature vector to LMDB.

    Args:
        batch_imgids: image ids, used as LMDB keys.
        batch_inputs: preprocessed image arrays, in the same order as the ids.
    """
    features = model.predict_on_batch(np.array(batch_inputs))
    # Used as a context manager, the write transaction commits on normal exit
    # and aborts if an exception escapes, so no half-written batch is kept.
    with env.begin(write=True) as txn:
        for _imgid, _feature in zip(batch_imgids, features):
            # tobytes() returns the same raw bytes as the deprecated tostring().
            txn.put(_imgid, _feature.tobytes())


imgids = []
X = []
try:
    for i, imgid in enumerate(imgid_set):
        x = preprocess(imgid)
        # Images that could not be loaded are skipped.
        if x is None:
            continue
        imgids.append(imgid)
        X.append(x)
        if len(X) == batch_size:
            _store_batch(imgids, X)
            imgids = []
            X = []
            print("%d/%d"%(i, len(imgid_set)))

    # Flush the final, partially filled batch.
    if len(X) > 0:
        _store_batch(imgids, X)
finally:
    # Release the LMDB environment even when extraction fails midway.
    env.close()