In [1]:
# 计算并存储每张图片的score
import numpy as np

def get_score(zan_num, cai_num, clk_num):
    """Combine an image's three raw counters into a single scalar score.

    score = max(zan_num - 1, 0) - cai_num + ln(clk_num + 1)

    Args:
        zan_num: first counter; reduced by one and floored at zero before use
            (presumably the up-vote / "like" count -- TODO confirm why one
            vote is discounted).
        cai_num: second counter, subtracted linearly (presumably down-votes).
        clk_num: third counter, contributes logarithmically (presumably
            clicks); must be > -1 for the logarithm to be finite.

    Returns:
        The score as a numpy float.
    """
    discounted_zan = zan_num - 1
    if discounted_zan < 0:
        discounted_zan = 0
    return discounted_zan - cai_num + np.log(clk_num + 1)

# Maps a date string -> list of (imgid, score) tuples for images of that date.
img_score_by_date = {}
with open("../data/img_attr.csv", 'r') as fin:
    for raw_line in fin:
        # The file is tab-separated; rows without exactly 7 columns are skipped.
        columns = raw_line.strip().split("\t")
        if len(columns) != 7:
            continue
        imgid = columns[0]
        # Columns 1..3 are the three integer counters consumed by get_score().
        zan_num, cai_num, clk_num = map(int, columns[1:4])
        score = get_score(zan_num, cai_num, clk_num)
        date = columns[5]
        # Plain lexicographic string comparison: keep "2015/01" <= date < "2017/05"
        # (assumes zero-padded "YYYY/MM/DD" dates -- TODO confirm against the data).
        if "2015/01" <= date < "2017/05":
            img_score_by_date.setdefault(date, []).append((imgid, score))

In [None]:
# 测试：score的合理性
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import os

# Pick one random date to eyeball. To inspect a specific day instead, pin it
# by hand, e.g. date = "2015/10/12" or date = "2016/03/07".
candidate_dates = list(img_score_by_date.keys())
date = random.choice(candidate_dates)
print(date)

# (imgid, score) tuples recorded for the chosen date.
demo_imgs = img_score_by_date[date]

# Distribution of the scores on that date.
demo_scores = [entry[1] for entry in demo_imgs]
plt.hist(demo_scores, bins=30)

# Directory holding the "<imgid>.jpg" files.
img_path = "../data/img/"
def testPlot(imgid, score):
    """Display a single image in its own figure.

    The figure is titled "<imgid>_<score>", the axes are hidden, and the image
    is read from img_path + "<imgid>.jpg".

    Args:
        imgid: image identifier used to build the file name.
        score: value shown in the title next to the id.
    """
    caption = "%s_%s" % (imgid, score)
    jpg_file = img_path + "%s.jpg" % imgid

    plt.figure()
    plt.title(caption)
    plt.imshow(Image.open(jpg_file))
    plt.axis('off')
    plt.show()
    
# Walk the chosen day's images from highest to lowest score and show every one
# whose JPEG is actually present on disk.
ranked_imgs = sorted(demo_imgs, key=lambda entry: entry[1], reverse=True)
for imgid, score in ranked_imgs:
    if not os.path.exists(img_path + "%s.jpg" % imgid):
        continue
    testPlot(imgid, score)

In [2]:
# 构造用于模型训练和验证的pair数据对
import random

def split_dataset(dataset, p):
    """Shuffle `dataset` and split it into a training and a validation part.

    Args:
        dataset: list of samples. It is shuffled IN PLACE (the caller's list
            is reordered), exactly as before.
        p: fraction of samples to put in the validation part. The validation
            size is int(len(dataset) * p), clamped to [0, len(dataset)].

    Returns:
        (train_dataset, valid_dataset): two new lists; validation samples are
        taken from the tail of the shuffled list.

    Note:
        The split uses an explicit index rather than negative slicing. With
        negative slices a validation size of 0 produced `dataset[:-0]` (empty)
        and `dataset[-0:]` (everything), i.e. small datasets ended up with an
        empty training part and all samples in validation.
    """
    random.shuffle(dataset)
    total = len(dataset)
    valid_size = min(max(int(total * p), 0), total)
    split_at = total - valid_size
    return dataset[:split_at], dataset[split_at:]

def _build_pairs(anchor_imgs, candidate_imgs, sample_size, min_gap):
    """Build labelled comparison pairs for every anchor image.

    For each (img_a, s_a) in `anchor_imgs`, draw `sample_size` distinct
    entries from `candidate_imgs` and keep those whose score differs from
    s_a by at least `min_gap`.

    Returns:
        List of (img_a, s_a, img_b, s_b, label) tuples where label is 1 if
        s_a > s_b and 0 otherwise.
    """
    pairs = []
    for img_a, s_a in anchor_imgs:
        for img_b, s_b in random.sample(candidate_imgs, sample_size):
            if abs(s_a - s_b) < min_gap:
                # Scores too close to call: skip this pair.
                continue
            pairs.append((img_a, s_a, img_b, s_b, int(s_a > s_b)))
    return pairs


train_list = []
valid_list = []
p = 0.1 # validation data ratio
k = 10 # compare each image with (at most) k sampled training images
delta = 0.5 # if abs(s_a - s_b) < delta, drop the comparison pair
for date, imgs in img_score_by_date.items():
    # Pairs are only ever formed between images of the same date.
    train_imgs, valid_imgs = split_dataset(imgs, p)
    # min() instead of the `a and b or c` idiom, which silently picks the
    # wrong branch whenever the middle operand is falsy (e.g. k == 0).
    cmp_k = min(k, len(train_imgs))

    # Training pairs: train image vs. sampled train images.
    train_list.extend(_build_pairs(train_imgs, train_imgs, cmp_k, delta))
    # Validation pairs: held-out image vs. sampled train images.
    valid_list.extend(_build_pairs(valid_imgs, train_imgs, cmp_k, delta))

print("Length of Train List: %d" % len(train_list))
print("Length of Valid List: %d" % len(valid_list))

Length of Train List: 557171
Length of Valid List: 59875


In [None]:
# 测试：训练数据集的合理性
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import os

# Directory prefix for image files; cmpPlot and the preview loop below append "<imgid>.jpg" to it.
img_path = "../data/img/"
def cmpPlot(imgid_a, s_a, imgid_b, s_b):
    """Show two images side by side in one figure for visual comparison.

    Image A goes in the left panel and image B in the right one. Each panel is
    titled "<imgid>_<score>", has its axes hidden, and reads its picture from
    img_path + "<imgid>.jpg".

    Args:
        imgid_a, s_a: id and score of the left image.
        imgid_b, s_b: id and score of the right image.
    """
    plt.figure()

    panels = ((imgid_a, s_a), (imgid_b, s_b))
    for position, (panel_id, panel_score) in enumerate(panels, 1):
        plt.subplot(1, 2, position)
        plt.title("%s_%s" % (panel_id, panel_score))
        plt.imshow(Image.open(img_path + "%s.jpg" % panel_id))
        plt.axis('off')

    plt.show()
    
# Spot-check 30 random training pairs; a pair is shown only when both JPEGs
# exist on disk.
for imgA, sA, imgB, sB, cmpret in random.sample(train_list, 30):
    have_a = os.path.exists(img_path + "%s.jpg" % imgA)
    have_b = have_a and os.path.exists(img_path + "%s.jpg" % imgB)
    if not have_b:
        continue
    cmpPlot(imgA, sA, imgB, sB)

In [3]:
# Define the InceptionV3-based model used to preprocess images into features
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
import numpy as np

# InceptionV3 with ImageNet weights and without its top classification layers.
base_model = InceptionV3(include_top=False, weights='imagenet')
# Global spatial average pooling collapses each feature map to a single value,
# so the model emits one flat feature vector per image.
pooled_features = GlobalAveragePooling2D()(base_model.output)
model = Model(inputs=base_model.input, outputs=pooled_features)

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1060 6GB (CNMeM is disabled, cuDNN 5110)


In [5]:
# 对每张照片预处理并存储在lmdb里
import lmdb

batch_size = 128  # number of images sent through the model per call
img_path = "../data/img/"

# Every image id that appears on either side (tuple slots 0 and 2) of a
# training or validation pair.
imgid_set = set()
for pair_list in (train_list, valid_list):
    for id_slot in (0, 2):
        imgid_set.update(pair[id_slot] for pair in pair_list)

# Reserve 8192 * 3 bytes of LMDB map space per image
# (presumably a 2048 x float32 feature vector with 3x headroom -- TODO confirm).
feature_map_bytes = 8192 * 3 * len(imgid_set)
env = lmdb.open("./data/features", map_size=feature_map_bytes)

def preprocess(imgid):
    """Load one image and turn it into a model-ready array.

    The file img_path + "<imgid>.jpg" is loaded at 299x299, converted to an
    array and passed through InceptionV3's `preprocess_input`.

    Args:
        imgid: image identifier used to build the file name.

    Returns:
        The preprocessed array, or None if anything went wrong (for example a
        missing or unreadable file); in that case the error message is printed.
    """
    try:
        jpg_file = img_path + "%s.jpg" % imgid
        pixels = image.img_to_array(
            image.load_img(jpg_file, target_size=(299, 299)))
        return preprocess_input(pixels)
    except Exception as err:
        # Best effort: report the problem and let the caller skip this image.
        print(str(err))
        return None

def _store_features(batch_ids, batch_inputs):
    """Run the model on one batch and persist each feature vector in LMDB.

    Each vector is written under its image id as raw bytes. No dtype or shape
    metadata is stored, so readers must know them out of band
    (presumably float32 -- TODO confirm against the reader).
    """
    features = model.predict_on_batch(np.array(batch_inputs))
    # The context manager commits on success and aborts the write transaction
    # if any put() raises, instead of leaving it dangling.
    with env.begin(write=True) as txn:
        for _imgid, _feature in zip(batch_ids, features):
            # tobytes() returns the same bytes as the deprecated tostring().
            txn.put(_imgid, _feature.tobytes())


imgids = []  # ids of the images currently buffered in X
X = []       # preprocessed arrays waiting to be sent through the model
try:
    for i, imgid in enumerate(imgid_set):
        x = preprocess(imgid)
        if x is None:
            # Missing / unreadable image: already reported by preprocess().
            continue
        imgids.append(imgid)
        X.append(x)
        if len(X) == batch_size:
            _store_features(imgids, X)
            imgids = []
            X = []
            print("%d/%d" % (i, len(imgid_set)))

    # Flush the final, partially filled batch.
    if X:
        _store_features(imgids, X)
finally:
    # Always release the LMDB environment, even if extraction fails midway.
    env.close()

127/106726
255/106726
383/106726
511/106726
cannot identify image file '../data/img/38565.jpg'
640/106726
[Errno 2] No such file or directory: '../data/img/48630.jpg'
769/106726
[Errno 2] No such file or directory: '../data/img/59201.jpg'
898/106726
1026/106726
1154/106726
1282/106726
cannot identify image file '../data/img/62691.jpg'
cannot identify image file '../data/img/46088.jpg'
1412/106726
[Errno 2] No such file or directory: '../data/img/143040.jpg'
1541/106726
1669/106726
1797/106726
[Errno 2] No such file or directory: '../data/img/46718.jpg'
1926/106726
2054/106726
2182/106726
2310/106726
2438/106726
2566/106726
[Errno 2] No such file or directory: '../data/img/58870.jpg'
2695/106726
2823/106726
2951/106726
3079/106726
3207/106726
3335/106726
3463/106726
3591/106726
3719/106726
3847/106726
3975/106726
cannot identify image file '../data/img/136637.jpg'
[Errno 2] No such file or directory: '../data/img/143257.jpg'
4105/106726
4233/106726
4361/106726
4489/106726
4617/106726
[E

37580/106726
[Errno 2] No such file or directory: '../data/img/87704.jpg'
[Errno 2] No such file or directory: '../data/img/19642.jpg'
37710/106726
37838/106726
[Errno 2] No such file or directory: '../data/img/115225.jpg'
37967/106726
[Errno 2] No such file or directory: '../data/img/148730.jpg'
38096/106726
[Errno 2] No such file or directory: '../data/img/101254.jpg'
38225/106726
[Errno 2] No such file or directory: '../data/img/146188.jpg'
38354/106726
38482/106726
38610/106726
38738/106726
[Errno 2] No such file or directory: '../data/img/78083.jpg'
38867/106726
38995/106726
39123/106726
39251/106726
39379/106726
39507/106726
39635/106726
39763/106726
39891/106726
40019/106726
40147/106726
40275/106726
40403/106726
40531/106726
40659/106726
40787/106726
[Errno 2] No such file or directory: '../data/img/110987.jpg'
40916/106726
cannot identify image file '../data/img/77574.jpg'
41045/106726
41173/106726
41301/106726
41429/106726
41557/106726
41685/106726
41813/106726
41941/106726
4

73242/106726
73370/106726
73498/106726
73626/106726
73754/106726
73882/106726
74010/106726
cannot identify image file '../data/img/56441.jpg'
74139/106726
74267/106726
74395/106726
74523/106726
74651/106726
74779/106726
74907/106726
75035/106726
[Errno 2] No such file or directory: '../data/img/54566.jpg'
75164/106726
75292/106726
75420/106726
75548/106726
75676/106726
75804/106726
75932/106726
76060/106726
76188/106726
76316/106726
[Errno 2] No such file or directory: '../data/img/29487.jpg'
76445/106726
76573/106726
76701/106726
76829/106726
76957/106726
77085/106726
cannot identify image file '../data/img/106897.jpg'
77214/106726
77342/106726
[Errno 2] No such file or directory: '../data/img/81377.jpg'
77471/106726
77599/106726
77727/106726
77855/106726
77983/106726
cannot identify image file '../data/img/41432.jpg'
78112/106726
78240/106726
[Errno 2] No such file or directory: '../data/img/58172.jpg'
78369/106726
78497/106726
78625/106726
78753/106726
[Errno 2] No such file or dire

105966/106726
[Errno 2] No such file or directory: '../data/img/32566.jpg'
106095/106726
cannot identify image file '../data/img/41867.jpg'
[Errno 2] No such file or directory: '../data/img/122425.jpg'
106225/106726
106353/106726
[Errno 2] No such file or directory: '../data/img/141011.jpg'
106482/106726
cannot identify image file '../data/img/81105.jpg'
106611/106726


In [7]:
import cPickle as pickle

# Persist the training / validation pair lists. The files are opened with
# `with` so each handle is flushed and closed deterministically instead of
# being left to garbage collection.
with open("./data/train.list", 'wb') as train_fout:
    pickle.dump(train_list, train_fout)
with open("./data/valid.list", 'wb') as valid_fout:
    pickle.dump(valid_list, valid_fout)