预处理：生成pair-wise的训练数据对

1. 读取img_attr.csv文件，按天或月随机生成图片pair对并标记：
    1. 若图片A的点赞数高于图片B的点赞数，则输出1；反之为0
    2. 若点赞数相同，则比较被踩数：图片A的被踩数低于图片B的被踩数，则输出1；反之为0
    3. 若被踩数相同，则比较浏览次数：图片A的浏览次数高于图片B的浏览次数，则输出1；反之为0
2. 用InceptionV3网络对图像pair对进行预处理，持久化到硬盘上

In [None]:
# Filesystem locations: output folder, attribute table, raw image folder.
save_path = "./data/"
img_attr_file = "../data/img_attr.csv"
img_path = "../data/img/"

# Number of random opponents drawn for every image when building pairs.
pair_num = 30
# Batch size handed to model.predict during feature extraction.
batch_size = 64

In [None]:
# define inception-V3 model
from keras.models import Model
from keras.applications.inception_v3 import InceptionV3
from keras.layers import GlobalAveragePooling2D
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
import numpy as np

# Feature extractor: InceptionV3 without its classification head
# (include_top=False), followed by a global spatial average pooling layer.
base_model = InceptionV3(include_top=False, weights='imagenet')
x = GlobalAveragePooling2D()(base_model.output)
model = Model(inputs=base_model.input, outputs=x)

def img2array(imgid_list):
    """Load images by id and extract one feature vector per image.

    Every id is resolved to ``img_path + "<imgid>.jpg"``, loaded at
    299x299, passed through ``preprocess_input`` and then through the
    module-level ``model`` in batches of ``batch_size``.

    Args:
        imgid_list: iterable of image ids.

    Returns:
        A tuple ``(imgid2indice, feature_array)``. ``imgid2indice`` maps each
        successfully loaded id to its row index in ``feature_array``. Ids
        whose image could not be loaded are reported on stdout and left out
        of the mapping. When nothing could be loaded, ``feature_array`` is an
        empty array and ``model`` is not invoked.
    """
    imgid2indice = {}
    img_array = []
    for imgid in imgid_list:
        if imgid in imgid2indice:
            # A repeated id must not be appended again: that would grow
            # img_array without growing the mapping and shift the row index
            # of every id registered afterwards.
            continue
        try:
            filename = img_path + "%s.jpg" % imgid
            img = image.load_img(filename, target_size=(299, 299))
            x = preprocess_input(image.img_to_array(img))
        except Exception as e:
            # Best effort: report the unreadable image and keep going.
            print(str(e))
            continue
        # Index by the number of arrays collected so the mapping always
        # points at the row this image will occupy.
        imgid2indice[imgid] = len(img_array)
        img_array.append(x)
    if not img_array:
        # Nothing to predict on; callers only index rows through the
        # (empty) mapping, so an empty array is a safe placeholder.
        return imgid2indice, np.empty((0, 0))
    feature_array = model.predict(np.array(img_array), batch_size=batch_size)
    return imgid2indice, feature_array

In [None]:
# load img_attr.csv

def timestamp2key(timestamp):
    """Return the first eight characters of *timestamp* as the grouping key.

    The month is used as the key: a timestamp starting with "2017/05/"
    yields the key "2017/05/".
    """
    key_length = 8
    return timestamp[:key_length]

# Group per-image statistics by month key.
# Rows are tab separated; rows with fewer than 7 columns are skipped.
# Columns read: 0 = image id, 1 = zan_num, 2 = cai_num, 3 = clk_num,
# 5 = timestamp (zan/cai/clk are presumably likes/dislikes/views, as
# described in the notebook header).
img_attr_dict = {}
with open(img_attr_file, 'r') as fin:
    for line in fin:
        fields = line.strip().split("\t")
        if len(fields) < 7:
            continue
        imgid = fields[0]
        zan_num, cai_num, clk_num = int(fields[1]), int(fields[2]), int(fields[3])
        key = timestamp2key(fields[5])
        # Keep a row only if its key is not before "2015", is not the month
        # "2017/05/", and the image has at least one zan or cai.
        if key >= "2015" and key != "2017/05/" and \
            (zan_num != 0 or cai_num != 0):
            img_attr_dict.setdefault(key, []).append((imgid, zan_num, cai_num, clk_num))

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import cPickle as pickle

def imgCmp(img_a, img_b):
    """Compare two image records by net score (zan_num - cai_num).

    Each record is a 4-tuple ``(imgid, zan_num, cai_num, clk_num)``; the id
    and clk_num fields do not take part in the comparison.

    Returns:
        1 when img_a has the higher net score, -1 when it has the lower
        one, and 0 when both scores are equal.
    """
    _, likes_a, dislikes_a, _ = img_a
    _, likes_b, dislikes_b, _ = img_b

    score_a = likes_a - dislikes_a
    score_b = likes_b - dislikes_b

    if score_a > score_b:
        return 1
    if score_a < score_b:
        return -1
    return 0

def testPlot(imgid_a, imgid_b, cmp_ret):
    """Display two images side by side to eyeball a comparison result.

    The figure is named after ``cmp_ret``; the left panel shows ``imgid_a``
    and the right panel ``imgid_b``, each titled with its id and drawn
    without axes. Images are read from ``img_path + "<imgid>.jpg"``.
    """
    plt.figure("%s"%cmp_ret)

    for position, imgid in enumerate((imgid_a, imgid_b), 1):
        plt.subplot(1, 2, position)
        plt.title("%s"%imgid)
        img = Image.open(img_path+"%s.jpg"%imgid)
        plt.imshow(img)
        plt.axis('off')

    plt.show()

# Build labelled feature pairs month by month.
# Every image is compared against up to pair_num randomly drawn images of the
# same month. A sample is (imgid_a, imgid_b, feature_a, feature_b, cmp_ret);
# ties (cmp_ret == 0) and images whose features could not be extracted are
# skipped.
pair_train = []
pair_valid = []
for key, img_attrs in sorted(img_attr_dict.items()):
    # The month 2017/04/ goes to the validation set, every other month to
    # the training set.
    pair_samples = pair_valid if key == "2017/04/" else pair_train
    print("current_datestamp=%s" % key)
    imgid_list = [img_attr[0] for img_attr in img_attrs]
    imgid2indice, feature_array = img2array(imgid_list)
    print("image_features extraction finished.")
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the draw for months with few images.
    sample_size = min(pair_num, len(img_attrs))
    for img_a in img_attrs:
        if img_a[0] not in imgid2indice:
            continue
        feature_a = feature_array[imgid2indice[img_a[0]]]
        for img_b in random.sample(img_attrs, sample_size):
            if img_b[0] not in imgid2indice:
                continue
            feature_b = feature_array[imgid2indice[img_b[0]]]
            cmp_ret = imgCmp(img_a, img_b)
            if cmp_ret != 0:
                pair_samples.append((img_a[0], img_b[0], feature_a, feature_b, cmp_ret))
    print("pair_samples construction finished")

# Persist both sets; the context managers make sure the files are flushed and
# closed. The third argument (protocol) is kept exactly as before.
with open(save_path + "sybj.pkl", "wb") as fout:
    pickle.dump(pair_train, fout, True)
with open(save_path + "sybj.valid.pkl", "wb") as fout:
    pickle.dump(pair_valid, fout, True)

In [None]:
# Reload previously pickled pair samples, closing the file once read.
# NOTE(review): no active code in this notebook writes "201704.pkl"; it is
# assumed to exist from an earlier run - confirm before executing this cell.
with open(save_path + "201704.pkl", "rb") as pkl_file:
    pair_samples = pickle.load(pkl_file)

In [None]:
# Number of samples currently held in pair_train (shown as the cell output).
len(pair_train)

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import cPickle as pickle
import numpy as np

def imgCmp(img_a, img_b):
    """Compare two image records by net score, returning a 0/1/2 class.

    Each record is a 4-tuple ``(imgid, zan_num, cai_num, clk_num)``; the net
    score is ``zan_num - cai_num``.

    Returns:
        2 when img_a has the higher net score, 0 when it has the lower one,
        and 1 when both scores are equal.
    """
    _, zan_a, cai_a, _ = img_a
    _, zan_b, cai_b, _ = img_b

    net_a = zan_a - cai_a
    net_b = zan_b - cai_b

    # Start from the tie value 1, add one if a wins, subtract one if a loses.
    return 1 + (net_a > net_b) - (net_a < net_b)

def testPlot(imgid_a, imgid_b, cmp_ret):
    """Plot the two compared images next to each other.

    The figure is named after ``cmp_ret``. Panel 1 shows ``imgid_a`` and
    panel 2 shows ``imgid_b``; each panel is titled with the image id, has
    its axes hidden, and reads its picture from ``img_path + "<imgid>.jpg"``.
    """
    plt.figure("%s"%cmp_ret)

    panels = ((1, imgid_a), (2, imgid_b))
    for slot, imgid in panels:
        plt.subplot(1, 2, slot)
        plt.title("%s"%imgid)
        img = Image.open(img_path+"%s.jpg"%imgid)
        plt.imshow(img)
        plt.axis('off')

    plt.show()

# Visual spot check: for the first month key only, take one image (the first
# after shuffling) and plot it against up to pair_num randomly drawn images
# of the same month, printing each comparison result.
for key, img_attrs in sorted(img_attr_dict.items()):
    random.shuffle(img_attrs)  # shuffles the list stored in img_attr_dict in place
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the draw for months with few images.
    sample_size = min(pair_num, len(img_attrs))
    for img_a in img_attrs:
        for img_b in random.sample(img_attrs, sample_size):
            cmp_ret = imgCmp(img_a, img_b)
            print(cmp_ret)
            testPlot(int(img_a[0]), int(img_b[0]), cmp_ret)
        break  # only the first image of the month
    break  # only the first month

In [None]:
# Print each month key followed by the number of image records kept for it.
for k, v in img_attr_dict.items():
    print("%s %s" % (k, len(v)))