In [None]:
import numpy as np

def get_score(zan_num, cai_num, clk_num):
    """Combine the three per-image counters into a single score.

    The score is ``max(zan_num - 1, 0) - cai_num + log(clk_num + 1)``: the
    first counter is reduced by one and floored at zero, the second is
    subtracted as-is, and the third contributes logarithmically.

    Args:
        zan_num: first counter (presumably the number of likes -- confirm
            against the data source).
        cai_num: second counter (presumably the number of dislikes -- confirm).
        clk_num: third counter (presumably the number of clicks -- confirm);
            must be greater than -1 for the logarithm to be finite.

    Returns:
        The score as a NumPy floating-point value.
    """
    discounted_zan = zan_num - 1
    if discounted_zan < 0:
        discounted_zan = 0
    click_term = np.log(clk_num + 1)
    return discounted_zan - cai_num + click_term

# Build {date: [(imgid, score), ...]} from the tab-separated attribute file,
# keeping only rows whose date falls inside the window below.
img_score_by_date = {}
with open("../data/img_attr.csv", 'r') as fin:
    for line in fin:
        fields = line.strip().split("\t")
        # Only rows with exactly seven tab-separated columns are used.
        if len(fields) == 7:
            imgid, date = fields[0], fields[5]
            zan_num, cai_num, clk_num = int(fields[1]), int(fields[2]), int(fields[3])
            score = get_score(zan_num, cai_num, clk_num)
            # The date is compared as a plain string, so this relies on a
            # zero-padded year-first layout (presumably "YYYY/MM/DD" -- confirm).
            if "2015/01" <= date < "2017/05":
                img_score_by_date.setdefault(date, []).append((imgid, score))

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import os

# Pick one date at random and look at the score distribution of its images.
# list(...) keeps random.choice working when dict.keys() returns a view
# (Python 3) instead of a list (Python 2).
date = random.choice(list(img_score_by_date.keys()))
# To inspect a specific day instead, pin it here, e.g.:
#date = "2015/10/12"
#date = "2016/03/07"
print(date)

demo_imgs = img_score_by_date[date]

# Pass a concrete list of scores; a lazy map object (Python 3) is not a
# sequence that hist can consume reliably.
plt.hist([pair[1] for pair in demo_imgs], bins=30)

img_path = "../data/img/"
def testPlot(imgid, score):
    """Show one image in its own figure, titled "<imgid>_<score>".

    The image is read from ``img_path + "<imgid>.jpg"`` and drawn with the
    axes hidden.
    """
    jpg_file = img_path + "%s.jpg" % imgid
    caption = "%s_%s" % (imgid, score)

    plt.figure()
    plt.title(caption)
    plt.imshow(Image.open(jpg_file))
    plt.axis('off')
    plt.show()
    
# Walk the chosen date's images from highest to lowest score and display
# every one whose JPEG is present under img_path.
ranked_imgs = sorted(demo_imgs, key=lambda entry: entry[1], reverse=True)
for imgid, score in ranked_imgs:
    if not os.path.exists(img_path + "%s.jpg" % imgid):
        continue
    testPlot(imgid, score)

In [None]:
# prepare train and valid dataset
import random

def split_dataset(dataset, p):
    """Shuffle ``dataset`` in place and split it into train and validation parts.

    Args:
        dataset: list of samples; note that it is reordered in place by the
            shuffle.
        p: fraction of the samples to hold out for validation.

    Returns:
        ``(train_dataset, valid_dataset)``.  The validation part holds the
        last ``int(len(dataset) * p)`` shuffled samples (clamped to the range
        ``0..len(dataset)``); the training part holds the rest.
    """
    random.shuffle(dataset)
    valid_size = min(max(int(len(dataset) * p), 0), len(dataset))
    # Split on an explicit index rather than a negative slice: when
    # valid_size is 0, dataset[:-0] is empty and dataset[-0:] is the whole
    # list, which would send every sample to validation and none to training.
    split_at = len(dataset) - valid_size
    return dataset[:split_at], dataset[split_at:]

def _make_cmp_pairs(anchor_imgs, candidate_imgs, cmp_k, delta):
    """Pair every anchor image with ``cmp_k`` randomly sampled candidates.

    Args:
        anchor_imgs: list of ``(imgid, score)`` tuples to build pairs for.
        candidate_imgs: list of ``(imgid, score)`` tuples to sample from.
        cmp_k: number of candidates drawn (without replacement) per anchor.
        delta: pairs whose scores differ by less than this are dropped.

    Returns:
        A list of ``(img_a, s_a, img_b, s_b, cmp_ret)`` tuples, where
        ``cmp_ret`` is 1 when ``s_a > s_b`` and 0 otherwise.
    """
    pairs = []
    for img_a, s_a in anchor_imgs:
        for img_b, s_b in random.sample(candidate_imgs, cmp_k):
            # Near-ties carry no reliable preference signal; this also drops
            # an anchor sampled against itself whenever delta > 0.
            if abs(s_a - s_b) < delta:
                continue
            pairs.append((img_a, s_a, img_b, s_b, 1 if s_a > s_b else 0))
    return pairs

train_list = []
valid_list = []
p = 0.1 # validation data ratio
k = 10 # compare each image with k sampled training images
delta = 0.5 # if abs(s_a - s_b) < delta, drop this comparison pair
for date, imgs in img_score_by_date.items():
    # Pairs are only ever formed between images of the same date.
    train_imgs, valid_imgs = split_dataset(imgs, p)
    # min() rather than "cond and k or len(...)": the and/or idiom falls
    # through to len(train_imgs) when k is 0.
    cmp_k = min(k, len(train_imgs))

    # Validation anchors are compared against training images as well.
    train_list.extend(_make_cmp_pairs(train_imgs, train_imgs, cmp_k, delta))
    valid_list.extend(_make_cmp_pairs(valid_imgs, train_imgs, cmp_k, delta))

print("Length of Train List: %d"%len(train_list))
print("Length of Valid List: %d"%len(valid_list))

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random
import os

img_path = "../data/img/"
def cmpPlot(imgid_a, s_a, imgid_b, s_b):
    """Show two images side by side, each titled "<imgid>_<score>".

    Image A goes in the first and image B in the second cell of a single
    1x2 subplot grid; both are read from ``img_path + "<imgid>.jpg"`` and
    drawn with the axes hidden.
    """
    plt.figure()

    panels = ((imgid_a, s_a), (imgid_b, s_b))
    for slot, (imgid, score) in enumerate(panels, 1):
        plt.subplot(1, 2, slot)
        plt.title("%s_%s" % (imgid, score))
        plt.imshow(Image.open(img_path + "%s.jpg" % imgid))
        plt.axis('off')

    plt.show()
    
# Eyeball up to 30 randomly chosen training pairs.  The sample size is
# clamped because random.sample raises ValueError when asked for more items
# than the population holds.
sample_size = min(30, len(train_list))
for imgA, sA, imgB, sB, cmpret in random.sample(train_list, sample_size):
    # Only pairs with both JPEGs present under img_path can be drawn.
    if os.path.exists(img_path+"%s.jpg"%imgA) and os.path.exists(img_path+"%s.jpg"%imgB):
        cmpPlot(imgA, sA, imgB, sB)