In [1]:
import pandas as pd
import numpy as np
from PIL import Image
from keras.preprocessing import image
import os, random
import cPickle as pickle
from tqdm import tqdm

Using TensorFlow backend.


In [12]:
# Scan the image directory once: keep every readable image whose two sides are
# both at least MIN_SIDE pixels, and average the per-image mean of channels
# 0, 1 and 2 (stored as R, G, B) over the kept images.
MIN_SIDE = 256

VALID_IMGID_SET = set()
R, G, B = 0, 0, 0
skipped_unreadable = 0

img_path = "../dataset/AVA/imgs/"
for imgfile in tqdm(os.listdir(img_path)):
    try:
        im = image.load_img(os.path.join(img_path, imgfile))
        if im.size[0] < MIN_SIDE or im.size[1] < MIN_SIDE:
            continue
        x = image.img_to_array(im)
        pixes = x.shape[0] * x.shape[1] * 1.0
        r_mean = x[:, :, 0].sum() / pixes
        g_mean = x[:, :, 1].sum() / pixes
        b_mean = x[:, :, 2].sum() / pixes
    except Exception:
        # Best effort: a file that cannot be loaded or converted is skipped.
        # `Exception` (not a bare except) lets Ctrl-C still stop the scan.
        skipped_unreadable += 1
        continue
    # Accumulate only after all three channel means were computed, so a
    # failure part-way can never leave partial sums for an uncounted image.
    R += r_mean
    G += g_mean
    B += b_mean
    VALID_IMGID_SET.add(os.path.splitext(imgfile)[0])

if not VALID_IMGID_SET:
    raise ValueError("No valid images found under %s" % img_path)

R /= len(VALID_IMGID_SET)
G /= len(VALID_IMGID_SET)
B /= len(VALID_IMGID_SET)

# Both files are pickles despite the .h5 extension; the paths are kept as-is
# because other cells read them back by these names.
with open("./data/imgids.h5", 'wb') as fout:
    pickle.dump(VALID_IMGID_SET, fout)
with open("./data/RGB.h5", 'wb') as fout:
    pickle.dump((R, G, B), fout)

print("Number of valid images: %d" % len(VALID_IMGID_SET))
print("Skipped unreadable files: %d" % skipped_unreadable)
print("Average RGB: (%.2f, %.2f, %.2f)" % (R, G, B))

100%|██████████| 254782/254782 [29:17<00:00, 144.96it/s]


Number of valid images: 252810
Average RGB: (107.75, 99.35, 90.52)


In [2]:
# Reload the set of usable image ids written by the scan cell.
# NOTE(review): unpickling can execute arbitrary code; only load a file that
# this notebook produced itself.
with open("./data/imgids.h5", 'rb') as fin:
    VALID_IMGID_SET = pickle.load(fin)

ava_file = "../dataset/AVA/AVA.txt"
# Margins handed to classify() when labelling train / test images.
train_delta = 0
test_delta = 0
# An image goes to the test split when the random draw is <= p.
p = 0.1
# Images whose total vote count is <= this value are dropped.
min_voting_num = 100

def classify(rating, mean, delta):
    """Turn a rating into a binary label relative to a band around `mean`.

    Returns 1 when `rating` is strictly above `mean + delta`, 0 when it is
    strictly below `mean - delta`, and None otherwise (the rating falls
    inside the band, bounds included).
    """
    upper_bound = mean + delta
    lower_bound = mean - delta
    # The upper bound is tested first, so it wins if the bounds ever cross.
    if rating > upper_bound:
        return 1
    if rating < lower_bound:
        return 0
    return None

# Build the train/test lists from the AVA ratings file.
# Each kept line contributes (imgid, mean rating, label); the label comes from
# classify() against RATING_MIDPOINT with the split-specific delta.

# Fixed seed so the train/test assignment is identical on every run. A private
# generator is used so the global `random` state is left untouched.
SPLIT_SEED = 0
split_rng = random.Random(SPLIT_SEED)

RATING_MIDPOINT = 5
# Score attached to each of the ten vote-count columns (1.0 .. 10.0); built
# once instead of once per line.
values = np.arange(1, 11.)

high_quality_trainset = []
low_quality_trainset = []
high_quality_testset = []
low_quality_testset = []
# label -> destination list, one mapping per split.
train_buckets = {1: high_quality_trainset, 0: low_quality_trainset}
test_buckets = {1: high_quality_testset, 0: low_quality_testset}

with open(ava_file, 'r') as fin:
    for line in fin:
        X = line.strip().split()
        if len(X) < 12:
            # Blank or truncated line: fields 1 and 2..11 are not all present.
            continue
        imgid = X[1]
        ratings = np.array(X[2:12], dtype=int)
        voting_num = ratings.sum()
        if imgid not in VALID_IMGID_SET or voting_num <= min_voting_num:
            continue
        # Computed only for kept rows, so a zero vote count never divides.
        rating_mean = (ratings * values).sum() / voting_num
        if split_rng.random() > p:
            delta, buckets = train_delta, train_buckets
        else:
            delta, buckets = test_delta, test_buckets
        label = classify(rating_mean, RATING_MIDPOINT, delta)
        if label is not None:
            buckets[label].append((imgid, rating_mean, label))

# Classes are not balanced here: every labelled image of both classes is kept.
trainset = high_quality_trainset + low_quality_trainset
testset = high_quality_testset + low_quality_testset

print("Size of train dataset: %d" % len(trainset))
print("Size of test dataset: %d" % len(testset))

trainset = pd.DataFrame(trainset, columns=["imgid", "rating", "label"])
testset = pd.DataFrame(testset, columns=["imgid", "rating", "label"])

trainset.to_csv("./data/train.lst", index=False)
testset.to_csv("./data/test.lst", index=False)

Size of train dataset: 225952
Size of test dataset: 25179


In [None]:
# Reload the saved lists. The builtin types are what the np.str / np.float /
# np.int aliases pointed to; those aliases no longer exist in current NumPy.
# Reading imgid as str keeps the ids as text instead of an inferred number.
LIST_DTYPES = {"imgid": str, "rating": float, "label": int}
trainset = pd.read_csv("./data/train.lst", dtype=LIST_DTYPES)
testset = pd.read_csv("./data/test.lst", dtype=LIST_DTYPES)

In [None]:
%matplotlib inline
from PIL import Image
import matplotlib.pyplot as plt
import random

img_path = "../dataset/AVA/imgs/"
def imgPlot(imgid, label):
    """Show the image `<img_path><imgid>.jpg` in a new figure.

    The figure is titled `<imgid>_<label>` and drawn without axes.
    """
    plt.figure()

    plt.title("%s_%s"%(imgid, label))
    # The context manager closes the image file once it has been drawn,
    # instead of leaving one open handle per previewed image.
    with Image.open(img_path+"%s.jpg"%imgid) as img:
        plt.imshow(img)
    plt.axis('off')

    plt.show()
    
# Preview random training images per class: label 1 first, then label 0.
N_PREVIEW = 10
for preview_label in (1, 0):
    class_rows = trainset[trainset.label == preview_label]
    # sample() raises when asked for more rows than exist, so cap the request.
    n_show = min(N_PREVIEW, len(class_rows))
    for imgid, rating, label in class_rows.sample(n_show).values:
        imgPlot(imgid, label)