In [39]:
from bs４ import BeautifulSoup
import urllib.request
import time
import glob
import os
import sys
from PIL import Image, ImageFilter
import cv2
import numpy as np
import shutil

### Scrape image urls from Pinterest

In [48]:
FILE_NAME = './pinterest-explorer.html'

f = open(FILE_NAME, 'r')
soup = BeautifulSoup(f.read(), 'html5lib')
f.close()

items = soup.findAll('div', {"class":"Grid__Item"})

urls = []
for item in items:
    srcset = item.findAll('img')[0].attrs['srcset']
    src = srcset.split('2x, ')[1]
    src = src.replace(' 3x', '')
    urls.append(src)
    
urls = list(set(urls))
print(len(urls))

491


### Download images

In [50]:
DEST_DIR = './images/'

for i, url in enumerate(urls):
    name = url.split('/')[-1]
    if not os.path.exists(DEST_DIR + name):
        try:
            res = urllib.request.urlretrieve(url, DEST_DIR + name)
        except UnicodeEncodeError:
            continue

    time.sleep(0.4)
    
    if i % 100 == 0:
        print(str(i) + '/' + str(len(urls)))

0/491
100/491
200/491
300/491
400/491


### Resize images

In [104]:
IN_DIR     = './images/'
OUT_DIR    = './resized/'
MIN_LENGTH = 256
DIFF       = 10

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)

files = glob.glob(IN_DIR + '/*')

for f in files:
    name_arr = f.rsplit('/', 1)[1].split('.')
    file_name, file_extension = name_arr[0], name_arr[1]

    image = Image.open(f)

    # Get dimensions
    width, height = image.size
    
    if width < MIN_LENGTH or height < MIN_LENGTH:
        continue
    
    index = 0
    if width >= height:
        resized_image = image.resize((int(MIN_LENGTH * width / height), MIN_LENGTH))
        width, height = resized_image.size

        for left in range(0, width - MIN_LENGTH, DIFF):
            right = left + MIN_LENGTH
            out_image = resized_image.crop((left, 0, right, MIN_LENGTH))
            
            out_path = OUT_DIR + file_name + '_' + str(index) + '.' + file_extension
            out_image.save(out_path)
            out_path = OUT_DIR + file_name + '_' + str(index) + '_T.' + file_extension
            out_image.transpose(Image.FLIP_LEFT_RIGHT).save(out_path)
            index += 1
        
    else:
        resized_image = image.resize((int(MIN_LENGTH * width / height), MIN_LENGTH))
        width, height = resized_image.size

        for top in range(0, height - MIN_LENGTH, DIFF):
            bottom = top + MIN_LENGTH
            out_image = resized_image.crop((0, top, MIN_LENGTH, bottom))
    
            out_path = OUT_DIR + file_name + '_' + str(index) + '.' + file_extension
            out_image.save(out_path)
            out_path = OUT_DIR + file_name + '_' + str(index) + '_T.' + file_extension
            out_image.transpose(Image.FLIP_LEFT_RIGHT).save(out_path)
            index += 1            

### Find edges

In [105]:
IN_DIR  = './resized/'
OUT_DIR = './line/'

if not os.path.exists(OUT_DIR):
    os.makedirs(OUT_DIR)
    
files = glob.glob(IN_DIR + '/*')

for f in files:
    file_path = f.rsplit('/', 1)[1]
    image = cv2.imread(f, 1)
    
    ind = np.random.rand()
    if ind < 0.3:
        result = cv2.GaussianBlur(image, ksize=(3, 3), sigmaX=3)
    elif ind < 0.6:
        result = cv2.GaussianBlur(image, ksize=(5, 5), sigmaX=3)
    elif ind < 0.9:
        result = cv2.blur(image, ksize=(3, 3))
    else:
        result = cv2.blur(image, ksize=(5,5))

    threshold1 = int(np.random.rand() * 30) + 1
    threshold2 = int(np.random.rand() * 70) + 130
    
    result = cv2.Canny(result, threshold1, threshold2)
    
    row,col = result.shape
    mean = 0
    var = np.random.rand() * 0.2
    sigma = var**0.5
    gauss = np.random.normal(mean,sigma,(row,col))
    gauss = gauss.reshape(row,col).astype(np.uint8)
    result += gauss
    
    out_path = OUT_DIR + file_path
    cv2.imwrite(out_path, result)

### Create Dataset

In [106]:
A_SRC_DIR = './line/'
B_SRC_DIR = './resized/'

A_DST_DIR = './A/'
B_DST_DIR = './B/'

for root_dir in [A_DST_DIR, B_DST_DIR]:
    
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
        
    for sub_dir in ['train', 'test', 'val']:
        
        if not os.path.exists(root_dir + sub_dir):
            os.makedirs(root_dir + sub_dir)
    
files = glob.glob(A_SRC_DIR + '/*')

np.random.seed(0)

for a_src in files:
    file_path = a_src.rsplit('/', 1)[1]
    
    b_src = B_SRC_DIR + file_path
    
    ind = int(np.random.rand(1) * 20)
    directory = ''
    if ind == 0:
        directory = 'test'
    elif ind == 1:
        directory = 'val'
    else:
        directory = 'train'

    shutil.copy2(a_src, A_DST_DIR + directory + '/' + file_path)
    shutil.copy2(b_src, B_DST_DIR + directory + '/' + file_path)

In [107]:
from pdb import set_trace as st
import os
import numpy as np
import cv2
import argparse

A_DIR = './A/'
B_DIR = './B/'
AB_DIR = './AB/'

splits = os.listdir(A_DIR)

for sp in splits:
    img_fold_A = os.path.join(A_DIR, sp)
    img_fold_B = os.path.join(B_DIR, sp)
    img_list = os.listdir(img_fold_A)

    num_imgs = len(img_list)
    print('split = %s, use %d/%d images' % (sp, num_imgs, len(img_list)))
    img_fold_AB = os.path.join(AB_DIR, sp)
    if not os.path.isdir(img_fold_AB):
        os.makedirs(img_fold_AB)
    print('split = %s, number of images = %d' % (sp, num_imgs))
    for n in range(num_imgs):
        name_A = img_list[n]
        path_A = os.path.join(img_fold_A, name_A)
        name_B = name_A
        path_B = os.path.join(img_fold_B, name_B)
        if os.path.isfile(path_A) and os.path.isfile(path_B):
            name_AB = name_A
            path_AB = os.path.join(img_fold_AB, name_AB)
            im_A = cv2.imread(path_A, 1)
            im_B = cv2.imread(path_B, 1)
            im_AB = np.concatenate([im_A, im_B], 1)
            cv2.imwrite(path_AB, im_AB)

split = test, use 117/117 images
split = test, number of images = 117
split = train, use 2006/2006 images
split = train, number of images = 2006
split = val, use 97/97 images
split = val, number of images = 97
