In [25]:
import glob
import os
import shutil
import sys
import time
import urllib.error
import urllib.request

import cv2
import numpy as np
from bs4 import BeautifulSoup
from PIL import Image, ImageFilter

### Scrape image URLs from Pinterest

In [44]:
FILE_NAME = './pinterest-explorer2.html'


def extract_3x_url(srcset):
    """Return the URL that follows the '2x, ' entry of a srcset string.

    The ' 3x' descriptor is stripped from the result. Returns None when the
    string has no '2x, ' separator, so the caller can skip the item instead
    of crashing on an IndexError.
    """
    parts = srcset.split('2x, ')
    if len(parts) < 2:
        return None
    return parts[1].replace(' 3x', '')


# Parse a locally saved copy of the page; nothing is fetched from the network
# here. The context manager closes the file even if reading or parsing raises.
with open(FILE_NAME, 'r') as f:
    soup = BeautifulSoup(f.read(), 'html5lib')

# find_all is the current bs4 spelling; findAll is a legacy alias.
items = soup.find_all('div', {"class": "Grid__Item"})

urls = []
for item in items:
    # Use the first <img> of the grid item. Items without an image or without
    # a srcset attribute are skipped rather than aborting the whole scrape.
    img = item.find('img')
    srcset = img.get('srcset') if img is not None else None
    src = extract_3x_url(srcset) if srcset else None
    if src is None:
        continue
    urls.append(src)
print(len(urls))

1000


### Download images

In [50]:
# Download every scraped URL into IMAGE_DIR, one request per second.
IMAGE_DIR = './images/'

# urlretrieve does not create missing directories, so make sure the target
# exists before the first download.
os.makedirs(IMAGE_DIR, exist_ok=True)

failed_urls = []
for i, url in enumerate(urls):
    # The last path segment of the URL is used as the local file name.
    name = url.split('/')[-1]
    try:
        urllib.request.urlretrieve(url, IMAGE_DIR + name)
    except (urllib.error.URLError, ValueError) as err:
        # URLError covers HTTP and network failures; ValueError is raised for
        # malformed URLs. Record the failure and keep going so one bad URL
        # does not abort the remaining downloads.
        failed_urls.append(url)
        print('failed: ' + url + ' (' + str(err) + ')')
    # Pause between requests to avoid hammering the server.
    time.sleep(1.0)

    if i % 100 == 0:
        print(str(i) + '/' + str(len(urls)))

0/1000
100/1000
200/1000
300/1000
400/1000
500/1000
600/1000
700/1000
800/1000
900/1000


### Resize images

In [69]:
IN_DIR     = './images/'
OUT_DIR    = './resized/'
MIN_LENGTH = 256   # side length of every square crop, and minimum accepted input side
DIFF       = 20    # stride in pixels between consecutive crop windows


def sliding_crop_boxes(width, height, size, step):
    """Return (left, upper, right, lower) boxes of size x size crops.

    The window slides along the longer axis in increments of `step`, starting
    at 0. The end point is inclusive, so an image that is exactly
    `size` x `size` yields one box, and a window ending flush with the far
    edge is kept.

    Assumes the shorter side of the image equals `size`.
    """
    if width >= height:
        return [(left, 0, left + size, size)
                for left in range(0, width - size + 1, step)]
    return [(0, top, size, top + size)
            for top in range(0, height - size + 1, step)]


os.makedirs(OUT_DIR, exist_ok=True)

files = glob.glob(os.path.join(IN_DIR, '*'))

for f in files:
    # splitext keeps dots inside the stem intact; file_extension includes the
    # leading '.'.
    file_name, file_extension = os.path.splitext(os.path.basename(f))

    with Image.open(f) as image:
        width, height = image.size

        # Skip images that would have to be upscaled.
        if width < MIN_LENGTH or height < MIN_LENGTH:
            continue

        # Scale so that the SHORTER side becomes MIN_LENGTH, keeping the
        # aspect ratio. Pillow sizes are (width, height).
        if width >= height:
            new_size = (int(MIN_LENGTH * width / height), MIN_LENGTH)
        else:
            new_size = (MIN_LENGTH, int(MIN_LENGTH * height / width))
        resized_image = image.resize(new_size)

    new_width, new_height = resized_image.size
    boxes = sliding_crop_boxes(new_width, new_height, MIN_LENGTH, DIFF)

    for index, box in enumerate(boxes):
        out_path = os.path.join(
            OUT_DIR, file_name + '_' + str(index) + file_extension)
        resized_image.crop(box).filter(ImageFilter.SHARPEN).save(out_path)

### Find edges

In [15]:
IN_DIR  = './resized/'
OUT_DIR = './line/'

# Hysteresis thresholds passed to cv2.Canny as threshold1 and threshold2.
CANNY_LOW  = 200
CANNY_HIGH = 500

os.makedirs(OUT_DIR, exist_ok=True)

files = glob.glob(os.path.join(IN_DIR, '*'))

for f in files:
    # The edge map is written under the same file name as its source image.
    file_path = os.path.basename(f)

    # cv2.imread returns None instead of raising when a file cannot be
    # decoded; skip such files so cv2.Canny does not abort the whole run.
    image = cv2.imread(f, cv2.IMREAD_COLOR)
    if image is None:
        print('skipped (unreadable): ' + f)
        continue

    result = cv2.Canny(image, CANNY_LOW, CANNY_HIGH)

    out_path = os.path.join(OUT_DIR, file_path)
    cv2.imwrite(out_path, result)

### Create Dataset

In [29]:
A_SRC_DIR = './line/'
B_SRC_DIR = './resized/'

A_DST_DIR = './A/'
B_DST_DIR = './B/'


def split_for(draw):
    """Map a draw from [0, 1) to a split name.

    Draws in [0, 0.1) go to 'test', [0.1, 0.2) to 'val', and everything else
    to 'train', i.e. roughly a 10/10/80 split for uniform draws.
    """
    bucket = int(draw * 10)
    if bucket == 0:
        return 'test'
    if bucket == 1:
        return 'val'
    return 'train'


# Create <root>/{train,test,val} for both destination roots.
for root_dir in [A_DST_DIR, B_DST_DIR]:
    for sub_dir in ['train', 'test', 'val']:
        os.makedirs(os.path.join(root_dir, sub_dir), exist_ok=True)

# glob returns files in OS-dependent order; sorting makes the seeded split
# below assign the same file to the same directory on every run.
files = sorted(glob.glob(os.path.join(A_SRC_DIR, '*')))

np.random.seed(0)

# NOTE(review): the split is drawn per file. If several files are overlapping
# crops of one source image, they can land in different splits - confirm that
# this is acceptable for the intended evaluation.
for a_src in files:
    file_path = os.path.basename(a_src)

    # The B image is expected under the same file name in B_SRC_DIR.
    b_src = os.path.join(B_SRC_DIR, file_path)

    # np.random.rand() returns a plain float; converting the 1-element array
    # from rand(1) with int() is deprecated in recent NumPy.
    directory = split_for(np.random.rand())

    shutil.copy2(a_src, os.path.join(A_DST_DIR, directory, file_path))
    shutil.copy2(b_src, os.path.join(B_DST_DIR, directory, file_path))