In [0]:
import os
import re
import requests
import random
import json
import glob
import shutil
import multiprocessing as mp
from PIL import Image
from tqdm import tqdm

random.seed(2020)

output_size = 512
num_workers = 8

KEYS = ['PAIR', 'TOPP', 'BOTT', 'LEFT', 'BACK', 'RGHT', 'FRNT']
# only download 1 key at a time (because colab can't handle too large files)
KEYS_DOWNLOAD = KEYS[6]

In [0]:
# create folder holding downloaded items
os.makedirs(KEYS_DOWNLOAD.lower(), exist_ok=True)

# load dataset
if not os.path.exists('dataset.json'):
    shutil.copy('drive/My Drive/dataset.json', '.')

with open('dataset.json', 'r') as f:
    DATASET = json.load(f)

In [0]:
# request using proxies to prevent IP banned
PROXIES = None # list of proxies

def get_proxies():
    """Get all proxies from free-proxy-list.net"""
    res = requests.get('https://free-proxy-list.net/')
    pattern = r'<tr><td>([\d\.]+)<\/td><td>([\d]+)<\/td>'
    proxies = re.findall(pattern, res.text)
    proxies = ['http://{}:{}'.format(ip, port) for (ip, port) in proxies[:20]]
    return proxies

def _request(url):
    """request with proxies"""
    for proxy in random.sample(PROXIES, len(PROXIES)):
        try:
            res = requests.get(url, proxies={'http': proxy})
            return res
        except:
            pass
    # all proxies are dead
    return None

def request(url):
    """request and update new PROXIES"""
    global PROXIES
    if PROXIES is None:
        PROXIES = get_proxies()
    res = _request(url)
    if res:
        return res
    # refresh proxies
    PROXIES = get_proxies()
    res = _request(url)
    return res

In [0]:
def download(idx):
    """download using multiprocessing"""
    if idx >= len(DATASET):
        return
    img_id = list(DATASET.keys())[idx]
    img_data = DATASET[img_id]

    if KEYS_DOWNLOAD not in img_data['images']:
        return
    link = img_data['images'][KEYS_DOWNLOAD]
    output = os.path.join(KEYS_DOWNLOAD.lower(), '{}-{}.png'.format(img_id, KEYS_DOWNLOAD.lower()))
    if not os.path.exists(output):
        # download
        res = request(link)
        with open(output, 'wb') as f:
            f.write(res.content)


In [45]:
# download using multiprocessing (tested on colab)
pool = mp.Pool(num_workers)
for i in tqdm(range((len(list(DATASET.keys())) + num_workers)//num_workers)):
    args = [num_workers*i + j for j in range(num_workers)]
    pool.map(download, args)

100%|██████████| 8995/8995 [43:11<00:00,  3.47it/s]


In [46]:
# delete broken images
images = glob.glob('*/*.png')
for img in tqdm(images):
    try:
        Image.open(img)
    except:
        print('{} broken'.format(img))
        os.remove(img)

100%|██████████| 71488/71488 [01:41<00:00, 702.12it/s]


In [47]:
images = glob.glob('*/*.png')
parts = {}
cur_key = 0
for img in tqdm(images):
    if cur_key == 0:
        cur_key = 1
        parts[cur_key] = {'size':0, 'files':[]}
    if parts[cur_key]['size'] >= 1024*1024*1024:
        cur_key += 1
        parts[cur_key] = {'size':0, 'files':[]}
    parts[cur_key]['size'] += os.path.getsize(img)
    parts[cur_key]['files'].append(img)

100%|██████████| 71488/71488 [00:00<00:00, 211885.69it/s]


In [0]:
# zipping using multiprocessing
def zipping(key):
    if key not in parts.keys():
        return
    output_folder = '{}-part{:02}'.format(KEYS_DOWNLOAD.lower(), key)
    os.makedirs(output_folder)
    for img in parts[key]['files']:
        shutil.copy(img, output_folder)
    shutil.make_archive(output_folder, format='zip', root_dir=output_folder)
    shutil.rmtree(output_folder)

In [0]:
# moving files using multiprocessing
root = '/content/drive/My Drive/uit-shoesgan'

images = glob.glob('*/*.png')
images_folder = os.path.join(root, 'images', KEYS_DOWNLOAD.lower())
os.makedirs(images_folder, exist_ok=True)
def moving_images(idx):
    if idx >= len(images):
        return
    shutil.move(images[idx], images_folder)

# move zip files into drive
all_zips = glob.glob('*zip')
zips_folder = os.path.join(root, 'zips', KEYS_DOWNLOAD.lower())
os.makedirs(zips_folder, exist_ok=True)
def moving_zips(idx):
    if idx >= len(all_zips):
        return
    shutil.move(all_zips[idx], zips_folder)

In [50]:
pool = mp.Pool(num_workers)
for i in tqdm(range((len(parts) + num_workers)//num_workers)):
    args = [num_workers*i + j for j in range(num_workers)]
    pool.map(zipping, args)

100%|██████████| 2/2 [08:31<00:00, 255.96s/it]


In [51]:
pool = mp.Pool(num_workers)
for i in tqdm(range((len(images) + num_workers)//num_workers)):
    args = [num_workers*i + j for j in range(num_workers)]
    pool.map(moving_images, args)
print('\nImages: {}, {}'.format(len(images), len(os.listdir(images_folder))))

100%|██████████| 8937/8937 [1:12:04<00:00,  2.07it/s]



Images: 71488, 71488


In [52]:
all_zips = glob.glob('*zip')
pool = mp.Pool(num_workers)
for i in tqdm(range((len(all_zips) + num_workers)//num_workers)):
    args = [num_workers*i + j for j in range(num_workers)]
    pool.map(moving_zips, args)
print('\nZips: {}, {}'.format(len(all_zips), len(os.listdir(zips_folder))))

100%|██████████| 2/2 [02:00<00:00, 60.47s/it]


Zips: 9, 9





In [53]:
# text file to test images are all synced 
# because googlde drive take time to transfer all the images into specific folder
done_file = '{}-done.txt'.format(KEYS_DOWNLOAD.lower())
with open(done_file, 'w') as f:
    f.write(':)')
shutil.move(done_file, root)

'/content/drive/My Drive/uit-shoesgan/frnt-done.txt'

In [54]:
sorted(glob.glob('/content/drive/My Drive/uit-shoesgan/zips/*/*zip'))

['/content/drive/My Drive/uit-shoesgan/zips/back/back-part01.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part02.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part03.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part04.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part05.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part06.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part07.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/back/back-part08.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part01.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part02.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part03.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part04.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part05.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part06.zip',
 '/content/drive/My Drive/uit-shoesgan/zips/bott/bott-part07.z