import os
import json
import random
import requests
from PIL import Image
from glob import glob
from text.ocr import ocr
from faces import extract_faces
from objects import extract_objects
from images.transform import resize_to_limit
# Browser-like User-Agent header sent along with image download requests.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Ubuntu Chromium/53.0.2785.143 '
        'Chrome/53.0.2785.143 Safari/537.36'
    ),
}

# Directory that holds the downloaded wikicommons images.
IMAGES_DIR = 'assets/commons'
def sample_wikicommons_urls(n):
    """Sample wikicommons image urls.

    Picks ``n`` url-list files (with replacement) from
    ``assets/commons_urls/`` and draws one random url from each of them.
    Every non-blank line of a url-list file is treated as one url.

    Args:
        n: number of urls to draw.

    Returns:
        A list of url strings. It has ``n`` entries unless there are no
        url-list files, or a chosen file has no non-blank lines, in which
        case fewer (possibly zero) urls are returned.
    """
    pop = glob('assets/commons_urls/*')
    if not pop:
        # nothing to sample from; random.choice raises on an empty sequence
        return []

    subpops = [random.choice(pop) for _ in range(n)]
    sample = []
    for p in subpops:
        # context manager so the file handle is closed right away
        with open(p, 'r') as f:
            choices = [line.strip() for line in f if line.strip()]
        if choices:
            sample.append(random.choice(choices))
    return sample
def sample_wikicommons(n):
    """Return up to ``n`` randomly chosen paths of downloaded wikicommons images.

    The candidates are the entries directly inside ``IMAGES_DIR``; when fewer
    than ``n`` exist, all of them are returned (in random order).
    """
    pattern = '{}/*'.format(IMAGES_DIR)
    candidates = glob(pattern)
    # never ask random.sample for more items than are available
    count = len(candidates) if len(candidates) < n else n
    return random.sample(candidates, count)
def download_image(url, dir, overwrite=False, timeout=30):
    """Download an image.

    The local file name is the last ``/``-separated segment of ``url``.

    Args:
        url: address of the image to fetch.
        dir: directory to save the file into (it is not created here).
        overwrite: when False, a file that already exists at the target
            path is reused and no request is made.
        timeout: seconds to wait on the server, passed to ``requests.get``
            so a stalled connection cannot hang the caller forever.

    Returns:
        The path of the saved (or already present) file, or None when the
        server answered with a status other than 200.
    """
    fname = url.split('/')[-1]
    path = os.path.join(dir, fname)
    if os.path.exists(path) and not overwrite:
        return path

    res = requests.get(url, stream=True, headers=HEADERS, timeout=timeout)
    try:
        if res.status_code == 200:
            with open(path, 'wb') as f:
                # stream the body to disk instead of holding it in memory
                for chunk in res.iter_content(chunk_size=8192):
                    f.write(chunk)
            return path
    finally:
        # with stream=True the connection stays open until explicitly closed
        res.close()

    print('failed to download:', url)
    return None
def fetch_sample(n):
    """Fetch, download, and process a sample of wikicommons image urls.

    For each sampled url with a ``.jpg``/``.jpeg``/``.png`` extension the
    image is downloaded into ``IMAGES_DIR``, shrunk to fit within 800x800,
    and run through ``ocr``. Any recognised words are written as JSON to
    ``data/words/<image file name>.json``.

    Args:
        n: number of urls to sample (non-image urls are skipped, so fewer
            than ``n`` images may be processed).

    Returns:
        None. An ``OSError`` raised while processing one image is reported
        on stdout and does not stop the remaining images.
    """
    sample = sample_wikicommons_urls(n)
    for url in sample:
        if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        path = download_image(url, IMAGES_DIR)
        if not path:
            continue

        try:
            # so we don't keep massive images
            # NOTE(review): assumes resize_to_limit takes a PIL image plus a
            # (width, height) limit and returns the resized image, which is
            # then written back over the download -- confirm against
            # images.transform.
            im = Image.open(path)
            im = resize_to_limit(im, (800, 800))
            im.save(path)

            words = ocr(path)
            if words:
                # basename instead of split('/') so this also works with
                # the platform separator used by os.path.join
                fname = os.path.basename(path)
                with open('data/words/{}.json'.format(fname), 'w') as f:
                    json.dump(words, f)
        except OSError:
            print('unable to open:', url)