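# NOTE: the following lines are GitHub page chrome captured along with the
# file; they are not part of the module and are commented out so it parses.
# Skip to content
# Go to file
# Cannot retrieve contributors at this time
# 71 lines (61 sloc) 2.3 KB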
import os
import json
import random
import requests
from PIL import Image
from glob import glob
from text.ocr import ocr
from faces import extract_faces
from objects import extract_objects
from images.transform import resize_to_limit
# Request headers sent with every image download (a desktop Chromium
# User-Agent string).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Ubuntu Chromium/53.0.2785.143 '
        'Chrome/53.0.2785.143 Safari/537.36'
    ),
}

# Directory that downloaded wikicommons images are stored in.
IMAGES_DIR = 'assets/commons'
def sample_wikicommons_urls(n):
    """Sample wikicommons image urls.

    Picks `n` url-list files (with replacement) from ``assets/commons_urls``
    and draws one random url from each picked file.

    Args:
        n: number of urls to draw.

    Returns:
        A list of url strings. It is empty when no url-list files exist and
        holds fewer than `n` entries if a picked file contains no urls.
    """
    pop = glob('assets/commons_urls/*')
    if not pop:
        # random.choice raises IndexError on an empty population
        return []

    subpops = [random.choice(pop) for _ in range(n)]
    sample = []
    for p in subpops:
        # `with` closes the handle; blank lines are not valid urls
        with open(p, 'r') as f:
            choices = [line.strip() for line in f if line.strip()]
        if choices:
            sample.append(random.choice(choices))
    return sample
def sample_wikicommons(n):
    """Pick up to `n` distinct paths at random from the downloaded images.

    When fewer than `n` files are present in IMAGES_DIR, every one of them
    is returned (in random order).
    """
    pattern = '{}/*'.format(IMAGES_DIR)
    available = glob(pattern)
    count = min(n, len(available))
    return random.sample(available, count)
def download_image(url, dir, overwrite=False, timeout=30):
    """Download an image into `dir`, reusing an existing file when possible.

    Args:
        url: address of the image; the text after its last ``/`` becomes the
            local file name.
        dir: directory the image is saved into.
        overwrite: when true, download again even if the file already exists.
        timeout: seconds handed to ``requests.get`` so a stalled server
            cannot block forever; pass None to wait indefinitely.

    Returns:
        The path of the local file, or None when the server did not answer
        with status 200 (a message is printed in that case).

    Raises:
        requests.RequestException: on connection errors or timeouts.
    """
    fname = url.split('/')[-1]
    path = os.path.join(dir, fname)
    if os.path.exists(path) and not overwrite:
        return path

    res = requests.get(url, stream=True, headers=HEADERS, timeout=timeout)
    try:
        if res.status_code == 200:
            with open(path, 'wb') as f:
                # stream in larger chunks than the response iterator's default
                for chunk in res.iter_content(chunk_size=8192):
                    f.write(chunk)
            return path
    finally:
        # a streamed response holds its connection until closed
        res.close()

    # best effort: report the failure and let the caller skip this url
    print('failed to download:', url)
    return None
def fetch_sample(n):
    """Fetch, download, and process a sample of wikicommons image urls.

    For each sampled url ending in .jpg, .jpeg or .png the image is
    downloaded into IMAGES_DIR, shrunk with `resize_to_limit`, and run
    through `ocr`; non-empty OCR output is written as JSON to
    ``data/words/<image file name>.json``.

    Args:
        n: number of urls to sample.
    """
    for url in sample_wikicommons_urls(n):
        # str.endswith accepts a tuple of suffixes
        if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue

        path = download_image(url, IMAGES_DIR)
        if not path:
            continue

        try:
            # so we don't keep massive images
            # NOTE(review): the argument to resize_to_limit was lost from this
            # line; assumes it takes and returns a PIL image and that the
            # result is meant to replace the file on disk - confirm against
            # images.transform.
            im = resize_to_limit(Image.open(path), (800, 800))
            im.save(path)

            words = ocr(path)
            if words:
                fname = os.path.basename(path)
                with open('data/words/{}.json'.format(fname), 'w') as f:
                    json.dump(words, f)
        except OSError:
            print('unable to open:', url)
# (GitHub page chrome, not part of the module:) You can’t perform that action at this time.