/
sampler.py
71 lines (61 loc) · 2.3 KB
/
sampler.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import json
import random
import requests
from PIL import Image
from glob import glob
from text.ocr import ocr
from faces import extract_faces
from objects import extract_objects
from images.transform import resize_to_limit
# HTTP headers sent with every image download request (see download_image);
# the User-Agent is a desktop Chromium browser string.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/53.0.2785.143 Chrome/53.0.2785.143 Safari/537.36'}
# directory that downloaded images are written to and later sampled from
IMAGES_DIR = 'assets/commons'
def sample_wikicommons_urls(n):
    """sample wikicommons image urls

    Picks ``n`` files at random (with replacement) from
    ``assets/commons_urls/`` and draws one random non-blank line from each.

    Args:
        n: number of url-list files to draw from.

    Returns:
        A list of url strings. It holds ``n`` entries unless the directory
        has no files (then it is empty) or a chosen file contains no
        non-blank lines (that draw is skipped).
    """
    pop = glob('assets/commons_urls/*')
    if not pop:
        # random.choice raises IndexError on an empty population
        return []

    subpops = [random.choice(pop) for _ in range(n)]
    sample = []
    for p in subpops:
        # context manager so the handle is closed after every read
        with open(p, 'r') as f:
            stripped = (line.strip() for line in f)
            # drop blank lines so an empty string is never returned as a url
            choices = [line for line in stripped if line]
        if choices:
            sample.append(random.choice(choices))
    return sample
def sample_wikicommons(n):
    """sample downloaded wikicommons images

    Returns up to ``n`` distinct paths chosen at random from the files in
    ``IMAGES_DIR``; when fewer than ``n`` files exist, all of them are
    returned (in random order).
    """
    pattern = '{}/*'.format(IMAGES_DIR)
    available = glob(pattern)
    # random.sample raises if asked for more items than exist, so cap it
    count = min(n, len(available))
    return random.sample(available, count)
def download_image(url, dir, overwrite=False):
    """download an image

    Args:
        url: address of the image; the text after the last ``/`` is used
            as the local file name.
        dir: directory the file is written into.
        overwrite: when false (the default), an already existing file is
            kept and its path returned without touching the network.

    Returns:
        The local path of the image, or ``None`` when the server answers
        with a status other than 200 (a message is printed, nothing is
        raised for that case).
    """
    fname = url.split('/')[-1]
    path = os.path.join(dir, fname)
    if os.path.exists(path) and not overwrite:
        return path

    # a timeout keeps one unresponsive server from hanging the caller forever
    res = requests.get(url, stream=True, headers=HEADERS, timeout=30)
    try:
        if res.status_code != 200:
            print('failed to download:', url)
            return None

        # Stream into a temporary name first: an interrupted transfer must
        # not leave a truncated file at `path`, because the exists() check
        # above would treat it as a finished download on the next call.
        tmp_path = path + '.part'
        with open(tmp_path, 'wb') as f:
            for chunk in res.iter_content(chunk_size=8192):
                f.write(chunk)
    finally:
        # release the streamed connection on every exit path
        res.close()

    os.replace(tmp_path, path)
    return path
def fetch_sample(n):
    """fetch, download, and process a sample of wikicommons image urls

    For every sampled url ending in .jpg, .jpeg or .png (case-insensitive)
    the image is downloaded into ``IMAGES_DIR``, shrunk in place via
    ``resize_to_limit(..., (800, 800))``, passed to ``extract_faces`` and
    ``extract_objects``, and run through ``ocr``. A non-empty OCR result is
    written as JSON to ``data/words/<image file name>.json``.

    Args:
        n: number of urls to sample.

    Returns:
        None. An ``OSError`` raised while processing an image is reported
        with a printed message and the loop moves on to the next url.
    """
    for url in sample_wikicommons_urls(n):
        if not url.lower().endswith(('.jpg', '.jpeg', '.png')):
            continue
        print(url)

        path = download_image(url, IMAGES_DIR)
        if not path:
            continue

        try:
            # so we don't keep massive images
            # (the with-block closes the source file handle once the
            # resized copy has been saved over it)
            with Image.open(path) as src:
                im = resize_to_limit(src, (800, 800))
                im.save(path)

            extract_faces(path)
            extract_objects(path)

            words = ocr(path)
            if words:
                # basename matches the os.path.join used to build the
                # path, whatever the platform separator is
                fname = os.path.basename(path)
                with open('data/words/{}.json'.format(fname), 'w') as f:
                    json.dump(words, f)
        except OSError:
            print('unable to open:', url)