-
Notifications
You must be signed in to change notification settings - Fork 3
/
facescrub.py
73 lines (60 loc) · 3.18 KB
/
facescrub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import os
from vipy.util import readcsv, remkdir
from vipy.image import ImageDetection
import numpy as np
class FaceScrub(object):
def __init__(self, datadir):
self._datadir = datadir
self._dataset = [] # parsed and validated ImageDetections
def __repr__(self):
return str('<vipy.data.facescrub: %s>' % self._datadir)
def __len__(self):
return len(self._dataset)
def parse(self):
""" Return a list of ImageDetections for all URLs in facescrub """
imset = []
imdir = remkdir(os.path.join(self._datadir, 'images'))
csv_actors = readcsv(os.path.join(self._datadir, 'facescrub_actors.txt'), separator='\t')
for (subjectname, imageid, faceid, url, bbox, sha256) in csv_actors[1:]:
categoryname = subjectname.replace(' ', '_')
(xmin,ymin,xmax,ymax) = bbox.split(',')
imset.append(ImageDetection(url=url, filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)), category=categoryname, xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax, attributes={'GENDER':'male'}))
csv_actresses = readcsv(os.path.join(self._datadir, 'facescrub_actresses.txt'), separator='\t')
for (subjectname, imageid, faceid, url, bbox, sha256) in csv_actresses[1:]:
categoryname = subjectname.replace(' ', '_')
(xmin,ymin,xmax,ymax) = bbox.split(',')
imset.append(ImageDetection(url=url, filename=os.path.join(imdir, '%s_%s.jpg' % (categoryname, imageid)), category=categoryname, xmin=xmin, ymin=ymin, xmax=xmax, ymax=ymax, attributes={'GENDER':'female'}))
return imset
def download(self):
""" Download every URL in dataset and store in provided filename """
return [d.download(ignoreErrors=True) for d in self.parse()]
def validate(self):
"""Validate downloaded dataset and store cached list of valid bounding boxes and loadable images accessible with dataset()"""
P = self.parse()
D = []
for (k,p) in enumerate(P):
if k % 1000 == 0:
print('[vipy.data.facescrub][%d/%d]: validating dataset... (successful download?, good bounding box?, loadable image?)' % (k, len(P)))
if not p.invalid() and p.load(ignoreErrors=True, fetch=False) is not None:
D.append(p.flush())
self._dataset = D # cache
return self
def dataset(self):
return self._dataset if len(self) > 0 else self.validate().dataset()
def stats(self):
print('[vipy.data.facescrub]: %f percent downloaded' % (float(len(self.parse())) / float(len(self.dataset()))))
def subjects(self):
return list(set([im.category() for im in self.dataset()]))
def split(self, valsize=128):
D = self.dataset()
subjects = list(set([im.category() for im in D]))
(trainset, valset) = ([], [])
for s in subjects:
S = [d for d in D if d.category() == s]
if len(S) > 3:
valset = valset + S[0:2] # two examples per subject
trainset = trainset + S[2:] # rest
else:
trainset = trainset + S
valset = np.random.choice(valset, valsize)
return (trainset, valset)