In [1]:
from fastbook import *
from fastai.vision.all import *
from fastai.vision.widgets import *

human_types = 'korean female','chinese female','japanese female'
path = Path('asian females')  # Users\Taeyon\FastAI Taeyon\
human_types

('korean female', 'chinese female', 'japanese female')

In [2]:
from tqdm.notebook import tqdm, trange
import time    # to be used in loop iterations

### DuckDuckGo Function

In [3]:
def search_images_ddg(term, max_images=200):
    "Search for `term` with DuckDuckGo and return a unique urls of about `max_images` images"
    assert max_images<1000
    url = 'https://duckduckgo.com/'
    res = urlread(url,data={'q':term})
    searchObj = re.search(r'vqd=([\d-]+)\&', res)
    assert searchObj
    requestUrl = url + 'i.js'
    params = dict(l='us-en', o='json', q=term, vqd=searchObj.group(1), f=',,,', p='1', v7exp='a')
    urls,data = set(),{'next':1}
    while len(urls)<max_images and 'next' in data:
        try:
            data = urljson(requestUrl,data=params)
            urls.update(L(data['results']).itemgot('image'))
            requestUrl = url + data['next']
        except (URLError,HTTPError): pass
        time.sleep(0.2)
    return L(urls)

### Download Images

In [4]:
from utils import *

for i in human_types:
    dest = (path/i)
    dest.mkdir(exist_ok=True)  # create folder with name eg. 'korean' if it doesn't exist already
    results = search_images_ddg(f'{i} female celebrity', max_images = 100) # url of 100 images of each bear type
    
    for k in trange(len(results)):  # download the 50 urls to path you set
        download_url(results[k], f'{dest}/{i}_female_celebrity_{k}.jpg')  # k = 1, 2, 3...

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

  0%|          | 0/100 [00:00<?, ?it/s]

In [5]:
fns = get_image_files(path)
failed = verify_images(fns)
failed

BrokenProcessPool: A process in the process pool was terminated abruptly while the future was running or pending.

### Train Model

In [6]:
celebrities = DataBlock(  # Set up parameters of DataBlock API
    blocks=(ImageBlock, CategoryBlock), 
    get_items=get_image_files, 
    splitter=RandomSplitter(valid_pct=0.2, seed=42),
    get_y=parent_label,
    item_tfms=Resize(128))

dls = celebrities.dataloaders(path)  # Pass path to DataBlock API

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


In [7]:
celebrities = celebrities.new(item_tfms=RandomResizedCrop(224, min_scale=0.5), batch_tfms=aug_transforms())  # resize & crop the images
dls = celebrities.dataloaders(path)

Due to IPython and Windows limitation, python multiprocessing isn't available now.
So `number_workers` is changed to 0 to avoid getting stuck


In [9]:
learn = cnn_learner(dls, resnet18, metrics=error_rate).to_fp16()
learn.fine_tune(2)

epoch,train_loss,valid_loss,error_rate,time
0,2.178483,2.318662,0.733333,00:23


epoch,train_loss,valid_loss,error_rate,time


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 2.00 GiB total capacity; 977.51 MiB already allocated; 9.09 MiB free; 980.00 MiB reserved in total by PyTorch)

### (Optional) Manual Data Cleansing

In [None]:
cleaner = ImageClassifierCleaner(learn)
cleaner

In [None]:
for idx in cleaner.delete(): cleaner.fns[idx].unlink()
for idx,cat in cleaner.change(): shutil.move(str(cleaner.fns[idx]), path/cat)

### Model Analysis

In [None]:
learn.dls.vocab

In [None]:
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

In [None]:
# export
learn.export()