# Dataset cleanup

## Biblioteki

In [0]:
# Put these at the top of every notebook, to get automatic reloading
%reload_ext autoreload
%autoreload 2

#matplotlib setup
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import matplotlib.style
import matplotlib as mpl
mpl.style.use('default')
mpl.style.use('seaborn-ticks')
# %config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings('ignore')
from pathlib import Path
import shutil as sh

# This file contains all the main external libs we'll use
from fastai.imports import *

from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots_wcz import *

from pathlib import Path

from  collections import Counter

from sklearn.metrics import confusion_matrix

## Mój dataset: 'Old Polish Cars v4'

In [0]:
# Root folder that holds the different versions of the dataset.
dataset_path = Path('/content/data/old_polish_cars')
# Folder handed to ImageClassifierData.from_paths further down; its train/ sub-folder is listed in a later cell.
PATH = dataset_path / 'old_polish_cars_v5_photos-split/'

In [0]:
# List the dataset root: long format, hidden entries included, human-readable sizes.
ls -lah $dataset_path

In [0]:
# Show the contents of the train/ folder under PATH.
ls $PATH/train

In [0]:
# Folder being cleaned up (note: the v4 photo folder, while PATH points at the v5 split).
# Its entries are used as the list of categories in the next cell.
cleanup_dataset_path = dataset_path / 'old_polish_cars_v4_photos/'

In [0]:
# Category names = names of the sub-folders of the cleanup folder, sorted.
# `.name` (rather than `.stem`) keeps folder names that contain a dot intact,
# so `cleanup_dataset_path / cats[i]` always points at an existing folder.
# Plain files and hidden entries (e.g. .DS_Store, .ipynb_checkpoints) are
# skipped so they neither appear as categories nor shift the category indices.
cats = sorted(
    entry.name
    for entry in cleanup_dataset_path.iterdir()
    if entry.is_dir() and not entry.name.startswith('.')
)

In [0]:
# Lookup table: category name -> its position in the sorted `cats` list.
cats_dict = dict(zip(cats, range(len(cats))))
cats_dict

# Kategoria

In [0]:
# Category whose folder is going to be reviewed in this run.
selected_cat = 'Gazik'
selected_cat_idx = cats_dict[selected_cat]
selected_cat_path = cleanup_dataset_path / cats[selected_cat_idx]
print(selected_cat_path)

# Number of entries found directly inside the selected category folder.
print(f'{len(list(selected_cat_path.iterdir()))} items')

## Model

In [0]:
sz=224          # image size passed to tfms_from_model
arch=resnet34   # architecture used for both the transforms and the learner

# Transforms built from the architecture and the image size.
tfms = tfms_from_model(arch, sz)

# The selected category folder is passed as the test set, so that
# learn.predict(is_test=True) later runs on exactly those files.
data = ImageClassifierData.from_paths(PATH, test_name=selected_cat_path, tfms=tfms)
learn = ConvLearner.pretrained(arch, data, precompute=True)

In [0]:
# Turn precompute off before loading the saved weights.
learn.precompute=False
# Alternative checkpoint, currently disabled:
# learn.load('224_lastlayer')
# Load the weights saved under the name '224_all'.
learn.load('224_all')

## Predykcja z zawartości foldera

In [0]:
# Model outputs (log space) for every file of the test folder.
log_preds = learn.predict(is_test=True)

probs = np.exp(log_preds)              # back from log space
preds = np.argmax(log_preds, axis=1)   # index of the highest-scoring class per image
# Expected label: every image in the folder should belong to the selected category.
selected_y = np.full_like(preds, selected_cat_idx)

# Predicted class names, and the distinct predicted class indices in ascending order.
preds_classes = [data.classes[class_idx] for class_idx in preds]
cat_classes = sorted(set(preds))

In [0]:
# Peek at the first ten rows of the exponentiated model outputs.
probs[:10]

In [0]:
# One row per image: (predicted class index, value of `probs` for that class).
pred_probs = np.array([(cls_idx, row[cls_idx]) for cls_idx, row in zip(preds, probs)])
pred_probs[:10]

In [0]:
# Accuracy of the predictions against selected_y.
# NOTE(review): accuracy_np comes from the fastai star-imports; presumably it takes the arg-max of its first argument — confirm.
accuracy_np(probs, selected_y)

In [0]:
# Number of images assigned to each predicted class name.
Counter(preds_classes)

## Tablica pomyłek

In [0]:
# Use every class index that occurs in either the expected or the predicted
# labels, and pass the very same list to confusion_matrix and to the plot.
# Without this, the matrix is built over the union of both label sets while
# the tick names only cover the predicted ones, so they go out of step
# whenever no image is predicted as the selected category.
cm_labels = sorted(set(selected_y) | set(preds))
cm = confusion_matrix(selected_y, preds, labels=cm_labels)

mpl.style.use('default')
mpl.style.use('seaborn-ticks')

# %config InlineBackend.figure_format = 'retina'

plot_confusion_matrix(cm, [data.classes[cat] for cat in cm_labels])

## Analiza rezultatów: przegląd zdjęć

In [0]:
def rand_by_mask(mask, n=4):
    """Pick up to `n` distinct random positions at which `mask` is truthy.

    Args:
        mask: 1-D boolean array (or array-like).
        n: maximum number of positions to draw (default 4).

    Returns:
        Array of indices into `mask`, without repetition. Holds fewer than
        `n` entries (possibly none) when fewer positions match, instead of
        raising ValueError as sampling without replacement otherwise would.
    """
    idxs = np.where(mask)[0]
    if len(idxs) == 0:
        return idxs
    return np.random.choice(idxs, min(n, len(idxs)), replace=False)
def rand_by_correct(is_correct):
    """Random sample of test images whose prediction is (is_correct=True) or is not (False) the expected label."""
    matches_expected = preds == selected_y
    return rand_by_mask(matches_expected == is_correct)

In [0]:
def plots(ims, figsize=(12,6), rows=1, titles=None):
    """Draw the images in `ims` on a grid with `rows` rows.

    Args:
        ims: sequence of images accepted by `imshow`.
        figsize: size of the figure that is created.
        rows: number of grid rows.
        titles: optional sequence with one title per image.
    """
    n_ims = len(ims)
    # Ceiling division: when the image count is not a multiple of `rows`, the
    # last row is only partly filled. Floor division would create too few
    # grid cells and add_subplot would reject the trailing images.
    cols = -(-n_ims // rows)
    f = plt.figure(figsize=figsize)
    for i, im in enumerate(ims):
        sp = f.add_subplot(rows, cols, i+1)
        sp.axis('Off')
        if titles is not None: sp.set_title(titles[i], fontsize=16)
        # Draw on the axes just created rather than on pyplot's implicit current axes.
        sp.imshow(im)

In [0]:
def load_img_id(ds, idx):
    """Open file number `idx` of dataset `ds` (path taken relative to PATH) and return it as a numpy array."""
    img_path = PATH / ds.fnames[idx]
    return np.array(PIL.Image.open(img_path))

def plot_val_with_title(idxs, title):
    """Plot the test images at `idxs`, each captioned with its predicted class and probability.

    Before drawing, prints `title` followed by a list of
    '<file stem> -> <class>, <probability>%' strings, one per image.
    """
    def caption(i):
        # Column 0 of pred_probs holds the predicted class index, column 1 its probability.
        cls_name = data.classes[int(pred_probs[i, 0])]
        return '' + cls_name + f', {pred_probs[i, 1]*100:0.4}%'

    images = [load_img_id(data.test_ds, i) for i in idxs]
    captions = [caption(i) for i in idxs]
    labelled = [
        f'{Path(data.test_ds.fnames[i]).stem} -> {text}'
        for i, text in zip(idxs, captions)
    ]
    print(title)
    print(labelled)
    return plots(images, rows=1, titles=captions, figsize=(16,8))

In [0]:
# 1. A few correct labels at random
# (test images whose prediction equals the selected category)
plot_val_with_title(rand_by_correct(True), "Correctly classified")

In [0]:
# 2. A few incorrect labels at random
# (test images whose prediction differs from the selected category)
plot_val_with_title(rand_by_correct(False), "Incorrectly classified")

In [0]:
def most_by_mask(mask, mult):
    """Return up to four positions selected by `mask`, ordered by ascending `mult * pred_probs[:, 1]`.

    With mult=-1 the highest probabilities come first, with mult=1 the lowest.
    """
    candidates = np.where(mask)[0]
    scores = mult * pred_probs[candidates, 1]
    order = np.argsort(scores)
    return candidates[order[:4]]

def most_by_correct(y, is_correct):
    """Indices of the most confident (in)correct predictions among images whose expected label is `y`.

    Args:
        y: class index the images are expected to have.
        is_correct: True to select correct predictions, False for wrong ones.

    Returns:
        Up to four indices, as produced by most_by_mask.
    """
    mult = -1 if (y==selected_cat_idx)==is_correct else 1
    # `&` binds tighter than `==`: without the extra parentheses the expression
    # is read as `(preds == selected_y) == (is_correct & (selected_y == y))`,
    # which selects the wrong images whenever `y` is not the selected category.
    return most_by_mask(((preds == selected_y)==is_correct) & (selected_y == y), mult)

In [0]:
# Correct predictions for the selected category, picked via most_by_correct.
plot_val_with_title(most_by_correct(selected_cat_idx, True), "Most correct " + selected_cat)

In [0]:
# Wrong predictions for the selected category, picked via most_by_correct.
plot_val_with_title(most_by_correct(selected_cat_idx, False), "Most incorrect " + selected_cat)

In [0]:
# pred_probs[:, 1] is the probability of the winning (arg-max) class, so the
# least confident predictions are simply the ones with the lowest value.
# Distance from 0.5 is a binary-classifier heuristic: with more than two
# classes it ranks a 0.5 prediction as more uncertain than a 0.2 one. With
# exactly two classes both orderings coincide, since the winner is >= 0.5.
most_uncertain = np.argsort(pred_probs[:, 1])[:4]
plot_val_with_title(most_uncertain, "Most uncertain predictions")