In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [None]:
# This file contains all the main external libs we'll use
from fastai.imports import *

from fastai.transforms import *
from fastai.conv_learner import *
from fastai.model import *
from fastai.dataset import *
from fastai.sgdr import *
from fastai.plots import *
from fastai.metrics import *

import matplotlib.pyplot as plt
import csv
from collections import defaultdict, namedtuple
import os
import shutil

import pandas as pd

from sklearn.metrics import confusion_matrix

In [None]:
# Dataset root. Cells in this notebook read the `all/`, `valid/` and `test/`
# sub-folders beneath it.
path = '../data/plant-seedlings'

# Exploratory analysis

In [None]:
# Model and input configuration used by the cells below.
arch=resnet50  # architecture handed to tfms_from_model and the learner
sz = 224       # image size handed to tfms_from_model
bs = 64        # batch size handed to ImageClassifierData.from_paths

# Transforms use the `transforms_top_down` augmentation set with max_zoom=1.2;
# the data object also registers the `test` folder.
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.2)
data = ImageClassifierData.from_paths(path, tfms=tfms, test_name='test', bs=bs)

In [None]:
# Sanity check: display the validation-set image at index 1.
plt.imshow(load_img_id(data.val_ds, 1, path))
plt.show()

In [None]:
# Training-set labels; the cells below use them as indices into `data.classes`.
y = data.trn_ds.y

In [None]:
# Peek at the first ten training labels.
print(y[:10])

In [None]:
from collections import Counter

# Class frequencies in the training set, most common first:
# one "<count> <tab> <class name>" line per class.
label_counts = Counter(y).most_common()
for class_idx, count in label_counts:
    print(count, '\t', data.classes[class_idx])

# Total number of training samples across all classes.
num_train = sum(count for _, count in label_counts)
print(num_train)

In [None]:
# Show one example image per class, ordered alphabetically by class name.
labels_by_name = sorted((cls for cls, _ in label_counts), key=lambda p: data.classes[p])
for label in labels_by_name:
    # Position of the first training sample that carries this label.
    i = next(pos for pos, lbl in enumerate(y) if lbl == label)
    print(data.classes[y[i]])
    plt.imshow(load_img_id(data.trn_ds, i, path))
    plt.show()

## Image sizes

In [None]:
# Collect the longer side (in pixels) of every image in one class folder.
flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR

folder = path + '/all/Black-grass'

files = os.listdir(folder)
sizes = []

for file in files:
    filename = os.path.join(folder, file)
    img = cv2.imread(filename, flags)
    if img is None:
        # cv2.imread signals an unreadable / non-image file by returning None
        # rather than raising; skip it so one stray file (e.g. a hidden OS
        # file) does not abort the whole scan with an AttributeError.
        continue
    # shape[:2] is (height, width); the channel axis, if any, is ignored.
    sizes.append(max(img.shape[:2]))

In [None]:
# Distribution of the longer image side collected in the previous cell.
# Axis labels are added so the figure can be read on its own.
plt.hist(sizes, bins=50)
plt.xlabel('longer image side (pixels)')
plt.ylabel('number of images')
plt.show()

# Image lighting

Let's try histogram equalization to improve contrast.

In [None]:
# Side-by-side preview: original image (left) vs. histogram-equalized (right)
# for the first few files of one class folder.
flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR

folder = path + '/all/Black-grass'

files = os.listdir(folder)
# Slice rather than index with range(5): a folder holding fewer than five
# files is then simply shown in full instead of raising IndexError.
for file in files[:5]:
    f = plt.figure(figsize=(5, 5))

    filename = os.path.join(folder, file)
    img = cv2.imread(filename, flags)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    # Left panel: the image as loaded.
    f.add_subplot(1, 2, 1)
    plt.imshow(img)

    # Equalize each of the three colour channels independently and restack
    # them along the channel axis.
    img = np.stack([cv2.equalizeHist(img[:, :, channel]) for channel in range(3)], axis=2)

    # Right panel: the equalized image.
    f.add_subplot(1, 2, 2)
    plt.imshow(img)
    plt.show()

    print(img.shape)

In [None]:
# Normal version

def open_image_normal(fn):
    """Load an image file with OpenCV and return it as a float RGB array.

    Arguments:
        fn: the file path of the image

    Returns:
        The image as a numpy float32 array in RGB channel order, with the
        loaded pixel values divided by 255.

    Raises:
        OSError: if `fn` does not exist, is a directory, or cannot be
            read/converted (the underlying exception is chained).
    """
    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    if not os.path.exists(fn):
        raise OSError('No such file or directory: {}'.format(fn))
    if os.path.isdir(fn):
        raise OSError('Is a directory: {}'.format(fn))
    try:
        bgr = cv2.imread(fn, flags)
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        return rgb.astype(np.float32)/255
    except Exception as e:
        # Any decoding/conversion problem is reported as an OSError that names the file.
        raise OSError('Error handling image at: {}'.format(fn)) from e

In [None]:
# Histogram equalization

def open_image_hist_eq(fn):
    """Load an image with OpenCV and histogram-equalize each colour channel.

    Arguments:
        fn: the file path of the image

    Returns:
        The image as a numpy float32 array in RGB channel order. Each of the
        three channels is passed through cv2.equalizeHist separately before
        the values are divided by 255.

    Raises:
        OSError: if `fn` does not exist, is a directory, or cannot be
            read/converted/equalized (the underlying exception is chained).
    """
    flags = cv2.IMREAD_UNCHANGED+cv2.IMREAD_ANYDEPTH+cv2.IMREAD_ANYCOLOR
    if not os.path.exists(fn):
        raise OSError('No such file or directory: {}'.format(fn))
    if os.path.isdir(fn):
        raise OSError('Is a directory: {}'.format(fn))
    try:
        rgb = cv2.cvtColor(cv2.imread(fn, flags), cv2.COLOR_BGR2RGB)
        # Equalize channel by channel, keeping a trailing axis on each result
        # so the three planes can be joined back into one H x W x 3 array.
        planes = [np.expand_dims(cv2.equalizeHist(rgb[:,:,ch]), axis=2) for ch in range(3)]
        equalized = np.concatenate(planes, axis=2)
        return equalized.astype(np.float32)/255
    except Exception as e:
        # Any decoding/conversion problem is reported as an OSError that names the file.
        raise OSError('Error handling image at: {}'.format(fn)) from e

Uncomment the 2nd line below to apply histogram equalization to fastai dataset code.

In [None]:
# Select the image loader by uncommenting exactly one of the assignments below.
# NOTE(review): assigning `open_image` here rebinds the name in the notebook
# namespace only. If fastai's dataset code resolves `open_image` inside its own
# module, that module attribute would have to be patched instead — verify
# before relying on this switch.
#open_image = open_image_normal
#open_image = open_image_hist_eq

# Image augmentation

Here we can change the image augmentation parameters and see what the augmented images look like.

In [None]:
# Look at examples of image augmentation
def get_augs():
    """Return one de-normalized image from a freshly drawn augmented batch.

    Takes the first batch yielded by `data.aug_dl`, passes it through
    `data.trn_ds.denorm`, and returns the item at index 1 of the result.
    Reads the notebook-level `data` object.
    """
    batch, _ = next(iter(data.aug_dl))
    denormed = data.trn_ds.denorm(batch)
    return denormed[1]

In [None]:
# Rebuild the transforms and data object (so edits to the augmentation
# settings take effect), then draw six augmented samples and plot them.
bs = 64

# Alternative augmentation list, kept for experimentation:
#aug_tfms = [RandomRotate(20), RandomLighting(0.8, 0.8)]
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.2)
data = ImageClassifierData.from_paths(path, tfms=tfms, test_name='test', bs=bs)

# Every get_augs() call pulls a new batch from data.aug_dl.
ims = np.stack([get_augs() for i in range(6)])
plots(ims, rows=2)

# Training

In [None]:
# Create a learner for `arch` on `data` via ConvLearner.pretrained, with precompute disabled.
learn = ConvLearner.pretrained(arch, data, precompute=False)

In [None]:
# Weight decay; passed as `wds=wd` to the learn.fit calls below.
wd=5e-4

## Searching for a good starting learning rate

In [None]:
def plot_loss_change(sched, sma=1, n_skip=20, y_lim=(-0.01,0.01)):
    """
    Plots rate of change of the loss function.
    Parameters:
        sched - learning rate scheduler exposing `lrs` and `losses` sequences
                (e.g. `learn.sched` after `learn.lr_find()`).
        sma - number of batches for simple moving average to smooth out the curve.
        n_skip - number of batches to skip on the left.
        y_lim - limits for the y axis.
    """
    # Read from the scheduler that was passed in rather than from the global
    # `learn`, so the function plots what the caller asked for.
    lrs, losses = sched.lrs, sched.losses

    # The first sma+1 points have no complete window behind them and are
    # plotted as zero. The padding is capped at len(lrs) so a very short run
    # still yields x and y sequences of equal length.
    derivatives = [0] * min(sma + 1, len(lrs))
    for i in range(1 + sma, len(lrs)):
        # Average change in loss per batch over the last `sma` batches.
        derivatives.append((losses[i] - losses[i - sma]) / sma)

    plt.ylabel("d/loss")
    plt.xlabel("learning rate (log scale)")
    plt.plot(lrs[n_skip:], derivatives[n_skip:])
    plt.xscale('log')
    plt.ylim(y_lim)

In [None]:
# Run the learning-rate finder; its recordings are read from `learn.sched` in the next cells.
learn.lr_find()

In [None]:
# Plot the learning rates recorded by the scheduler during the finder run.
learn.sched.plot_lr()

In [None]:
# Plot the scheduler's loss recording (presumably loss against learning rate — confirm).
learn.sched.plot()

In [None]:
# Rate of change of the loss: 20-batch moving average, first 20 batches skipped.
plot_loss_change(learn.sched, sma=20, n_skip=20, y_lim=(-0.03, 0.01))

## Training with LR 0.01 for 20 epochs

In [None]:
# Train at LR 0.01 with weight decay `wd`. The second argument (20) is the
# number of cycles — presumably one epoch each here; confirm against fastai.
learn.fit(0.01, 20, wds=wd)

In [None]:
# Checkpoint the current weights under the name 'step1'.
learn.save('step1')

In [None]:
# Continue at LR 0.01 for 2 cycles, with cycle_len=1 and cycle_mult=2.
learn.fit(0.01, 2, cycle_len=1, cycle_mult=2, wds=wd)

In [None]:
# Checkpoint the weights after the cyclic run.
learn.save('step1_cycle')

In [None]:
# Restore the 'step1_cycle' checkpoint saved in the previous cell (resume point).
learn.load('step1_cycle')

In [None]:
# Unfreeze the model, then rerun the learning-rate finder on it.
learn.unfreeze()
learn.lr_find()

In [None]:
# Plot the scheduler's loss recording from the finder run on the unfrozen model.
learn.sched.plot()

In [None]:
# Loss slope for the unfrozen model: 20-batch moving average, first 20 batches skipped.
plot_loss_change(learn.sched, sma=20, n_skip=20, y_lim=(-0.01, 0.01))

## Unfreeze and train with LR 0.001 for 1 epoch

In [None]:
# NOTE(review): 'unfreeze1' is only saved two cells further down, after the
# fit. On a fresh top-to-bottom run this load presumably fails unless the
# checkpoint already exists on disk from an earlier session — skip this cell
# on a first run.
learn.load('unfreeze1')

In [None]:
# Unfreeze and train at LR 1e-3 with weight decay `wd` (second argument: 1).
learn.unfreeze()
learn.fit(1e-3, 1, wds=wd)
# Earlier setting, kept for reference:
#learn.fit(0.01, 3)

In [None]:
# Checkpoint the fine-tuned weights; the `learn.load('unfreeze1')` cells read this file.
learn.save('unfreeze1')

In [None]:
# Learning-rate finder again, now starting from the fine-tuned weights.
learn.lr_find()

In [None]:
# Plot the scheduler's loss recording from the finder run above.
learn.sched.plot()

In [None]:
# Loss slope after fine-tuning: 20-batch moving average, first 20 batches skipped.
plot_loss_change(learn.sched, sma=20, n_skip=20, y_lim=(-0.001, 0.01))

## Train for a few cycles

In [None]:
# Start the cyclic training below from the 'unfreeze1' checkpoint.
learn.load('unfreeze1')

In [None]:
# Learning rate for the cyclic fine-tuning in the next cell.
lr=1e-3

In [None]:
# One cycle at `lr` with cycle_len=1, cycle_mult=2 and weight decay `wd`.
learn.fit(lr, 1, cycle_len=1, cycle_mult=2, wds=wd)
# Longer variant (4 cycles, no weight decay), kept for reference:
#learn.fit(lr, 4, cycle_len=1, cycle_mult=2)

In [None]:
# Plot the learning rates recorded by the scheduler during the fit above.
learn.sched.plot_lr()

In [None]:
# Checkpoint the weights under the name '2'.
learn.save('2')

In [None]:
# Restore checkpoint '2' saved in the previous cell (resume point).
learn.load('2')

In [None]:
# Learning-rate finder on the weights from checkpoint '2'.
learn.lr_find()

In [None]:
# Plot the learning rates recorded by the scheduler during the finder run.
learn.sched.plot_lr()

In [None]:
# Plot the scheduler's loss recording from the finder run above.
learn.sched.plot()

In [None]:
# Loss slope for checkpoint '2': 20-batch moving average, first 20 batches skipped.
plot_loss_change(learn.sched, sma=20, n_skip=20, y_lim=(-0.02, 0.01))

## Error analysis

In [None]:
# Model outputs and targets from learn.predict_with_targs().
# Note: this rebinds `y`, which earlier held the training labels.
log_preds,y = learn.predict_with_targs()
# Outputs are treated as log-probabilities, hence the exp().
preds = np.exp(log_preds)
# Predicted class index = column with the highest value in each row.
pred_labels = np.argmax(preds, axis=1)

In [None]:
# Confusion matrix of targets (first argument) vs. predicted labels, drawn with class names.
cm = confusion_matrix(y, pred_labels)
plot_confusion_matrix(cm, data.classes)

In [None]:
# Helper object used by the plot_most_* cells below, built from the validation set and the model outputs.
results = ImageModelResults(data.val_ds, log_preds)

### Most incorrect

In [None]:
# Positions of the samples whose predicted class differs from the target.
incorrect = [idx for idx, predicted in enumerate(pred_labels) if predicted != y[idx]]
# Misclassified samples per true class, keyed by (class index, class name).
c = Counter((y[idx], data.classes[y[idx]]) for idx in incorrect)
c.most_common(20)

In [None]:
# Argument is presumably a class index (as listed by the Counter above) — confirm.
results.plot_most_incorrect(0)

In [None]:
# Same plot with argument 6 (presumably the class index — confirm).
results.plot_most_incorrect(6)

In [None]:
# Same plot with argument 1 (presumably the class index — confirm).
results.plot_most_incorrect(1)

### Most correct

In [None]:
# Argument is presumably a class index (0 here) — confirm.
results.plot_most_correct(0)

In [None]:
# Same plot with argument 5 (presumably the class index — confirm).
results.plot_most_correct(5)

In [None]:
# Same plot with argument 10 (presumably the class index — confirm).
results.plot_most_correct(10)

### Most uncertain

In [None]:
# Argument is presumably a class index (1 here) — confirm.
results.plot_most_uncertain(1)

## Test time augmentation (TTA)

In [None]:
# Test-time augmentation with n_aug=20; returns model outputs and targets.
log_preds,y = learn.TTA(n_aug=20)
# Exponentiate, then average over axis 0 (presumably the augmentation axis — confirm).
preds = np.mean(np.exp(log_preds),0)

In [None]:
# Accuracy of the averaged TTA predictions against the targets returned by learn.TTA.
accuracy_np(preds, y)

# Retrain on the training set + validation set

Validation loss/accuracy won't be indicative of the model performance because the validation set is a subset of the training set now.

In [None]:
# Same settings as the first training run, restated so this section can be
# run on its own.
arch = resnet50
sz = 224
bs = 64
wd = 5e-4  # weight decay, passed as `wds=wd` to the fit calls below

In [None]:
# Alternative augmentation list, kept for experimentation:
#aug_tfms = [RandomRotate(20), RandomLighting(0.8, 0.8)]
tfms = tfms_from_model(arch, sz, aug_tfms=transforms_top_down, max_zoom=1.2)
# Train from the `all` folder while still validating on `valid`; this rebinds `data`.
data = ImageClassifierData.from_paths(path, tfms=tfms, trn_name='all', val_name='valid', test_name='test', bs=bs)

In [None]:
# Fresh learner on the new data object; this rebinds `learn`, replacing the earlier learner.
learn = ConvLearner.pretrained(arch, data, precompute=False)

In [None]:
# Learning rate for the first two fit calls of the retraining run.
lr = 0.01

In [None]:
# Train at `lr` with weight decay `wd`; second argument is 10 (the cycle count — presumably one epoch each, confirm).
learn.fit(lr, 10, wds=wd)

In [None]:
# Checkpoint the weights under the name 'step1_full'.
learn.save('step1_full')

In [None]:
# Restore the 'step1_full' checkpoint saved in the previous cell (resume point).
learn.load('step1_full')

In [None]:
# Two cycles at `lr` with cycle_len=1, cycle_mult=2 and weight decay `wd`.
learn.fit(lr, 2, cycle_len=1, cycle_mult=2, wds=wd)

In [None]:
# Checkpoint the weights after the cyclic run.
learn.save('step1_cycle_full')

In [None]:
# Restore the 'step1_cycle_full' checkpoint saved in the previous cell (resume point).
learn.load('step1_cycle_full')

In [None]:
# NOTE(review): 'final_full' is only saved two cells further down, after the
# unfrozen fit. On a fresh top-to-bottom run this load presumably fails, and
# when it succeeds it replaces the 'step1_cycle_full' weights loaded in the
# previous cell — skip this cell on a first run.
learn.load('final_full')

In [None]:
# Unfreeze and train at LR 1e-3 with weight decay `wd` (second argument: 1).
learn.unfreeze()
learn.fit(1e-3, 1, wds=wd)

In [None]:
# Checkpoint the unfrozen-model weights; the `learn.load('final_full')` cell reads this file.
learn.save('final_full')

In [None]:
# Second copy of the same weights under a tagged name; the suffix presumably
# records the validation loss/accuracy of this run — confirm.
learn.save('final_full_val013_997')

In [None]:
# Restore the tagged checkpoint saved in the previous cell (resume point).
learn.load('final_full_val013_997')

In [None]:
# NOTE(review): 'final_full_cycle' is only saved two cells further down, after
# the cyclic fit. On a fresh top-to-bottom run this load presumably fails, and
# when it succeeds it replaces the weights loaded in the previous cell — skip
# this cell on a first run.
learn.load('final_full_cycle')

In [None]:
# One cycle at LR 1e-3 with cycle_len=1, cycle_mult=2 and weight decay `wd`.
learn.fit(1e-3, 1, cycle_len=1, cycle_mult=2, wds=wd)

In [None]:
# Checkpoint the final weights; the `learn.load('final_full_cycle')` cell reads this file.
learn.save('final_full_cycle')

In [None]:
# Test-time augmentation on the validation set with n_aug=20.
# NOTE(review): the shape note on the next line does not match n_aug=20 and
# looks carried over from another dataset — re-check before trusting it.
log_preds, y = learn.TTA(n_aug=20) # (5, 2044, 120), (2044,)
# Exponentiate, then average over axis 0 (presumably the augmentation axis — confirm).
probs = np.mean(np.exp(log_preds),0)
# Displays the pair (accuracy, log loss) for the averaged predictions.
accuracy_np(probs, y), metrics.log_loss(y, probs)

In [None]:
# Predicted class index per sample. Note: `preds` holds class indices here,
# whereas earlier cells used the same name for probabilities.
preds = np.argmax(probs, axis=1)
print(probs.shape)
print(preds)
print(preds.shape)

In [None]:
# (confusion_matrix is already imported in the notebook's import cell; the
# repeat lets this cell run on its own.)
from sklearn.metrics import confusion_matrix
# Confusion matrix of targets vs. predicted class indices, drawn with class names.
cm = confusion_matrix(y, preds)
plot_confusion_matrix(cm, data.classes)

In [None]:
# Predict the test set with test-time augmentation and write the submission file.
test_log_predictions, _ = learn.TTA(is_test=True)
# Exponentiate, then average over axis 0 (presumably the augmentation axis — confirm).
test_probs = np.mean(np.exp(test_log_predictions),0)
test_predictions = np.argmax(test_probs, axis=1)
print(test_predictions.shape)
test_predictions_classes = [data.classes[pred] for pred in test_predictions]
test_file_names = learn.data.test_ds.fnames

# Use the csv module instead of hand-built lines: fields containing a comma
# are quoted correctly, and lineterminator='\n' keeps the line endings of the
# previous hand-written output. The `with` block closes the file, so no
# explicit close() is needed afterwards.
with open('submission.csv', 'w', newline='') as the_file:
    writer = csv.writer(the_file, lineterminator='\n')
    writer.writerow(['file', 'species'])
    for file_path, prediction in zip(test_file_names, test_predictions_classes):
        # Only the file name (no directory part) goes into the submission.
        writer.writerow([os.path.basename(file_path), prediction])

from IPython.display import FileLink
FileLink('submission.csv')

In [None]:
from collections import Counter

def winner(input):
    """Return the candidate with the most votes.

    Arguments:
        input: iterable of hashable votes (e.g. a list of class names), one
            entry per vote.

    Returns:
        The candidate that occurs most often. When several candidates share
        the highest count, the smallest of them in sort order is returned
        (alphabetically first for strings).

    Raises:
        IndexError: if `input` contains no votes.
    """
    # Tally the votes, e.g. {'A': 2, 'B': 4}.
    votes = Counter(input)
    if not votes:
        raise IndexError('winner() needs at least one vote')

    top_count = max(votes.values())
    # min() over the candidates holding the top count picks the sole winner,
    # or breaks a tie the same way as sorting and taking the first would.
    return min(candidate for candidate, count in votes.items() if count == top_count)

In [None]:
import pandas as pd

# Earlier submission files to ensemble by majority vote (one vote per entry).
# NOTE(review): './submission17.csv' is listed twice, so that submission
# counts double in the vote — confirm this weighting is intended.
submission_paths = [
    './submission23.csv',
    './submission22.csv',
    './submission21.csv',
    './submission18.csv',
    './submission17.csv',
    './submission17.csv',
    './submission16.csv',
]

# Assumes every file has the `file,species` header used by the submission
# cells of this notebook.
submissions = [pd.read_csv(submission_path) for submission_path in submission_paths]

# Votes are combined row by row, so every submission must list the same files
# in the same order; fail loudly instead of silently mixing up rows.
file_names = submissions[0]['file']
for submission_path, frame in zip(submission_paths[1:], submissions[1:]):
    if not frame['file'].equals(file_names):
        raise ValueError('{} does not list the same files in the same order as {}'.format(
            submission_path, submission_paths[0]))

# One majority vote per row across all submissions.
species_votes = zip(*(frame['species'] for frame in submissions))
classes = [winner(list(row_votes)) for row_votes in species_votes]

# File names come from the submissions themselves rather than from a variable
# left over from an earlier cell, so this cell also works on a fresh kernel.
# The `with` block closes the file; no explicit close() is needed.
with open('submission.csv', 'w', newline='') as the_file:
    writer = csv.writer(the_file, lineterminator='\n')
    writer.writerow(['file', 'species'])
    writer.writerows(zip(file_names, classes))

from IPython.display import FileLink
FileLink('submission.csv')

In [None]:
# Display the test-set file names (the whole list is shown).
learn.data.test_ds.fnames

# Testing on the test set

In [None]:
# Build the ground-truth class index for every test image.
# NOTE(review): `test_annotations` is not defined in any of the cells shown
# here; it must be provided (objects with `.filename` and `.label`) before
# this cell can run.
true_test_labels = {a.filename: a.label for a in test_annotations}
# Class name -> index into data.classes.
class_indexes = {c: i for i, c in enumerate(data.classes)}
# Keep the part of each test path after its first '/'.
filenames = [filepath[filepath.find('/') + 1:] for filepath in data.test_ds.fnames]
# Labels are converted to str before the class-index lookup.
labels = [str(true_test_labels[filename]) for filename in filenames]
y_true = np.array([class_indexes[label] for label in labels])

In [None]:
# Plain (non-augmented) test-set predictions and their accuracy against y_true.
log_preds = learn.predict(is_test=True)
preds = np.exp(log_preds)
accuracy_np(preds, y_true)

In [None]:
# Same evaluation with test-time augmentation (n_aug=20).
log_preds,_ = learn.TTA(n_aug=20, is_test=True)
# Exponentiate, then average over axis 0 (presumably the augmentation axis — confirm).
preds = np.mean(np.exp(log_preds),0)
accuracy_np(preds, y_true)

# Analyze test results

In [None]:
# Predicted class index per test image (column with the highest value).
pred_labels = np.argmax(preds, axis=1)

In [None]:
# Positions of the test images whose predicted class differs from the true one.
incorrect = [idx for idx, predicted in enumerate(pred_labels) if predicted != y_true[idx]]

In [None]:
# Show up to ten misclassified test images. For each one, print the true and
# predicted class names followed by the predicted probability of the true
# class and of the predicted class.
# NOTE(review): `class_names` is not defined in any of the cells shown here;
# it must map the entries of data.classes to display names.
# Slicing (instead of range(0, 10)) avoids an IndexError when there are fewer
# than ten errors.
for idx in incorrect[:10]:
    true_idx, pred_idx = y_true[idx], pred_labels[idx]
    print(class_names[data.classes[true_idx]], class_names[data.classes[pred_idx]],
          preds[idx, true_idx], preds[idx, pred_idx])
    plt.imshow(load_img_id(data.test_ds, idx, path))
    plt.show()

In [None]:
# Confusion matrix of true vs. predicted class indices on the test set.
cm = confusion_matrix(y_true, pred_labels)

In [None]:
# Store the matrix as a tab-separated file inside the dataset folder.
np.savetxt(os.path.join(path, 'confusion_matrix.tsv'), cm, delimiter='\t')

In [None]:
# Misclassified test images counted by their TRUE class name.
c = Counter(class_names[data.classes[y_true[idx]]] for idx in incorrect)
c.most_common(20)

In [None]:
# Misclassified test images counted by the class name that was PREDICTED.
c = Counter(class_names[data.classes[pred_labels[idx]]] for idx in incorrect)
c.most_common(20)

# Create a submission file

In [None]:
# Predicted class index per test image, recomputed from `preds`; the shape is displayed as a check.
pred_labels = np.argmax(preds, axis=1)
pred_labels.shape

In [None]:
# Preview the first (up to) ten predictions: file name, class id, display
# name and the image itself.
# NOTE(review): `class_names` is not defined in any of the cells shown here.
# Slicing (instead of range(10)) avoids an IndexError on test sets with fewer
# than ten images.
for i, label_idx in enumerate(pred_labels[:10]):
    class_id = data.classes[label_idx]
    # Second '/'-separated component of the test path.
    filename = data.test_ds.fnames[i].split('/')[1]
    print(filename, class_id, class_names[class_id])
    plt.imshow(load_img_id(data.test_ds, i, path))
    plt.show()

In [None]:
# Write one "<file name>;<class>" line per test image (no header row).
with open('data/submission.csv', 'w') as f:
    for i, label_idx in enumerate(pred_labels):
        # Second '/'-separated component of the test path.
        filename = data.test_ds.fnames[i].split('/')[1]
        f.write('{};{}\n'.format(filename, data.classes[label_idx]))

In [None]:
# Render a download link for the file written in the previous cell.
from IPython.display import FileLink
FileLink('data/submission.csv')