## Second architecture: Bottleneck Features (InceptionV3) + Logistic Regression 

In [None]:
import pandas as pd
import numpy as np
import os
import shutil
import math
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from random import shuffle
from livelossplot import PlotLossesKeras
from sklearn.metrics import log_loss, accuracy_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from mpl_toolkits.axes_grid1 import ImageGrid
from sklearn.linear_model import LogisticRegression

from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras import layers
from keras import models
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.layers import Activation, Dropout, Flatten, Dense, BatchNormalization

In [None]:
# Dataset locations. Paths are assembled with os.path.join, which yields the
# same "\\"-separated strings as before on Windows and valid paths elsewhere.
DATASET_DIR = " " # path to dataset's folder
TRAIN_DIR = os.path.join(DATASET_DIR, "train")  # path to training set's folder
TEST_DIR = os.path.join(DATASET_DIR, "test")  # path to test set's folder

TRAIN_LABEL = os.path.join(DATASET_DIR, "labels.csv")  # path to the CSV file with the training labels
TRAIN_SET_DIR = os.path.join(DATASET_DIR, "train_set")  # path to training set's folder [pre-processed]
VAL_SET_DIR = os.path.join(DATASET_DIR, "val_set")  # path to validation set's folder [pre-processed]
TEST_SET_DIR = os.path.join(DATASET_DIR, "test_set")  # path to test set's folder [pre-processed]

### Functions

In [None]:
def hot_enconding(ypred, n_classes):
    """One-hot encode a vector of class indices.

    Args:
        ypred: class indices, either flat ``(n,)`` or a column vector
            ``(n, 1)``. Float values are truncated to integers.
        n_classes: number of columns in the encoded matrix.

    Returns:
        A float array of shape ``(n, n_classes)`` with a single 1 per row,
        placed in the column given by that row's class index.

    Raises:
        IndexError: if a class index does not fit in ``n_classes`` columns.
    """
    # Flatten first so column-vector labels do not go through int() on a
    # 1-element array, a conversion that recent NumPy releases deprecate.
    indices = np.asarray(ypred).reshape(-1).astype(int)
    hot_encoder = np.zeros(shape=(indices.shape[0], n_classes))
    hot_encoder[np.arange(indices.shape[0]), indices] = 1
    return hot_encoder

In [None]:
def extract_features(DIR, n_total_images, n_features, size_img, conv):
    """Extract one feature vector per image from a folder of class sub-folders.

    Every immediate sub-directory of ``DIR`` is treated as one class. Classes
    are numbered 0, 1, 2, ... following the sorted sub-directory names, and
    the files of each class are visited in sorted order.

    Args:
        DIR: root folder that contains one sub-folder per class.
        n_total_images: number of rows allocated for the outputs.
        n_features: length of the vector ``conv.predict`` returns per image.
        size_img: images are loaded resized to ``size_img`` x ``size_img``.
        conv: model whose ``predict`` is called on a batch holding one image.

    Returns:
        ``(features, labels)`` with shapes ``(n_total_images, n_features)``
        and ``(n_total_images, 1)``. Rows beyond the number of images found
        are left as zeros.

    Raises:
        FileNotFoundError: if ``DIR`` does not exist.
        IndexError: if more than ``n_total_images`` images are found.
    """
    features = np.zeros(shape=(n_total_images, n_features))
    labels = np.zeros(shape=(n_total_images, 1))

    # Only the first directory level holds classes; a recursive walk would
    # build wrong paths for any nested folder.
    class_dirs = sorted(
        name for name in os.listdir(DIR)
        if os.path.isdir(os.path.join(DIR, name))
    )

    i = 0
    for label, dirname in enumerate(tqdm(class_dirs)):
        class_path = os.path.join(DIR, dirname)
        for filename in sorted(os.listdir(class_path)):
            img_path = os.path.join(class_path, filename)
            img = image.load_img(img_path, target_size=(size_img, size_img))

            x = image.img_to_array(img)
            x = np.expand_dims(x, axis=0)  # the model expects a batch axis
            x = preprocess_input(x)  # normalize to [-1, 1]

            features[i, :] = conv.predict(x)
            labels[i] = label
            i += 1
    return features, labels

In [None]:
def extract_single_feature(img_path, n_features, size_img, conv):
    """Return the feature vector ``conv`` produces for one image file.

    The image at ``img_path`` is loaded at ``size_img`` x ``size_img``,
    turned into a one-image batch, passed through ``preprocess_input`` and
    then through ``conv.predict``. The result is a ``(1, n_features)`` array.
    """
    out = np.zeros(shape=(1, n_features))

    pil_img = image.load_img(img_path, target_size=(size_img, size_img))
    batch = np.expand_dims(image.img_to_array(pil_img), axis=0)
    batch = preprocess_input(batch)

    out[0, :] = conv.predict(batch)
    return out

In [None]:
def extract_single_feature_image(img, n_features, conv):
    """Return the ``(1, n_features)`` feature vector of an in-memory image.

    ``img`` is passed to ``conv.predict`` as a one-image batch without any
    further preprocessing, so the caller must supply it already scaled the
    way ``conv`` expects.
    """
    out = np.zeros(shape=(1, n_features))
    batch = np.expand_dims(img, axis=0)  # add the leading batch axis
    out[0, :] = conv.predict(batch)
    return out

In [None]:
def probability2discreteOutput(ypred):
    """Convert per-class scores into discrete class indices.

    Args:
        ypred: array-like whose first axis indexes samples; each row holds
            the scores of one sample.

    Returns:
        A 1-D integer array with, for every sample, the index of its largest
        score (the first one in case of ties).
    """
    scores = np.asarray(ypred)
    n_samples = scores.shape[0]
    if n_samples == 0:
        # reshape(0, -1) cannot infer the second axis of an empty array.
        return np.zeros(0, dtype=int)
    # Flatten each sample so argmax behaves like a per-row np.argmax.
    return np.argmax(scores.reshape(n_samples, -1), axis=1).astype(int)

### Reading breeds from the csv file

In [None]:
# Load the label table and keep the distinct breed names in sorted order.
df_train = pd.read_csv(TRAIN_LABEL)
breeds = np.sort(df_train["breed"].unique())

### Bottleneck Features (InceptionV3) 

In [None]:
# InceptionV3 without its classification head; pooling="avg" is requested so
# the model returns one pooled vector per image.
size_img = 300
conv_model = InceptionV3(
    weights='imagenet',
    include_top=False,
    pooling="avg",
    input_shape=(size_img, size_img, 3),
)

In [None]:
n_features = 2048


def _count_images(directory):
    """Count the files stored in the immediate sub-folders of ``directory``."""
    return sum(
        len(os.listdir(os.path.join(directory, name)))
        for name in os.listdir(directory)
        if os.path.isdir(os.path.join(directory, name))
    )


# Size each feature matrix from what is actually on disk: a hard-coded count
# that drifts from the real split either leaves all-zero rows labelled as
# class 0 or overflows the pre-allocated arrays.
train_features, train_labels = extract_features(
    TRAIN_SET_DIR, _count_images(TRAIN_SET_DIR), n_features, size_img, conv_model)
val_features, val_labels = extract_features(
    VAL_SET_DIR, _count_images(VAL_SET_DIR), n_features, size_img, conv_model)
test_features, test_labels = extract_features(
    TEST_SET_DIR, _count_images(TEST_SET_DIR), n_features, size_img, conv_model)

In [None]:
# Report the shapes of both splits (the second line used to repeat the
# training shapes instead of showing the validation ones).
print(train_features.shape, train_labels.shape)
print(val_features.shape, val_labels.shape)

# Merge training and validation data into a single set for fitting; the
# labels are flattened to 1-D.
stackfeatures = np.vstack((train_features, val_features))
stacklabel = np.vstack((train_labels, val_labels)).ravel()
print(stackfeatures.shape, stacklabel.shape)

### Training

In [None]:
# Multinomial logistic regression fitted on the stacked train + validation
# features. (A default-constructed model that was immediately overwritten has
# been dropped.)
logreg = LogisticRegression(multi_class='multinomial', solver='lbfgs', n_jobs=2)
logreg.fit(stackfeatures, stacklabel)

In [None]:
# Class probabilities for every test image, then the index of the most
# probable class in each row as the discrete prediction.
pred = logreg.predict_proba(test_features)
ypred = np.argmax(pred, axis=1).astype(int)

### Model Evaluation: accuracy, error (log loss), and f1-score (average)

In [None]:
n_classes = 120
encoded_pred = hot_enconding(ypred, n_classes)
encoded_true = hot_enconding(test_labels, n_classes)
# test_labels is a column vector; the metrics are given the flat 1-D view.
acc = accuracy_score(test_labels.ravel(), ypred)
# `eps` is no longer passed: it is deprecated/removed in recent scikit-learn,
# and 1e-15 was the default in the releases that still accept it.
loss = log_loss(encoded_true, pred)
# NOTE: for single-label multiclass predictions the micro-averaged F1 equals
# the accuracy.
f1_sc = f1_score(test_labels.ravel(), ypred, average='micro')
print("acc:",acc,"error:",loss,"f1-score:",f1_sc)

### Visualizing Prediction

In [None]:
# Draw one batch of test images (pixel values multiplied by 1/255) together
# with their categorical labels.
batch_size = 10
test_datagen = ImageDataGenerator(rescale=1./255)
test_generator = test_datagen.flow_from_directory(
    TEST_SET_DIR,
    class_mode='categorical',
    batch_size=batch_size,
    target_size=(300, 300),
)
batch_elem = next(test_generator)

In [None]:
# Show the sampled test batch with the true and the predicted breed of each
# image.
dict_breeds = dict(zip(breeds, range(breeds.shape[0])))
data_batch = batch_elem[0]
labels_batch = batch_elem[1]
fig = plt.figure(figsize=(20,20))
#from https://matplotlib.org/2.0.2/mpl_toolkits/axes_grid/users/overview.html
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=(2, 5),  # creates 2x5 grid of axes
                 axes_pad=0.04,  # pad between axes in inch.
                 )
font = {'family': 'serif',
        'size': 12,
        }
# The grid has ten cells and the batch may hold fewer than ten images.
for i in range(min(len(data_batch), 10)):
    ax = grid[i]
    data = np.copy(data_batch[i])
    label = labels_batch[i]
    label_breed = np.argmax(label)

    # The generator rescales pixels by 1/255 (presumably to [0, 1], which is
    # what imshow wants), whereas the features used to fit the classifier
    # were extracted after preprocess_input ([-1, 1]). Map the image to that
    # range before extracting its features so both go through the same
    # preprocessing.
    img = data * 2.0 - 1.0
    feature = extract_single_feature_image(img, n_features, conv_model)

    pred = logreg.predict_proba(feature)
    label_pred = np.argmax(pred)

    ax.text(10, 250, 'Label: %s' % (breeds[label_breed]),  fontdict=font, color='b', backgroundcolor='w', alpha=0.8)
    ax.text(10, 280, 'Pred: %s (%.2f)' % (breeds[label_pred], pred[0][label_pred]),  fontdict=font, color='r', backgroundcolor='w', alpha=0.8)
    ax.imshow(data)
    ax.axis('off')
plt.show()


### Visualizing Wrong Predictions

In [None]:
# Show up to ten validation images whose predicted breed differs from the
# name of the folder they are stored in.
size_img = 300
max_shown = 10  # the 2x5 grid below has ten cells
fig = plt.figure(figsize=(20,20))
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=(2, 5),  # creates 2x5 grid of axes
                 axes_pad=0.04,  # pad between axes in inch.
                 )
font = {'family': 'serif',
        'size': 14,
        }
ind = 0
# Only the first directory level holds the class folders, so list it directly
# instead of walking the tree recursively.
for dirname in sorted(os.listdir(VAL_SET_DIR)):
    if ind == max_shown:
        break  # grid is full: stop scanning the remaining classes too
    class_dir = os.path.join(VAL_SET_DIR, dirname)
    if not os.path.isdir(class_dir):
        continue
    for filename in sorted(os.listdir(class_dir)):
        if ind == max_shown:
            break
        img_path = os.path.join(class_dir, filename)

        feature = extract_single_feature(img_path, n_features, size_img, conv_model)

        pred = logreg.predict_proba(feature)
        label_pred = np.argmax(pred)

        if breeds[label_pred] == dirname:
            continue

        # Load the picture for display only when it was misclassified.
        img = image.load_img(img_path, target_size=(size_img, size_img))
        ax = grid[ind]
        ax.text(10, 250, 'Label: %s' % dirname,  fontdict=font, color='b', backgroundcolor='w', alpha=0.8)
        ax.text(10, 280, 'Pred: %s' % (breeds[label_pred]),  fontdict=font, color='r', backgroundcolor='w', alpha=0.8)
        ax.imshow(img)
        ax.axis('off')
        ind = ind+1
plt.show()

### Kaggle Submission

In [None]:
# Predict class probabilities for every file in the raw test folder.
# Files are processed in sorted order so the row order is reproducible.
filelist = np.asarray(sorted(os.listdir(TEST_DIR)))
size_img = 300
# One row per test file and one column per breed, sized from the data rather
# than from hard-coded counts.
ypred = np.zeros([len(filelist), len(breeds)])
for i, filename in enumerate(tqdm(filelist)):
    img_path = os.path.join(TEST_DIR, filename)
    feat = extract_single_feature(img_path, n_features, size_img, conv_model)

    pred = logreg.predict_proba(feat)
    ypred[i,:] = pred

In [None]:
# Submission ids are the test file names without their extension. splitext
# handles extensions of any length, where slicing off the last four
# characters only worked for three-letter ones.
ids = np.asarray([os.path.splitext(name)[0] for name in filelist])

In [None]:
# Build the submission table (an id column followed by one probability column
# per breed), write it to disk and display it.
df_sub = pd.DataFrame(data=ypred, columns=breeds)
df_sub.insert(loc=0, column='id', value=ids)
df_sub.to_csv('architecture2.csv', index=False)
print(df_sub)