# Notebook to extract features with a semantic CNN trained on the ImageNet dataset
***
## Imports

In [49]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.preprocessing.image import img_to_array, load_img
from ConvFuncs import HDF5DatasetWriter
import numpy as np
import progressbar
import random
import os
import jobsConfig as config
import h5py
from preprocessors import AspectAwarePreprocessor
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, confusion_matrix
import joblib
import random

## Initializations 

In [2]:
#Batch size for the forward passes and buffer size for the feature writer
batch_size, buffer_size = 16, 1000

#Open the raw test set read-only and grab its image and label datasets
test_database = h5py.File(config.TEST_HDF5, mode = "r")
X_test, y_test = test_database["images"], test_database["labels"]

## Start with ResNet50 just like Adrian 

In [3]:
#ImageNet-pretrained ResNet50 loaded without its top (include_top = False)
RN50 = ResNet50(
    include_top = False,
    weights = "imagenet",
)

In [4]:
#Look at layers
def inspect_model(model):
    """Print one tab-separated row per layer: index, output shape, layer class name."""
    for position, layer in enumerate(model.layers):
        row = (position, layer.output_shape, layer.__class__.__name__)
        print("{}\t{}\t{}".format(*row))

In [5]:
#List every layer of the headless ResNet50
inspect_model(model = RN50)

0	[(None, None, None, 3)]	InputLayer
1	(None, None, None, 3)	ZeroPadding2D
2	(None, None, None, 64)	Conv2D
3	(None, None, None, 64)	BatchNormalization
4	(None, None, None, 64)	Activation
5	(None, None, None, 64)	ZeroPadding2D
6	(None, None, None, 64)	MaxPooling2D
7	(None, None, None, 64)	Conv2D
8	(None, None, None, 64)	BatchNormalization
9	(None, None, None, 64)	Activation
10	(None, None, None, 64)	Conv2D
11	(None, None, None, 64)	BatchNormalization
12	(None, None, None, 64)	Activation
13	(None, None, None, 256)	Conv2D
14	(None, None, None, 256)	Conv2D
15	(None, None, None, 256)	BatchNormalization
16	(None, None, None, 256)	BatchNormalization
17	(None, None, None, 256)	Add
18	(None, None, None, 256)	Activation
19	(None, None, None, 64)	Conv2D
20	(None, None, None, 64)	BatchNormalization
21	(None, None, None, 64)	Activation
22	(None, None, None, 64)	Conv2D
23	(None, None, None, 64)	BatchNormalization
24	(None, None, None, 64)	Activation
25	(None, None, None, 256)	Conv2D
26	(None, None, 

**This network has no output (FC) layers because it was loaded with `include_top = False`. Get a baseline by using it as a fixed feature extractor.**

In [6]:
#One row per test instance; 100352 = 7 * 7 * 2048, presumably the flattened ResNet50 output for 224x224 inputs (TODO confirm)
feature_dims = (len(y_test), 100352)
test_writer = HDF5DatasetWriter(
    config.TEST_FEATURES,
    feature_dims,
    dataKey = "features",
    bufSize = buffer_size,
)

The supplied database ../data/hdf5/test_ResNet50.hdf5 already exists. Would you like to delete the database and proceede (y/n)?:  y


In [7]:
def extract_features(X, y, writer, model, batch_size, num_instances):
    """Push images through a CNN in batches and store the flattened activations.

    Args:
        X: Sliceable image collection; X[a:b] must yield the images of one batch.
        y: Sliceable label collection aligned with X.
        writer: Object exposing add(features, labels) and close(). It is always
            closed before this function returns, including when a batch fails.
        model: Object exposing predict(images, batch_size = ...) that returns one
            activation array per input image.
        batch_size: Number of images sent through the model per forward pass.
        num_instances: Number of leading instances of X / y to process.
    """
    #First initialize progressbar and preprocessor
    widgets = ["Extracting Features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
    pbar = progressbar.ProgressBar(maxval = num_instances, widgets = widgets).start()

    aap = AspectAwarePreprocessor(224, 224)

    try:
        #Loop through images in batches
        for start in range(0, num_instances, batch_size):
            #Clamp the last batch so that no more than num_instances rows are ever read
            stop = min(start + batch_size, num_instances)
            batch_images = X[start:stop]
            batch_labels = y[start:stop]

            #Resize each image and ensure it is a keras compatible array
            processed_images = np.stack([img_to_array(aap.preprocess(image)) for image in batch_images])

            #ImageNet preprocessing, applied once to the whole batch instead of per image
            processed_images = imagenet_utils.preprocess_input(processed_images)

            #Extract features and flatten; -1 infers the feature width from the model output
            features = model.predict(processed_images, batch_size = batch_size)
            features = features.reshape((features.shape[0], -1))

            #Add features and update progressbar with the number of instances done so far
            writer.add(features, batch_labels)
            pbar.update(stop)
    finally:
        #Close database even if a batch raised, so the writer is not left open
        writer.close()

    #Finish progressbar
    pbar.finish()

In [8]:
#Extract and store features for every instance of the test set
extract_features(
    X = X_test,
    y = y_test,
    writer = test_writer,
    model = RN50,
    batch_size = batch_size,
    num_instances = len(y_test),
)

Extracting Features: 100% |#####################################| Time: 0:00:35


***
## Evaluate jobs on Test Set

In [21]:
#Read in dataset and model
#Both datasets are copied into memory with [:], so the file can be closed as soon as they are read
with h5py.File(config.TEST_FEATURES, "r") as test:
    X_test = test["features"][:]
    y_test = test["labels"][:]

#NOTE(review): joblib.load unpickles the file -> only load model files from a trusted location
colorstorm = joblib.load(config.MODEL_PATH)

In [22]:
#Predict on test set -> label an instance 1 only when its class-1 probability reaches the 60% decision boundary
class_probabilities = colorstorm.predict_proba(X_test)
predictions = (class_probabilities[:, 1] >= 0.60).astype("int")

In [23]:
#Classification report on the full test set
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       150
           1       1.00      1.00      1.00       151

    accuracy                           1.00       301
   macro avg       1.00      1.00      1.00       301
weighted avg       1.00      1.00      1.00       301



In [26]:
#Get unrounded metrics (order matters: accuracy, precision, recall)
metric_functions = (accuracy_score, precision_score, recall_score)
scores = [metric(y_test, predictions) for metric in metric_functions]
print("Accuracy: {}\nPrecision: {}\nRecall: {}".format(*scores))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


In [30]:
#Confusion matrix of true labels against thresholded predictions
matrix = confusion_matrix(y_test, predictions)
print(matrix)

[[150   0]
 [  0 151]]


***AI jobs is perfect on an approximately 50/50 split in labels***
***
## Assess jobs performance on a 90/10 split of labels
- 301 total instances
- Grab 90 satisfactory instances (out of 151)
- Grab 10 defective instances (out of 150)

In [37]:
#Grab 90% satisfactory and 10% defective instances

In [52]:
#Indices for random instances from X_test
#Seeded generator so the subsample (and every metric computed from it) is reproducible on a fresh kernel
sampler = random.Random(42)

#Row positions of each class in the full test set (1 = satisfactory, 0 = defective)
sat_rows = np.flatnonzero(y_test == 1)
def_rows = np.flatnonzero(y_test == 0)

#Draw positions within each class; pool sizes come from the data instead of hard-coded counts
sat_indices = sampler.sample(range(len(sat_rows)), 90)
def_indices = sampler.sample(range(len(def_rows)), 10)

#Index the full arrays directly to avoid copying a whole class just to pick a few rows
X_test_sat_90 = X_test[sat_rows[sat_indices]]
y_test_sat_90 = y_test[sat_rows[sat_indices]]
X_test_def_10 = X_test[def_rows[def_indices]]
y_test_def_10 = y_test[def_rows[def_indices]]

#Merge satisfactory/defective slices into 1
X_test_90_10 = np.vstack([X_test_sat_90, X_test_def_10])
y_test_90_10 = np.hstack([y_test_sat_90, y_test_def_10])

In [53]:
#Predict on the 90/10 subsample with the same 60% threshold on the class-1 probability
class_probabilities_90_10 = colorstorm.predict_proba(X_test_90_10)
predictions_90_10 = (class_probabilities_90_10[:, 1] >= 0.60).astype("int")

In [54]:
#Show classification report for the 90/10 subsample
report_90_10 = classification_report(y_test_90_10, predictions_90_10)
print(report_90_10)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        90

    accuracy                           1.00       100
   macro avg       1.00      1.00      1.00       100
weighted avg       1.00      1.00      1.00       100



In [55]:
#Get unrounded metrics on the 90/10 subsample (order matters: accuracy, precision, recall)
metric_functions = (accuracy_score, precision_score, recall_score)
scores = [metric(y_test_90_10, predictions_90_10) for metric in metric_functions]
print("Accuracy: {}\nPrecision: {}\nRecall: {}".format(*scores))

Accuracy: 1.0
Precision: 1.0
Recall: 1.0


***
# Final Results: 
## AI jobs predicts with 100% accuracy on 50/50 satisfactory/defective instances and with 100% accuracy on 90/10 satisfactory/defective instances

***Another one in the books***