<a href="https://colab.research.google.com/github/tperfetti/ML/blob/main/Module_8_Perfetti.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Ingest (1.0)

#### 1.1: Importing Libraries

In [None]:
import os, cv2, random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tqdm import tqdm
from random import shuffle 
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot
import keras.applications
#from tensorflow.keras.applications import resnet
#from keras.utils import plot_model
#from tensorflow.python.keras.applications import ResNet50
#from tensorflow.python.keras.models import Sequential
#from tensorflow.python.keras.layers import Dense, Flatten, GlobalAveragePooling2D
%matplotlib inline 

#### 1.2: Defining Parameters

In [None]:
# Notebook-wide settings.
# Presumably the hold-out fraction and seed meant for train_test_split -- confirm.
TEST_SIZE = 0.5
RANDOM_STATE = 2018
# Passed to model.fit as batch_size and epochs.
BATCH_SIZE = 64
NO_EPOCHS = 20
# Number of units in the softmax output layer (one per class).
NUM_CLASSES = 2
# Upper bound on how many training file names are kept from the folder listing.
SAMPLE_SIZE = 20000
# Base directory that is joined with "train" / "test" to locate the inputs.
PATH = '/torrenceperfetti/Desktop/test/'
# Folders the image files are read from when building the datasets.
TRAIN_FOLDER = './train/'
TEST_FOLDER =  './test/'
# Every image is resized to IMG_SIZE x IMG_SIZE pixels before modeling.
IMG_SIZE = 224
#RESNET_WEIGHTS_PATH = '/kaggle/input/resnet50/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'

#### 1.3: Reading the Data

In [None]:
# Build the train/test input paths underneath PATH.
# NOTE(review): both paths are later opened with zipfile.ZipFile, yet neither
# carries a ".zip" suffix -- confirm they really point at the archives.
train_image_path, test_image_path = (
    os.path.join(PATH, split_name) for split_name in ("train", "test")
)

In [None]:
import zipfile

# Unpack the training archive into the current working directory.
with zipfile.ZipFile(train_image_path, mode="r") as z:
    z.extractall(path=".")

In [None]:
# Unpack the test archive into the current working directory.
with zipfile.ZipFile(test_image_path, mode="r") as z:
    z.extractall(path=".")

In [None]:
# os.listdir returns names in arbitrary order, so slicing its result directly
# yields a non-reproducible (and possibly class-skewed) subset. Sort the names
# first, then draw a seeded random sample of at most SAMPLE_SIZE files.
_all_train_images = sorted(os.listdir(TRAIN_FOLDER))
train_image_list = random.Random(RANDOM_STATE).sample(
    _all_train_images, min(SAMPLE_SIZE, len(_all_train_images))
)
# The test set is used in full; read it from the configured folder.
test_image_list = os.listdir(TEST_FOLDER)

#### 1.4: Defining Functions

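We define a function that parses an image file name (for example `cat.0.jpg`) and reads the pet name from the dot-separated field that precedes the index and extension; this gives the one-hot label of the image (`[1, 0]` for a cat, `[0, 1]` for a dog).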

In [None]:
def label_pet_image_one_hot_encoder(img):
    """Return the one-hot label encoded in a training image file name.

    File names are expected to look like ``<pet>.<index>.<ext>`` (for example
    ``cat.0.jpg``); the label is the third dot-separated field from the end.

    Args:
        img: Image file name.

    Returns:
        ``[1, 0]`` for a cat image, ``[0, 1]`` for a dog image.

    Raises:
        ValueError: If the name has fewer than three dot-separated fields or
            the label field is neither ``cat`` nor ``dog``.
    """
    fields = img.split('.')
    # Guard the index so a short name reports a clear error instead of IndexError.
    pet = fields[-3] if len(fields) >= 3 else None
    if pet == 'cat':
        return [1, 0]
    if pet == 'dog':
        return [0, 1]
    # Falling through used to return None, which silently corrupted the label array.
    raise ValueError(
        "Cannot derive a cat/dog label from file name {!r}".format(img)
    )

We also define a function to process the data (both the train and test sets).

In [None]:
def process_data(data_image_list, DATA_FOLDER, isTrain=True):
    """Load, resize and label every image named in ``data_image_list``.

    Args:
        data_image_list: Image file names, relative to ``DATA_FOLDER``.
        DATA_FOLDER: Directory that contains the image files.
        isTrain: When true, each label is the one-hot vector returned by
            ``label_pet_image_one_hot_encoder``; otherwise it is the text
            before the first ``.`` of the file name.

    Returns:
        A shuffled list of ``[image_array, label_array]`` pairs, each image
        resized to ``IMG_SIZE`` x ``IMG_SIZE``.

    Raises:
        OSError: If OpenCV cannot read one of the image files.
    """
    data_df = []
    for file_name in tqdm(data_image_list):
        path = os.path.join(DATA_FOLDER, file_name)
        if isTrain:
            label = label_pet_image_one_hot_encoder(file_name)
        else:
            label = file_name.split('.')[0]
        pixels = cv2.imread(path, cv2.IMREAD_COLOR)
        # cv2.imread reports an unreadable file by returning None; fail here
        # with the offending path instead of a cryptic error inside cv2.resize.
        if pixels is None:
            raise OSError("Could not read image file: {}".format(path))
        pixels = cv2.resize(pixels, (IMG_SIZE, IMG_SIZE))
        data_df.append([np.array(pixels), np.array(label)])
    shuffle(data_df)
    return data_df

## EDA (2.0)

#### 2.1: Class Distribution

In [None]:
def plot_image_list_count(data_image_list):
    """Draw a count plot of the class labels found in image file names.

    Args:
        data_image_list: Image file names shaped like ``<pet>.<index>.<ext>``;
            the label is the third dot-separated field from the end.
    """
    labels = [img.split('.')[-3] for img in data_image_list]
    # Pass the vector as `x=`: newer seaborn releases treat the first
    # positional argument as `data`, not as the variable to count.
    sns.countplot(x=labels)
    plt.title('Cats and Dogs')


plot_image_list_count(train_image_list)

In [None]:
# Same class-count plot, this time over every file in the training folder
# rather than only the sampled file list.
plot_image_list_count(os.listdir(TRAIN_FOLDER))

#### 2.2: Image Samples

In [None]:
# Load, resize and label the sampled training images; isTrain defaults to
# True, so each label is a one-hot cat/dog vector.
train = process_data(train_image_list, TRAIN_FOLDER)

In [None]:
#def show_images(data, isTest=False):
    #f, ax = plt.subplots(5,5, figsize=(15,15))
    #for i,data in enumerate(data[:25]):
        #img_num = data[1]
        #img_data = data[0]
        #label = np.argmax(img_num)
        #if label  == 1: 
            #str_label='Dog'
        #elif label == 0: 
            #str_label='Cat'
        #if(isTest):
            #str_label="None"
        #ax[i//5, i%5].imshow(img_data)
        #ax[i//5, i%5].axis('off')
        #ax[i//5, i%5].set_title("Label: {}".format(str_label))
    #plt.show()

#show_images(train)

In [None]:
# Load and resize the test images; with isTrain=False each "label" is the
# file-name text before the first '.'.
test = process_data(test_image_list, TEST_FOLDER, False)

In [None]:
#show_images(test,True)

## Modeling (3.0)

#### 3.1: Preparing the Training Data

In [None]:
# Stack the processed training samples into image and label arrays.
X = np.array([i[0] for i in train]).reshape(-1,IMG_SIZE,IMG_SIZE,3)
y = np.array([i[1] for i in train])

# Hold out a validation split. The training and evaluation cells read
# X_train / y_train / X_val / y_val, which were never defined before.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

#### 3.2: Preparing the Model 

In [None]:
# These names were only present as commented-out imports, so the cell raised
# NameError; import them here from the keras package the notebook already uses.
from keras.applications import ResNet50
from keras.layers import Dense
from keras.models import Sequential

# ResNet-50 base without its classifier head (global max pooling on top),
# followed by a softmax layer with one unit per class.
model = Sequential()
# RESNET_WEIGHTS_PATH is commented out in the parameters cell; 'imagenet'
# loads the standard pre-trained weights instead of a local file.
model.add(ResNet50(include_top=False, pooling='max', weights='imagenet'))
model.add(Dense(NUM_CLASSES, activation='softmax'))
# The ResNet-50 base is already trained, so freeze it and fit only the new
# Dense head (the flag was previously True, contradicting this intent).
model.layers[0].trainable = False

In [None]:
# Configure training: SGD optimizer, categorical cross-entropy loss (the
# training labels are one-hot vectors) and accuracy as the reported metric.
model.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

#### 3.3: Model Summary

In [None]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

In [None]:
# plot_model was only present as a commented-out import; bring it in from the
# same module that already provides model_to_dot.
from keras.utils.vis_utils import plot_model

# Save the architecture diagram to model.png and render it inline as SVG.
plot_model(model, to_file='model.png')
SVG(model_to_dot(model).create(prog='dot', format='svg'))

#### 3.4: Training the Model

In [None]:
# Fit on the training split and report validation metrics after every epoch.
# The returned object is kept so its history can be plotted afterwards.
train_model = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=NO_EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
)

#### 3.5: Accuracy & Loss

In [None]:
def plot_accuracy_and_loss(train_model):
    """Plot per-epoch training/validation accuracy and loss side by side.

    Args:
        train_model: Object returned by ``model.fit``. Its ``history`` mapping
            must contain ``loss`` and ``val_loss`` plus the accuracy series
            under either ``accuracy``/``val_accuracy`` or ``acc``/``val_acc``.
    """
    hist = train_model.history
    # Current Keras releases log the metric as 'accuracy'; older ones used
    # 'acc'. Accept either so the lookup does not raise KeyError.
    acc_key = 'accuracy' if 'accuracy' in hist else 'acc'
    acc = hist[acc_key]
    val_acc = hist['val_' + acc_key]
    loss = hist['loss']
    val_loss = hist['val_loss']
    epochs = range(len(acc))
    f, ax = plt.subplots(1, 2, figsize=(14, 6))
    # Left panel: accuracy curves.
    ax[0].plot(epochs, acc, 'g', label='Training accuracy')
    ax[0].plot(epochs, val_acc, 'r', label='Validation accuracy')
    ax[0].set_title('Training and validation accuracy')
    ax[0].legend()
    # Right panel: loss curves.
    ax[1].plot(epochs, loss, 'g', label='Training loss')
    ax[1].plot(epochs, val_loss, 'r', label='Validation loss')
    ax[1].set_title('Training and validation loss')
    ax[1].legend()
    plt.show()


plot_accuracy_and_loss(train_model)

In [None]:
# Evaluate on the validation split. The model was compiled with one loss and
# one metric, so score[0] is the loss and score[1] the accuracy.
score = model.evaluate(X_val, y_val, verbose=0)
print('Validation loss:', score[0])
print('Validation accuracy:', score[1])

In [None]:
#get the predictions for the test data
predicted_classes = model.predict_classes(X_val)
#get the indices to be plotted
y_true = np.argmax(y_val,axis=1)

In [None]:
# Positions of the validation samples that were classified correctly and
# incorrectly (single-argument np.where is the same as np.nonzero).
correct = np.where(predicted_classes == y_true)[0]
incorrect = np.where(predicted_classes != y_true)[0]

In [None]:
# Per-class precision/recall report on the validation predictions, with one
# display name generated per class index.
target_names = list(map("Class {}:".format, range(NUM_CLASSES)))
print(classification_report(y_true, predicted_classes, target_names=target_names))

#### 3.6: Testing Prediction Data

In [None]:
# Score the test images one at a time. Each entry of `test` is an
# [image, id] pair; the second softmax output (the 'dog' slot of the one-hot
# encoding) is stored as the prediction.
pred_list, img_list = [], []
for img_data, img_idx in tqdm(test):
    # Add a leading batch axis so the single image matches the model input.
    data = img_data.reshape(-1, IMG_SIZE, IMG_SIZE, 3)
    predicted = model.predict([data])[0]
    img_list.append(img_idx)
    pred_list.append(predicted[1])

#### 3.7: Submission File

In [None]:
# Assemble the id/label table and write it out for submission.
submission = pd.DataFrame({'id':img_list , 'label':pred_list})
submission.to_csv("submission.csv", index=False)
# Keep the preview as the cell's last expression: a notebook only displays
# the final expression, so head() in the middle of the cell showed nothing.
submission.head()

## Conclusion (4.0)

The list below shows how each requirement/question stated in the Module 8 Assignment 1 Requirements section on Canvas was addressed:
- <b>Conduct your analysis using a cross-validation design</b> This was addressed in section 3.0.
- <b>Conduct / refine EDA</b> This was addressed in section 2.0.
- <b>Build at least three CNN models based on hyperparameter tuning</b> N/A.
- <b>Evaluate goodness of fit metrics</b> This was addressed in section 3.5.
- <b>Build ROC and Precision / Recall graphs</b> N/A.