<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone Project: Animal Behaviour Analyser

## Part 1: Background

### Contents:
- [Background](#Background)
- [Data Import & Cleaning](#Data-Import-and-Cleaning)
- [Exploratory Data Analysis](#Exploratory-Data-Analysis)
- [Data Visualization](#Visualize-the-Data)
- [Conclusions and Recommendations](#Conclusions-and-Recommendations)

In [1]:
import glob
import os
import pickle

import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import skimage as ski
from PIL import Image
from sklearn.model_selection import train_test_split

# Modelling
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout, Dense,BatchNormalization, Flatten, MaxPool2D
from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau, Callback
from keras.layers import Conv2D, Reshape
from tensorflow.keras.utils import Sequence
from keras.backend import epsilon
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam

### Data Import

In [2]:
# The dataset root holds one tree for training and one for validation;
# inside each tree every emotion class has its own sub-folder.
base_dir = '../data'
train_dir, validation_dir = (
    os.path.join(base_dir, split_name)
    for split_name in ('train_data', 'validation_data')
)

# Train Data: folders with the Angry / Happy / Relaxed / Sad dog pictures
train_angry_dir, train_happy_dir, train_relaxed_dir, train_sad_dir = (
    os.path.join(train_dir, emotion)
    for emotion in ('angry', 'happy', 'relaxed', 'sad')
)

# Validation Data: same class layout as the training tree
(validation_angry_dir,
 validation_happy_dir,
 validation_relaxed_dir,
 validation_sad_dir) = (
    os.path.join(validation_dir, emotion)
    for emotion in ('angry', 'happy', 'relaxed', 'sad')
)

## EDA

In [3]:
# Number of emotion classes, one per class folder (angry, happy, relaxed, sad)
num_classes = 4

In [4]:
# Training folders of every class, in the order angry, happy, relaxed, sad
class_folders = [train_angry_dir, train_happy_dir, train_relaxed_dir, train_sad_dir]

In [None]:
# Show up to five sample training images of the "happy" class.

# Specify the folder for the specific class you want to plot
class_folder = train_happy_dir

# os.listdir returns entries in arbitrary order and also lists non-image
# files (e.g. .DS_Store, Thumbs.db), so keep image files only and sort them
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')
image_files = sorted(
    name for name in os.listdir(class_folder)
    if name.lower().endswith(image_extensions)
)[:5]  # Limit to the first 5 images

if not image_files:
    # plt.subplots cannot build a grid with zero columns
    print(f"No images found in {class_folder}")
else:
    # squeeze=False always yields a 2-D axes array, so the loop below also
    # works when the folder contains a single image
    fig, axs = plt.subplots(nrows=1, ncols=len(image_files),
                            figsize=(15, 3), squeeze=False)

    # Load and plot the images
    for ax, filename in zip(axs[0], image_files):
        ax.axis('off')  # Hide axis
        image_path = os.path.join(class_folder, filename)
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file with None, not an exception
            print(f"Could not read image: {image_path}")
            continue
        # OpenCV loads pixels as BGR; matplotlib expects RGB
        ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))

    plt.suptitle("Happy Dogs", fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Show up to five sample training images of the "relaxed" class.

# Specify the folder for the specific class you want to plot
class_folder = train_relaxed_dir

# Sorted so the same five images are shown on every run
# (glob returns matches in arbitrary order)
image_files = sorted(glob.glob(os.path.join(class_folder, '*.jpg')))[:5]  # Limit to the first 5 images

if not image_files:
    # plt.subplots cannot build a grid with zero columns
    print(f"No .jpg images found in {class_folder}")
else:
    # squeeze=False always yields a 2-D axes array, so the loop below also
    # works when the folder contains a single image
    fig, axs = plt.subplots(nrows=1, ncols=len(image_files),
                            figsize=(12, 3), squeeze=False)

    # Load and plot the images
    for ax, filename in zip(axs[0], image_files):
        image = ski.io.imread(filename)
        ax.imshow(image)
        ax.axis('off')  # Hide axis

    plt.suptitle("Relaxed Dogs", fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Show up to five sample training images of the "sad" class.

# Use the shared directory variable instead of repeating the literal path,
# so the cell follows any change made to base_dir / train_dir
class_folder = train_sad_dir

# Sorted so the same five images are shown on every run
# (glob returns matches in arbitrary order)
image_files = sorted(glob.glob(os.path.join(class_folder, '*.jpg')))[:5]  # Limit to the first 5 images

if not image_files:
    # plt.subplots cannot build a grid with zero columns
    print(f"No .jpg images found in {class_folder}")
else:
    # squeeze=False always yields a 2-D axes array, so the loop below also
    # works when the folder contains a single image
    fig, axs = plt.subplots(nrows=1, ncols=len(image_files),
                            figsize=(12, 3), squeeze=False)

    # Load and plot the images
    for ax, filename in zip(axs[0], image_files):
        image = ski.io.imread(filename)
        ax.imshow(image)
        ax.axis('off')  # Hide axis

    plt.suptitle("Sad Dogs", fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

In [None]:
# Show up to five sample training images of the "angry" class.

# The training images live under train_data/angry; the previous literal
# '../data/angry' skipped the train_data level and matched no files
class_folder = train_angry_dir

# Sorted so the same five images are shown on every run
# (glob returns matches in arbitrary order)
image_files = sorted(glob.glob(os.path.join(class_folder, '*.jpg')))[:5]  # Limit to the first 5 images

if not image_files:
    # plt.subplots cannot build a grid with zero columns
    print(f"No .jpg images found in {class_folder}")
else:
    # squeeze=False always yields a 2-D axes array, so the loop below also
    # works when the folder contains a single image
    fig, axs = plt.subplots(nrows=1, ncols=len(image_files),
                            figsize=(12, 3), squeeze=False)

    # Load and plot the images
    for ax, filename in zip(axs[0], image_files):
        image = ski.io.imread(filename)
        ax.imshow(image)
        ax.axis('off')  # Hide axis

    plt.suptitle("Angry Dogs", fontsize=14)
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    plt.show()

## Modelling

### 1. Baseline CNN model

In [None]:
# Baseline: a small CNN trained from scratch on 32x32 RGB images.

# Define the input and batch size
input_size = (32, 32)
batch_size = 32

# Perform data augmentation on training images
train_datagen_baseline = ImageDataGenerator(rescale = 1.0 / 255.0, # Rescale pixel values to the range [0, 1]
                             rotation_range = 80,      # Degree range for random rotations
                             width_shift_range = 0.5,  # Fraction of total width for random horizontal shifts
                             height_shift_range = 0.5, # Fraction of total height for random vertical shifts
                             shear_range = 0.5,        # Shear intensity for random shear transformations
                             zoom_range = 0.5,         # Random zoom range
                             horizontal_flip = True,   # Randomly flip inputs horizontally
                             fill_mode='nearest')       # Strategy for filling in newly created pixels after rotation or shifts

# Note that the validation data should not be augmented
test_datagen_baseline = ImageDataGenerator(rescale = 1.0/255.)

# Flow training images in batches using train_datagen_baseline generator.
# No `subset` argument here: training and validation images already live in
# separate directories, and `subset` only makes sense together with
# `validation_split` (with the default split of 0 the 'validation' subset
# is empty).
train_generator = train_datagen_baseline.flow_from_directory(train_dir,
                                                             batch_size = batch_size,
                                                             target_size = input_size,
                                                             class_mode='categorical')

# Flow validation images in batches using test_datagen_baseline generator.
# shuffle=False keeps the validation order fixed between passes.
val_generator = test_datagen_baseline.flow_from_directory(validation_dir,
                                                          batch_size = batch_size,
                                                          target_size = input_size,
                                                          class_mode='categorical',
                                                          shuffle=False)

# Instantiate model
cnn = models.Sequential([
    layers.Conv2D(filters=32, kernel_size=(3, 3), activation='relu', input_shape=input_size + (3,)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    # One output unit per class folder found on disk; a mismatch with the
    # one-hot label width makes categorical_crossentropy fail
    layers.Dense(len(train_generator.class_indices), activation='softmax')  # Use 'softmax' for multi-class classification
])

# `learning_rate` is the supported keyword; `lr` is deprecated and rejected
# by newer Keras releases
optimizer = Adam(learning_rate=0.001)
cnn.compile(optimizer=optimizer,
              loss='categorical_crossentropy', # Use 'categorical_crossentropy' for multi-class classification
              metrics=['accuracy'])

# summary() prints the table itself and returns None, so do not wrap it in print()
print("Model Summary:")
cnn.summary()

# Train the model
history = cnn.fit(train_generator,
                  epochs=20,
                  validation_data=val_generator)

# Evaluate the model on the training and validation sets.
# evaluate() returns [loss, accuracy]; the training figure is measured on
# augmented images because train_generator applies the augmentation above.
train_accuracy = cnn.evaluate(train_generator)[1]
test_accuracy = cnn.evaluate(val_generator)[1]

print("Baseline model Train accuracy:", train_accuracy)
print("Baseline model Validation accuracy:", test_accuracy)

### 2. Pre-Trained VGG-16 model

In [None]:
# Transfer learning: frozen ImageNet VGG16 base plus a softmax classifier head.

# Define the input and batch size
input_size = (224, 224)
batch_size = 20

# Perform data augmentation on training images
# NOTE(review): the ImageNet VGG16 weights were trained with
# tf.keras.applications.vgg16.preprocess_input rather than 1/255 rescaling;
# consider passing it as `preprocessing_function` to both generators.
train_datagen = ImageDataGenerator(rescale = 1./255.,
                                   rotation_range = 40,
                                   width_shift_range = 0.2,
                                   height_shift_range = 0.2,
                                   shear_range = 0.2,
                                   zoom_range = 0.2,
                                   horizontal_flip = True)

# Note that the validation data should not be augmented
test_datagen = ImageDataGenerator(rescale = 1.0/255.)

# Flow training images in batches using train_datagen generator.
# No `subset` argument: the split is already expressed by the two separate
# directories, and `subset` only makes sense with `validation_split` (with
# the default split of 0 the 'validation' subset is empty).
train = train_datagen.flow_from_directory(train_dir,
                                          batch_size = batch_size,
                                          target_size = input_size,
                                          class_mode = 'categorical') # Use 'categorical' for multi-class classification

# Flow validation images in batches using test_datagen generator
val = test_datagen.flow_from_directory(validation_dir,
                                       batch_size = batch_size,
                                       target_size = input_size,
                                       class_mode = 'categorical',
                                       shuffle = False)  # fixed order between passes

# Instantiate pre-trained VGG16 model (convolutional base only)
vgg16_model = tf.keras.applications.vgg16.VGG16(include_top=False,
                                                weights="imagenet",
                                                input_shape=input_size + (3,))
# Add Dense Layer to VGG16 model; one output unit per class folder on disk
model = Sequential([vgg16_model,
                    Flatten(),
                    Dense(len(train.class_indices), activation = "softmax")]) # Use 'softmax' for multi-class classification

# Freeze the pre-trained base so only the new Dense head is trained
model.layers[0].trainable = False
# `metrics` must be a list; newer Keras rejects a bare string
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# summary() prints the table itself and returns None, so do not wrap it in print()
print("Model Summary:")
model.summary()

# `lr_callbacks` was referenced by fit() but never defined. Lower the
# learning rate when the validation loss stops improving.
# TODO(review): factor / patience / min_lr are starting values - confirm.
lr_callbacks = tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss",
                                                    factor=0.5,
                                                    patience=3,
                                                    min_lr=1e-6)

# Train the model
history =  model.fit(train,
                    epochs=20,
                    callbacks=[lr_callbacks],
                    validation_data=val)

# Evaluate the model on the training and validation sets
# (evaluate() returns [loss, accuracy])
train_accuracy = model.evaluate(train)[1]
test_accuracy = model.evaluate(val)[1]

print("VGG16 Train accuracy:", train_accuracy)
print("VGG16 Validation accuracy:", test_accuracy)

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Confusion matrix of the baseline CNN over the whole validation set.

# Collect true and predicted class indices batch by batch. Index-based access
# is used on purpose: iterating a Keras generator directly never terminates,
# while indices 0..len-1 visit every sample once. Pairing images and labels
# inside each batch stays correct whether or not the generator shuffles.
true_batches, predicted_batches = [], []
for batch_index in range(len(val_generator)):
    batch_images, batch_one_hot = val_generator[batch_index]
    true_batches.append(np.argmax(batch_one_hot, axis=1))  # one-hot -> index
    batch_probabilities = cnn.predict(batch_images, verbose=0)
    predicted_batches.append(np.argmax(batch_probabilities, axis=1))

true_labels = np.concatenate(true_batches)
predicted_labels = np.concatenate(predicted_batches)

# Class names in label-index order, taken from the generator so the tick
# labels always match the classes that are actually on disk
class_names = [name.capitalize()
               for name, _ in sorted(val_generator.class_indices.items(),
                                     key=lambda item: item[1])]

# Calculate the confusion matrix; `labels` pins its shape to the number of
# classes even if one class never occurs in this evaluation
conf_matrix = confusion_matrix(true_labels, predicted_labels,
                               labels=list(range(len(class_names))))

# Plot the confusion matrix as a heatmap
plt.figure(figsize=(6, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# List the misclassifications of the baseline CNN on one validation batch.

# Pull one batch of images and one-hot labels from the validation generator.
# The builtin next() is used because the iterator's .next() method is not
# available on newer Keras versions.
validation_data, one_hot_labels = next(val_generator)

# class_mode='categorical' yields one-hot rows; convert them to class indices
# so they can be compared element-wise with the argmax predictions
true_labels = np.argmax(one_hot_labels, axis=1)

# Make predictions using your trained model
predictions = cnn.predict(validation_data)

# Get the predicted labels
predicted_labels = np.argmax(predictions, axis=1)

# Find indices where predicted labels don't match true labels
misclassified_indices = np.where(predicted_labels != true_labels)[0]

# Get the misclassified data and true labels
misclassified_data = validation_data[misclassified_indices]
misclassified_true_labels = true_labels[misclassified_indices]
misclassified_predicted_labels = predicted_labels[misclassified_indices]

# Class names in label-index order, taken from the generator so an index
# always maps to the folder it was trained on
class_names = [name.capitalize()
               for name, _ in sorted(val_generator.class_indices.items(),
                                     key=lambda item: item[1])]
misclassified_true_class_names = [class_names[int(label)] for label in misclassified_true_labels]
misclassified_predicted_class_names = [class_names[int(label)] for label in misclassified_predicted_labels]

# Print or further analyze misclassified data and true labels
for true_name, predicted_name in zip(misclassified_true_class_names,
                                     misclassified_predicted_class_names):
    print(f"True Label: {true_name}, Predicted Label: {predicted_name}")


In [None]:
# Write the trained baseline CNN to disk in the native Keras format
# (the model only; the training `history` is not saved)
model_filename = "cnn_model.keras"
cnn.save(model_filename)

# Pickle the model's file name (not the model itself) into the Streamlit
# folder for deployment
with open("../streamlit/cnn_model.pkl", "wb") as model_file:
    pickle.dump(model_filename, model_file)

### Prediction

In [None]:
# Path to the test image that stands in for a user upload
uploaded_image_path = "../testdata/test_image.jpg"

# Open the image with PIL and display it as a quick visual check
uploaded_image = Image.open(uploaded_image_path)
plt.imshow(uploaded_image)
plt.axis('off')  # Turn off axis numbers and ticks
plt.show()

In [None]:
# Classify the uploaded image with the baseline CNN.

# Class labels in label-index order, taken from the training generator so a
# predicted index always maps to the class folder it was trained on
class_labels = [name.capitalize()
                for name, _ in sorted(train_generator.class_indices.items(),
                                      key=lambda item: item[1])]

# Read the uploaded image, resized to the spatial size the CNN was built
# with (input_shape is (batch, height, width, channels))
image = tf.keras.utils.load_img(uploaded_image_path,
                                target_size=cnn.input_shape[1:3])
image_array = tf.keras.utils.img_to_array(image)
image_array = np.expand_dims(image_array, axis=0)  # Add batch dimension
image_array /= 255.0  # Normalize the pixel values, same as during training

# Make prediction
predictions = cnn.predict(image_array)
predicted_class_index = int(np.argmax(predictions))
predicted_class_label = class_labels[predicted_class_index]

# Display the predicted class label
print(f"Predicted Class: {predicted_class_label}")