In [1]:
from __future__ import print_function

from time import time
import logging
import os
import yaml
import cv2
import math
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import accuracy_score

# Configure the root logger: emit INFO and above, each message prefixed with a timestamp
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

In [2]:
# Extract the dataset path from a YAML config file located next to the notebook.
# os.path.join keeps the path portable (the original '.\config.yml' literal only
# resolves on Windows), and the context manager closes the file handle promptly.
with open(os.path.join(os.curdir, 'config.yml')) as config_file:
    config = yaml.safe_load(config_file)

# Root folder of the dataset; read from the 'path' key of the config file
rootPath = config['path']

In [3]:
#### Load the images and split them into training and test sets
#
# Every sub-folder of `rootPath` is treated as one class; the folder name is
# used as the label. Each image is converted to grayscale, resized, scaled to
# [0, 1] and flattened into a 1D feature vector.

# Containers for the training split (features and labels)
X_train = []
y_train = []

# Containers for the test split (features and labels)
X_test = []
y_test = []

###
#  The images are stored in one folder per class ('man' and 'woman'), so each
#  folder is divided into training and test portions directly while it is read.
#  Consequently, the 'train_test_split' function is not required for this dataset.
###

# Fraction of each class folder that goes to the test set
test_size = 0.4

# Maximum number of images read from each class folder (training + test).
# A larger amount of data takes unnecessarily long to process.
max_data_size = 4000

# Target (width, height) every image is resized to before flattening
image_size = (300, 300)

# Get all directories stored in the root folder.
# os.listdir returns entries in arbitrary order, so sort them to make the
# index-based split below reproducible across runs and platforms.
dirs = sorted(os.listdir(rootPath))

for singleDir in dirs:
    # Add the directory name after the root path
    filePath = os.path.join(rootPath, singleDir)

    # Only class folders contain images; skip stray files in the root folder
    if not os.path.isdir(filePath):
        continue

    # Get all file names stored in the class folder (sorted for reproducibility)
    files = sorted(os.listdir(filePath))

    # If the folder is empty, move on to the next folder
    if len(files) == 0:
        continue

    # Cap the number of files at the maximum data size, then compute the index
    # of the last file that belongs to the training portion.
    # max(0, ...) guarantees that at least the first file is used for training.
    data_size = min(len(files), max_data_size)
    last_train_index = max(0, math.floor(data_size * (1 - test_size)) - 1)

    for i in range(data_size):
        imagePath = os.path.join(filePath, files[i])

        # cv2.imread returns None (instead of raising) for unreadable or
        # non-image files; skip those rather than crashing in cvtColor.
        image = cv2.imread(imagePath)
        if image is None:
            logging.warning('Skipping unreadable image: %s', imagePath)
            continue

        # Convert to grayscale, resize, scale to [0, 1] and flatten to 1D
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        farr = np.array(cv2.resize(gray, image_size)) / 255
        farr = farr.reshape(-1)
        fname = singleDir

        # Append the single data (1D array) to the training list
        # if the current index is less than or equal to the last train index.
        # Otherwise, it is added to the test list.
        if i <= last_train_index:
            X_train.append(farr)
            y_train.append(fname)
        else:
            X_test.append(farr)
            y_test.append(fname)

# Convert all the data lists into numpy arrays
X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)

X_train: (4800, 90000)
X_test: (3200, 90000)
y_train: (4800,)
y_test: (3200,)


The **train_test_split function** is not used because the data are divided into training and test lists while they are loaded. The data files are located in two folders, 'man' and 'woman'. Both the training and the test list must contain files from both folders, so each folder is split separately with a 60/40 train/test ratio (`test_size = 0.4`). This manual separation is more manageable than using the train_test_split function in this case.

In [4]:
# from sklearn.model_selection import train_test_split

# X_train, X_test, y_train, y_test = train_test_split(arr, targets, test_size=0.25, shuffle=False)

# Algorithm
The task is to determine the gender of the person in the given picture. For this problem, the algorithm needs to classify the datasets into two categories: man and woman.
#### 1. DecisionTree
: a non-parametric supervised learning method used for classification and regression.
#### 2. RandomForest
: a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.
#### 3. SupportVector
: a set of supervised learning methods used for classification, regression and outliers detection.
#### 4. KNeighbors
: a non-parametric supervised learning method. The input consists of the k closest training examples in a data set, and the output depends on whether k-NN is used for classification or regression; for classification, a sample is assigned to the class most common among its k nearest neighbours (see [Wikipedia](https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm)).

### 1. DecisionTreeClassifier

In [5]:
from sklearn import tree

# Fit a single decision tree on the flattened images.
# random_state=0 makes the result reproducible; max_depth and max_features
# deliberately restrict the tree (depth of at most 10, 4 candidate features per split).
tree_clf = tree.DecisionTreeClassifier(random_state=0, max_depth=10, max_features=4).fit(X_train, y_train)
tree_prediction = tree_clf.predict(X_test)

# Accuracy on the data the model was fitted on vs. on unseen test data.
# accuracy_score expects its arguments in the order (y_true, y_pred).
tree_acc_train = tree_clf.score(X_train, y_train)
tree_acc_test = accuracy_score(y_test, tree_prediction)
print('[ DecisionTreeClassifier ] Accuracy: %.2f%% with train data and %.2f%% with test data'
      % (tree_acc_train * 100, tree_acc_test * 100))

[ DecisionTreeClassifier ] Accuracy: 82.02% with train data and 72.34% with test data


### 2. RandomForestClassifier

In [14]:
from sklearn.ensemble import RandomForestClassifier

# Fit a random forest with default hyper-parameters.
# random_state=0 fixes the bootstrap and feature sampling so that the reported
# accuracy is reproducible, consistent with the seeded DecisionTreeClassifier.
rf_clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
rf_prediction = rf_clf.predict(X_test)

# Accuracy on the data the model was fitted on vs. on unseen test data.
# accuracy_score expects its arguments in the order (y_true, y_pred).
rf_acc_train = rf_clf.score(X_train, y_train)
rf_acc_test = accuracy_score(y_test, rf_prediction)
print('[ RandomForestClassifier ] Accuracy: %.2f%% with train data and %.2f%% with test data'
      % (rf_acc_train * 100, rf_acc_test * 100))

[ RandomForestClassifier ] Accuracy: 99.98% with train data and 88.19% with test data


#### DecisionTreeClassifier vs. RandomForestClassifier
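Based on the test accuracy scores, RandomForestClassifier outperformed DecisionTreeClassifier for this problem (88.19% vs. 72.34%). RandomForestClassifier is more effective as it combines multiple trees, whereas DecisionTreeClassifier uses a single tree. Note that the comparison is not like-for-like: the single tree was restricted with `max_depth=10` and `max_features=4`, while the forest used its default settings. The gap between the forest's training accuracy (99.98%) and its test accuracy (88.19%) also indicates that it overfits the training data.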

### 3. SupportVectorClassifier (SVC)

#### 3-1. SVC using OVR

In [7]:
from sklearn import svm

# Fit a support vector classifier with default settings
# (the default decision_function_shape is 'ovr').
svm_clf = svm.SVC().fit(X_train, y_train)
svm_prediction = svm_clf.predict(X_test)

# Accuracy on the data the model was fitted on vs. on unseen test data.
# accuracy_score expects its arguments in the order (y_true, y_pred).
svm_acc_train = svm_clf.score(X_train, y_train)
svm_acc_test = accuracy_score(y_test, svm_prediction)
print('[ SupportVectorClassification ] Accuracy: %.2f%% with train data and %.2f%% with test data'
      % (svm_acc_train * 100, svm_acc_test * 100))

[ SupportVectorClassification ] Accuracy: 93.62% with train data and 88.50% with test data


The decision function shapes for Support Vector Machines (SVM) are **'ovr (one-vs-rest)'** and **'ovo (one-vs-one)'**. As 'ovr' is set as the default, the above result needs to be compared with that obtained using the other decision function shape, 'ovo'.

#### 3-2. SVC using OVO

In [8]:
# Same classifier as above, but requesting the one-vs-one decision function.
# NOTE: per the scikit-learn documentation, decision_function_shape is ignored
# for binary classification, so with two classes this is expected to reproduce
# the result of the default ('ovr') run.
svm_ovo_clf = svm.SVC(decision_function_shape='ovo').fit(X_train, y_train)
svm_ovo_prediction = svm_ovo_clf.predict(X_test)

# Accuracy on the data the model was fitted on vs. on unseen test data.
# accuracy_score expects its arguments in the order (y_true, y_pred).
svm_ovo_acc_train = svm_ovo_clf.score(X_train, y_train)
svm_ovo_acc_test = accuracy_score(y_test, svm_ovo_prediction)
print('[ SupportVectorClassification (OVO) ] Accuracy: %.2f%% with train data and %.2f%% with test data'
      % (svm_ovo_acc_train * 100, svm_ovo_acc_test * 100))

[ SupportVectorClassification (OVO) ] Accuracy: 93.62% with train data and 88.50% with test data


The two runs produce identical results. According to the scikit-learn manual (https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html), `decision_function_shape` is ignored for binary classification, and this problem has only two classes (man and woman). Note also that 'ovo' is always used internally as the multi-class strategy to train models, while an 'ovr' matrix is only constructed from the 'ovo' matrix.
- If decision_function_shape='ovo', the function values are proportional to the distance of the samples X to the separating hyperplane.
- If decision_function_shape='ovr', the decision function is a monotonic transformation of the 'ovo' decision function.

### 4. KNeighborsClassifier

In [9]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a k-nearest-neighbours classifier that uses the 2 closest training
# samples under the Euclidean distance.
kn_clf = KNeighborsClassifier(n_neighbors=2, metric='euclidean').fit(X_train, y_train)
kn_prediction = kn_clf.predict(X_test)

# Accuracy on the data the model was fitted on vs. on unseen test data.
# accuracy_score expects its arguments in the order (y_true, y_pred).
kn_acc_train = kn_clf.score(X_train, y_train)
kn_acc_test = accuracy_score(y_test, kn_prediction)
print('[ KNeighborsClassifier ] Accuracy: %.2f%% with train data and %.2f%% with test data'
      % (kn_acc_train * 100, kn_acc_test * 100))

[ KNeighborsClassifier ] Accuracy: 91.90% with train data and 71.03% with test data


# The Best Classifier Method
### based on the accuracy scores

In [15]:
import plotly.graph_objs as go

# Prepare the resources for visualisation.
# The three sequences below are parallel: index k refers to the same classifier.
methods = ['DecisionTree', 'RandomForest', 'SupportVector (ovr)', 'SupportVector (ovo)', 'KNeighbors']
acc_test = np.array([ tree_acc_test, rf_acc_test, svm_acc_test, svm_ovo_acc_test, kn_acc_test ])
acc_train = np.array([ tree_acc_train, rf_acc_train, svm_acc_train, svm_ovo_acc_train, kn_acc_train ])

# Lookup table: classifier name -> test accuracy
acc_mth = dict(zip(methods, acc_test))

# Positions of the best and worst test accuracy.
# On ties, argmax/argmin return the first occurrence in `methods`.
acc_max = np.argmax(acc_test)
acc_min = np.argmin(acc_test)

# Print the test accuracy (in percent) for each classifier, in the order of `methods`
for method, score in zip(methods, acc_test):
    print('%s : %.2f' % (method, score * 100))

# Draw the accuracy scores for both training and test data for each classifier
trace_test = go.Bar(x=methods, y=acc_test, name='test', text=acc_test, marker_color='rosybrown')
trace_train = go.Bar(x=methods, y=acc_train, name='train', text=acc_train, marker_color='slateblue')
fig = go.Figure(data=[trace_train, trace_test])
# Title and axis labels let the figure stand alone; accuracies are fractions in [0, 1]
fig.update_layout(
    title='Classifier accuracy on training and test data',
    xaxis_title='Classifier',
    yaxis_title='Accuracy',
    yaxis_range=[0, 1],
)
fig.update_traces(texttemplate='%{text:.2%}', textposition='inside')
fig.show()

print('The best  classifier for this problem is %s (%.2f%%)' % (methods[acc_max], acc_test[acc_max] * 100))
print('The worst classifier for this problem is %s (%.2f%%)' % (methods[acc_min], acc_test[acc_min] * 100))

DecisionTree : 72.34
RandomForest : 88.19
SupportVector (ovr) : 88.50
SupportVector (ovo) : 88.50
KNeighbors : 71.03


The best  classifier for this problem is SupportVector (ovr) (88.50%)
The worst classifier for this problem is KNeighbors (71.03%)
