<h1 style='color:green'> World Leaders Classifier </h1>

<h3 style ='color:purple'>Step 1: Data Collection</h3>

<h4>Tool Used: </h4>
    * Fatkun Google Extension
    
<h4>Dataset collected</h4>
    * Donald_Trump<br>
    * Jeff_Bezos<br>
    * Narendra_Modi<br>
    * Vladimir_Putin<br>
    * Xi_Jinping
    

<h3 style ='color:purple'>Step 2: Data Cleaning</h3>

#### (1) Preprocessing: Detect face and eyes 

In [211]:
# import library
import numpy as np
import cv2
import matplotlib.pyplot as plt

In [212]:
# Load the Haar cascade classifiers (XML files downloaded from GitHub) that are
# used to detect faces and eyes in the images.
face_cascade = cv2.CascadeClassifier('./opencv/haarcascades/haarcascade_frontalface_default.xml')
eye_cascade = cv2.CascadeClassifier('./opencv/haarcascades/haarcascade_eye.xml')

# A classifier built from a missing or unreadable XML file is left empty rather
# than raising, so fail here with a clear message instead of deep inside
# detectMultiScale later on.
if face_cascade.empty() or eye_cascade.empty():
    raise IOError("Could not load the Haar cascade XML files from ./opencv/haarcascades/")

def get_cropped_image_if_2_eyes(img_path, require_two_eyes=False):
    """Return the colour crop of the first accepted face found in an image.

    Parameters
    ----------
    img_path : str
        Path of the image file, read with ``cv2.imread``.
    require_two_eyes : bool, optional
        When True, a face is accepted only if the eye cascade finds at least
        two eyes inside it. The default (False) returns the first detected
        face without an eye check, which is what this function did before
        the parameter existed.

    Returns
    -------
    numpy.ndarray or None
        The region of the colour image covering the accepted face, or None
        when the image cannot be read or no face is accepted.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread signals an unreadable / non-image file by returning None;
        # without this guard cvtColor below would raise.
        return None

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, 1.3, 5)
    for (x, y, w, h) in faces:
        roi_color = img[y:y+h, x:x+w]
        if not require_two_eyes:
            # Eye detection is skipped entirely when its result is not needed.
            return roi_color
        roi_gray = gray[y:y+h, x:x+w]
        eyes = eye_cascade.detectMultiScale(roi_gray)
        if len(eyes) >= 2:
            return roi_color
    return None

In [213]:
# Source folder (one sub-folder of raw images per person) and the target
# folder, located inside it, that receives the cropped faces.
path_to_data = "./dataset/"
path_to_cr_data = path_to_data + "cropped/"

In [214]:
# Collect the path of every person's image folder found under path_to_data.
import os

# The cropped-output folder lives inside path_to_data. It must be skipped,
# otherwise a re-run (when the folder already exists) would pick up "cropped"
# as if it were one more person and it would end up as an extra class.
_cropped_root = os.path.normpath(path_to_cr_data)
img_dirs = [
    entry.path
    for entry in os.scandir(path_to_data)
    if entry.is_dir() and os.path.normpath(entry.path) != _cropped_root
]

In [215]:
# Show the per-person source folders that were collected.
img_dirs

['./dataset/Jeff_Bezos',
 './dataset/Vladimir_Putin',
 './dataset/Narendra_Modi',
 './dataset/Xi_Jinping',
 './dataset/Donald_Trump']

In [216]:
import shutil

# Start from an empty output folder: remove whatever is already at
# path_to_cr_data, then create the folder again.
cropped_root_exists = os.path.exists(path_to_cr_data)
if cropped_root_exists:
    shutil.rmtree(path_to_cr_data)
os.mkdir(path_to_cr_data)

In [217]:
# Crop the face out of every source image and save it as
# <path_to_cr_data><person>/<person><n>.png, remembering the saved paths.
cropped_image_dirs = []          # output folders created during this run
celebrity_file_names_dict = {}   # person name -> list of cropped image paths
for img_dir in img_dirs:
    # Never treat the output folder itself as a person.
    if os.path.normpath(img_dir) == os.path.normpath(path_to_cr_data):
        continue
    count = 1
    # basename of the normalised path is robust to trailing separators,
    # unlike splitting on '/'.
    celebrity_name = os.path.basename(os.path.normpath(img_dir))
    print(celebrity_name)
    celebrity_file_names_dict[celebrity_name] = []
    cropped_folder = path_to_cr_data + celebrity_name
    for entry in os.scandir(img_dir):
        # Sub-directories cannot be read as images; skip them.
        if not entry.is_file():
            continue
        roi_color = get_cropped_image_if_2_eyes(entry.path)
        if roi_color is None:
            continue
        # The person's output folder is created lazily, on the first usable face.
        if not os.path.exists(cropped_folder):
            os.makedirs(cropped_folder)
            cropped_image_dirs.append(cropped_folder)
            print("Generating cropped images in folder: ",cropped_folder)
        cropped_file_name = celebrity_name + str(count) + ".png"
        cropped_file_path = cropped_folder + "/" + cropped_file_name
        # cv2.imwrite reports failure through its return value; only record
        # (and number) files that were actually written.
        if not cv2.imwrite(cropped_file_path, roi_color):
            print("Could not write cropped image: ", cropped_file_path)
            continue
        celebrity_file_names_dict[celebrity_name].append(cropped_file_path)
        count += 1

Jeff_Bezos
Generating cropped images in folder:  ./dataset/cropped/Jeff_Bezos
Vladimir_Putin
Generating cropped images in folder:  ./dataset/cropped/Vladimir_Putin
Narendra_Modi
Generating cropped images in folder:  ./dataset/cropped/Narendra_Modi
Xi_Jinping
Generating cropped images in folder:  ./dataset/cropped/Xi_Jinping
Donald_Trump
Generating cropped images in folder:  ./dataset/cropped/Donald_Trump


#### Preprocessing: Use wavelet transform as a feature for training our model 
Wavelet transform

In [219]:
import numpy as np
import pywt
import cv2    

def w2d(img, mode='haar', level=1):
    """Return a detail-only wavelet reconstruction of an image as uint8.

    The image is converted to grayscale, cast to float32 and divided by 255,
    then decomposed with a 2-D wavelet transform. The first (approximation)
    coefficient array is zeroed, the image is reconstructed from what is left,
    multiplied by 255 and cast to uint8.

    Parameters
    ----------
    img : numpy.ndarray
        Colour image passed to ``cv2.cvtColor``.
    mode : str, optional
        Wavelet name handed to ``pywt.wavedec2`` / ``pywt.waverec2``.
    level : int, optional
        Decomposition level handed to ``pywt.wavedec2``.

    Returns
    -------
    numpy.ndarray
        The reconstructed image as a uint8 array.
    """
    # NOTE(review): the conversion code is RGB2GRAY, while images loaded with
    # cv2.imread are in BGR order. Confirm this is intended; changing it
    # alters the features, so training and inference code must change together.
    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    scaled = np.float32(gray) / 255

    # Decompose, then wipe the approximation coefficients so that only the
    # detail coefficients contribute to the reconstruction.
    coeffs_H = list(pywt.wavedec2(scaled, mode, level=level))
    coeffs_H[0] *= 0

    detail_img = pywt.waverec2(coeffs_H, mode) * 255
    return np.uint8(detail_img)

In [220]:
# Map each person's name to an integer class label, numbered from 0 in the
# order the names appear in celebrity_file_names_dict.
class_dict = {
    celebirity_name: label
    for label, celebirity_name in enumerate(celebrity_file_names_dict)
}
class_dict

{'Jeff_Bezos': 0,
 'Vladimir_Putin': 1,
 'Narendra_Modi': 2,
 'Xi_Jinping': 3,
 'Donald_Trump': 4}

#### Creating X and y values from the dataset for training the model

In [222]:
# Build the samples (X) and their labels (y).
# Each sample stacks the 32x32 colour pixels (32*32*3 values) on top of the
# 32x32 wavelet image (32*32 values), giving one 4096 x 1 column per image.
X = []
y = []

for celebirity_name, training_files in celebrity_file_names_dict.items():
    for training_image in training_files:
        img = cv2.imread(training_image)
        if img is None:
            # The cropped file could not be read back; leave it out.
            continue
        raw_column = cv2.resize(img, (32, 32)).reshape(32*32*3, 1)
        wavelet_column = cv2.resize(w2d(img, 'db1', 5), (32, 32)).reshape(32*32, 1)
        X.append(np.vstack((raw_column, wavelet_column)))
        y.append(class_dict[celebirity_name])

In [223]:
# Number of samples collected.
len(X)

505

In [224]:
# Number of values in one sample: 32*32*3 raw pixels + 32*32 wavelet pixels.
len(X[0])

4096

In [225]:
# Turn the list of column vectors into a 2-D float array with one row of 4096
# values per image; float input avoids repeated warning messages from the
# estimators.
n_samples = len(X)
X = np.asarray(X, dtype=float).reshape(n_samples, 4096)
X.shape

(505, 4096)

<h3 style='color:purple'>Step 3: Training the Model</h3>

In [227]:
# Import library
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

In [228]:
# Hold out 25% of the samples (scikit-learn's default test share, spelled out
# here) for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

#### Use SVM model

In [229]:
# Baseline: standardise the features, then fit an RBF-kernel SVM with C=10
# and report its score on the held-out test set.
baseline_steps = [
    ('scalar', StandardScaler()),
    ('svm', SVC(kernel='rbf', C=10)),
]
pipe = Pipeline(steps=baseline_steps)
pipe.fit(X_train, y_train)
pipe.score(X_test, y_test)

0.8661417322834646

#### Using Grid Search to find the optimal model for this dataset

In [230]:
#import library
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [231]:
# Candidate models and the hyper-parameter grids supplied to GridSearchCV.
# Grid keys are prefixed with the pipeline step names that make_pipeline
# generates for each estimator (e.g. "svc__", "randomforestclassifier__").
GRID_RANDOM_STATE = 0  # fixed seed so the randomised estimators are repeatable run to run

model_params = {
    'svm': {
        # random_state seeds the shuffling SVC uses for its probability estimates.
        'model': SVC(gamma='auto', probability=True, random_state=GRID_RANDOM_STATE),
        'params': {
            'svc__C': [1, 10, 100, 1000],
            'svc__kernel': ['rbf', 'linear']
        }
    },
    # Key spelling kept as is: later cells look the model up by this exact name.
    'ramdom_forest': {
        # Without a seed the forest (and therefore its scores) changes on every run.
        'model': RandomForestClassifier(random_state=GRID_RANDOM_STATE),
        'params': {
            'randomforestclassifier__n_estimators': [1, 5, 10]
        }
    },
    'logistic_regression': {
        'model': LogisticRegression(solver='liblinear', multi_class='auto'),
        'params': {
            'logisticregression__C': [1, 5, 10]
        }
    }
}

In [232]:
# Run a 5-fold grid search for every candidate model (each wrapped in a
# scaling pipeline), keep the best estimator per model, and tabulate the
# best cross-validation score and parameters.
import pandas as pd

score = []
best_estimators = {}
for algo, mp in model_params.items():
    search = GridSearchCV(
        make_pipeline(StandardScaler(), mp['model']),
        mp['params'],
        cv=5,
        return_train_score=False,
    )
    search.fit(X_train, y_train)
    score.append(dict(
        model=algo,
        best_score=search.best_score_,
        best_params=search.best_params_,
    ))
    best_estimators[algo] = search.best_estimator_

df = pd.DataFrame(score, columns=['model', 'best_score', 'best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.870281,"{'svc__C': 1, 'svc__kernel': 'linear'}"
1,ramdom_forest,0.682526,{'randomforestclassifier__n_estimators': 10}
2,logistic_regression,0.883614,{'logisticregression__C': 1}


In [233]:
# Score of the best SVM pipeline from the grid search on the held-out test set.
best_estimators['svm'].score(X_test, y_test)

0.889763779527559

In [234]:
 # Score of the best random-forest pipeline from the grid search on the held-out test set.
 best_estimators['ramdom_forest'].score(X_test, y_test)

0.7244094488188977

In [235]:
 # Score of the best logistic-regression pipeline from the grid search on the held-out test set.
 best_estimators['logistic_regression'].score(X_test, y_test)

0.889763779527559

In [236]:
# Keep the tuned SVM pipeline as the model to export.
# NOTE(review): in the recorded results logistic regression has the higher
# cross-validation score and the same test score as the SVM; confirm that
# choosing the SVM is intentional.
best_clf = best_estimators['svm']

### Saving the Artifacts for deployments

#### Use joblib and save the model as a pickle file

In [237]:
import joblib

# Persist the selected pipeline (scaler + classifier) to disk.
model_artifact_path = 'saved_model.pkl'
joblib.dump(best_clf, model_artifact_path)

['saved_model.pkl']

#### Saving the class dictionary as a JSON file

In [239]:
import json

# Store the name -> class-label mapping next to the model so that predicted
# labels can be translated back to names.
with open("class_dictionary.json","w") as f:
    json.dump(class_dict, f)