## The training data contains profile images collected from themovedb.org and labeled for each celebrity in the top 100 list.
* https://www.themoviedb.org/person/6384-keanu-reeves/images/profiles?language=en-US

* Extract the facial features and labels from the training data set
* Save the vectors into pickle files for the classifier

### Run the Docker image to install all the necessary libraries for the image and facial processing.


In [5]:
import os
os.environ["ESHOST"] ='localhost'
os.environ["ESPORT"] ='9200'
ESHOST= os.environ.get('ESHOST')
ESPORT= os.environ.get('ESPORT')


In [6]:
ESHOST

'localhost'

In [17]:
import math
from sklearn import neighbors
import os
import os.path
from pprint import pprint
import pickle
import pandas as pd
from PIL import Image, ImageDraw

ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg'}
import logging
logger = logging.getLogger('facialRecognition')
logger.setLevel(logging.INFO)


## Process celebrity images and perform facial recognition using the Dlib and OpenFace AI libraries.
* Crawl through the celebrity image folders.  Each folder is labeled with the actorID
* Store the facial vectors and labels for the classifcation models.


In [10]:
import face_recognition
from face_recognition.face_recognition_cli import image_files_in_folder

X = []
y = []
images = []

train_dir="knn_examples/train"

# Loop through each person in the training set
for class_dir in os.listdir(train_dir):
    if not os.path.isdir(os.path.join(train_dir, class_dir)):
        continue

    # Loop through each training image for the current person
    for img_path in image_files_in_folder(os.path.join(train_dir, class_dir)):
        try:
            image = face_recognition.load_image_file(img_path)
            face_bounding_boxes = face_recognition.face_locations(image)

            if len(face_bounding_boxes) != 1:
                # If there are no people (or too many people) in a training image, skip the image.
                if verbose:
                    print("Image {} not suitable for training: {}".format(img_path, "Didn't find a face" if len(face_bounding_boxes) < 1 else "Found more than one face"))
            else:
                # Add face encoding for current image to the training set
                X.append(face_recognition.face_encodings(image, known_face_locations=face_bounding_boxes)[0])
                y.append(class_dir)
                images.append(img_path)
        except Exception as e:
            logger.error('Failed to retrieve data : '+img_path+' :: '+ str(e))

import pickle
trainDict = { "labels" : y , "images" : images}
pickle.dump( trainDict, open( "./models/faceLabels.pkl", "wb" ) )
pickle.dump( X, open( "./models/faceLFeatures.pkl", "wb" ) )


ModuleNotFoundError: No module named 'dlib'

## Load the facial recognition vectors along with the celebrity biometric data from the open web.


In [11]:
faceLabels = pickle.load( open( './data/faceLabels.p', 'rb' ) )
faceFeatures = pickle.load( open( './data/faceFeatures.p', 'rb' ) )


In [None]:
faceFeatures

In [2]:

faceLabels = pickle.load( open( './models/faceLabels.pkl', 'rb' ) )
faceFeatures = pickle.load( open( './models/faceFeatures.pkl', 'rb' ) )


In [None]:
faceLabels['labels'][:40]

In [None]:
faceLabels['images'][:40]

In [21]:
celebrityImages =[]
for images in faceLabels['images']:
    image = images.split('/')
    celebrityImages.append(image[-1])
len(celebrityImages)

1290

In [22]:
celebrityFacial=[]
for feature in faceFeatures:
    celebrityFacial.append(feature.tolist())
len(celebrityFacial)

1290

In [23]:
celebrityBio = pd.read_json('./celebrityBio.json', dtype=str)
celebrityBio.head()

Unnamed: 0,_actorID,actor,age,birthdate,gender,height
0,49,Dave Bautista,50,1969-01-18,male,"6' 6"" (1.98 m)"
1,6,Tom Cruise,57,1962-07-03,male,"5' 7"" (1.7 m)"
2,73,Tim Allen,66,1953-06-13,male,"5' 10½"" (1.79 m)"
3,24,Chadwick Boseman,42,1976-11-29,male,6' (1.83 m)
4,32,Sandra Bullock,55,1964-07-26,female,"5' 7½"" (1.71 m)"


In [23]:

# topActors = pd.read_csv('./top100Actors.txt', 
#                      sep='\t',
#                      header=0, 
#                     dtype=str)

actorLables = pd.DataFrame( faceLabels['labels'], columns=['_actorID'])
actorLookup = actorLables.merge( celebrityBio, how='inner', left_on = '_actorID', right_on = '_actorID')
#actorLookup = actorLookup[['actorID','First Name','Last Name']]
actorNameRef =actorLookup.values.tolist()
len(actorNameRef)

pickle.dump( actorNameRef, open( "./models/actorNameRef.pkl", "wb" ) )

Unnamed: 0,_actorID,actor,age,birthdate,gender,height
0,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
1,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
2,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
3,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
4,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
5,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
6,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
7,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
8,1,Robert Downey Jr.,54,1965-04-04,male,"5' 8½"" (1.74 m)"
9,10,Harrison Ford,77,1942-07-13,male,"6' 1"" (1.85 m)"


## Elasticsearch will store image classification and perform a similary and recommendation model using the facial image vectors and biometric.

In [15]:
# Load pre-saved models 
faceLabels = pickle.load( open( './models/faceLabels.pkl', 'rb' ) )
faceFeatures = pickle.load( open( './models/faceFeatures.pkl', 'rb' ) )
actorNameRef = pickle.load( open( './models/actorNameRef.pkl', 'rb' ) )


In [18]:
import requests
from pprint import pprint
from datetime import datetime
from elasticsearch import Elasticsearch
import json


_index='facialimages'
esIndex = 'http://'+ESHOST+':'+ESPORT+'/'+_index  
es=Elasticsearch([{'host':ESHOST,'port':ESPORT}])

_doc= 'facialRecognition'

headers={"Content-Type": "application/json"}

## Set the index to a dense vector space to hold the 128 facial data points.

In [27]:
# Build Elasticsearch index for facial vectors
request_body = {
  "mappings": {
    "properties": {
      "_actorID" : {
        "type" : "keyword"
      },     
      "celebrity" : {
        "type" : "keyword"
      }, 
      "facial_vector": {
        "type": "dense_vector",
        "dims": 128
      }

    }
  }
}

try:
    es.indices.delete(index = _index)
except:
    pass
    
es.indices.create(index = _index, body = request_body)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'facialimages'}

## Build the ES documents and insert for indexing

In [28]:
# Load Facail data into ES
for i in range(len(actorNameRef)):
    _id = celebrityImages[i].split('.')[0]
    #print(_id)
    doc = {
        #_actorID	actor	age	birthdate	gender	height
        "_actorID" : actorNameRef[i][0],
        "celebrity": actorNameRef[i][1],
        "age": actorNameRef[i][2],
        "birthdate": actorNameRef[i][3],
        "gender": actorNameRef[i][4],
        "height": actorNameRef[i][5],
        "image" : celebrityImages[i],
        "facial_vector": celebrityFacial[i]  
        }
    r = requests.put(esIndex+'/_doc/'+_id, headers= headers, data = json.dumps(doc))
    print (r.text)

{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_98_l4oaexOrj9KGqVmc2TK8haHfQ9E","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1280,"_primary_term":1}
{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_98_lfW2cDigTcvuoe4k6UfCdGVE1R6","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1281,"_primary_term":1}
{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_98_qaO2MTBhIKcAJg2rjMmoKCIBcXF","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1282,"_primary_term":1}
{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_98_qPgSCfqFHU12KaDeYzkbq8B2uTv","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_seq_no":1283,"_primary_term":1}
{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_98_t1HaRL7lRJemWySXcXxOT8fAGhj","_version":1,"result":"created","_shards":{"total":2,"successful":1,"failed":0},"_se

In [None]:
es.indices.stats(index=_index)

## Test a given document to ensure the document was inserted correctly.

In [19]:
r = requests.get(esIndex+'/_doc/themoviedb_i_17_mjxJd59SSdA0kiD6ju5WHpdLxy5')
pprint (r.text)

('{"_index":"facialimages","_type":"_doc","_id":"themoviedb_i_17_mjxJd59SSdA0kiD6ju5WHpdLxy5","_version":1,"_seq_no":103,"_primary_term":1,"found":true,"_source":{"_actorID": '
 '"17", "celebrity": "Don Cheadle", "age": "54", "birthdate": "1964-11-29", '
 '"gender": "male", "height": "5\' 7\\u00be\\" (1.72 m)", "image": '
 '"themoviedb_i_17_mjxJd59SSdA0kiD6ju5WHpdLxy5.jpg", "facial_vector": '
 '[-0.0865994319319725, 0.1547723263502121, 0.11647135764360428, '
 '-0.05348784476518631, -0.026111885905265808, -0.004996640142053366, '
 '0.02010984718799591, -0.09138417989015579, 0.16325019299983978, '
 '-0.10393447428941727, 0.26392027735710144, 0.04262522608041763, '
 '-0.17962029576301575, -0.09791891276836395, 0.09553860872983932, '
 '0.08158333599567413, -0.29037392139434814, -0.08611540496349335, '
 '-0.05405981093645096, -0.06868094205856323, -0.05936623737215996, '
 '0.05098782852292061, 0.08535328507423401, -0.013151603750884533, '
 '-0.059518761932849884, -0.26108041405677795, -0.07

## Test Search Query based on a given facial vector space.

In [28]:
from pprint import pprint
#celebrityFacial[1]
id=10
print ( actorNameRef[id], ' : ', celebrityImages[id])
facicalVector =[-0.0865994319319725, 0.1547723263502121, 0.11647135764360428, -0.05348784476518631, -0.026111885905265808, -0.004996640142053366, 0.02010984718799591, -0.09138417989015579, 0.16325019299983978, -0.10393447428941727, 0.26392027735710144, 0.04262522608041763, -0.17962029576301575, -0.09791891276836395, 0.09553860872983932, 0.08158333599567413, -0.29037392139434814, -0.08611540496349335, -0.05405981093645096, -0.06868094205856323, -0.05936623737215996, 0.05098782852292061, 0.08535328507423401, -0.013151603750884533, -0.059518761932849884, -0.26108041405677795, -0.07175756245851517, -0.13915595412254333, 0.15117278695106506, -0.0414716862142086, -0.06382555514574051, 0.005769657902419567, -0.2461748570203781, -0.03276572376489639, -0.062235064804553986, -0.029176823794841766, 0.03860582783818245, -0.026595083996653557, 0.19672393798828125, 0.047811634838581085, -0.09344539791345596, -0.07775542885065079, -0.0040527209639549255, 0.22311992943286896, 0.21590833365917206, -0.019895289093255997, -0.04806771129369736, 0.07343978434801102, 0.01277383416891098, -0.16670356690883636, 0.07625529915094376, 0.10308811813592911, 0.20396625995635986, 0.057876359671354294, 0.004202492069453001, -0.19998204708099365, -0.03275959938764572, -0.002369481138885021, -0.08730950951576233, 0.06798733025789261, 0.09074174612760544, -0.0927027091383934, -0.020686306059360504, -0.007380591239780188, 0.33890408277511597, 0.09122084826231003, -0.08129587769508362, -0.18075791001319885, 0.10208127647638321, -0.09331383556127548, -0.0988638624548912, 0.047538258135318756, -0.1588563323020935, -0.017572728917002678, -0.26215359568595886, 0.0427873358130455, 0.25608041882514954, 0.11964087188243866, -0.198289692401886, -0.005372968502342701, -0.20604664087295532, -0.041313473135232925, -0.00883499439805746, 0.09691508114337921, -0.09412065148353577, 0.07351617515087128, -0.07343287765979767, -0.011971116065979004, 0.12895527482032776, -0.00040591973811388016, -0.11712130159139633, 0.2198733389377594, -0.06745602190494537, 0.03355204313993454, -0.009182915091514587, -0.15589208900928497, 0.01921061798930168, -0.05102435499429703, -0.08595862984657288, -0.0336931049823761, -0.0066622281447052956, -0.12268581986427307, -0.04636511579155922, 0.1837030053138733, -0.22998324036598206, 0.0762787014245987, -0.0007653213106095791, 0.011642297729849815, 0.03597094863653183, 0.06777633726596832, -0.09956732392311096, -0.06468761712312698, 0.15399740636348724, -0.2643252909183502, 0.24139092862606049, 0.1924624890089035, 0.02700689062476158, 0.10112108290195465, 0.03542455658316612, 0.11118753254413605, -0.034934964030981064, 0.0791907086968422, -0.11021479219198227, -0.06284264475107193, 0.05167575180530548, -0.06907293945550919, 0.10757213085889816, 0.093105748295784]

q={
  "_source": {
    "includes": [ "celebrity"]
    },
  "size": 20,
  "query": {
    "script_score": {
      "query" : {
        "match_all" : {}
      },
      "script": {
        "source": "cosineSimilarity(params.query_vector, doc['facial_vector']) ", 
        "params": {
          "query_vector": facicalVector
        }
      }
    }
  }
}
res= es.search(index=_index, body=q)

pprint(res)

['10', 'Harrison Ford', '77', '1942-07-13', 'male', '6\' 1" (1.85 m)']  :  themoviedb_i_10_5Vj23wvkRVrx2Lc2lVZh5MEbIwg.jpg
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 1, 'total': 1},
 'hits': {'hits': [{'_id': 'themoviedb_i_17_mjxJd59SSdA0kiD6ju5WHpdLxy5',
                    '_index': 'facialimages',
                    '_score': 1.0,
                    '_source': {'celebrity': 'Don Cheadle'},
                    '_type': '_doc'},
                   {'_id': 'themoviedb_i_17_4PvWfLvABO5n1ZfzK76vol9Bqae',
                    '_index': 'facialimages',
                    '_score': 0.987859,
                    '_source': {'celebrity': 'Don Cheadle'},
                    '_type': '_doc'},
                   {'_id': 'themoviedb_i_17_hx8L3nxhZdF1SN71iWqMkrf0Wm5',
                    '_index': 'facialimages',
                    '_score': 0.9851827,
                    '_source': {'celebrity': 'Don Cheadle'},
                    '_type': '_doc'},
                   {'_id': 'themov

In [29]:
test ={}

matches = res['hits']['hits']
results={}
for match in matches:
    celebrirty= match['_source']['celebrity']
    print(celebrirty)
    if celebrirty not in  results:
        results[celebrirty] =  match['_score']
        print( results)
        
        
        

Don Cheadle
{'Don Cheadle': 1.0}
Don Cheadle
Don Cheadle
Don Cheadle
Don Cheadle
Don Cheadle
Don Cheadle
Chadwick Boseman
{'Don Cheadle': 1.0, 'Chadwick Boseman': 0.90763265}
Chadwick Boseman
Kevin Hart
{'Don Cheadle': 1.0, 'Chadwick Boseman': 0.90763265, 'Kevin Hart': 0.8967935}
Eddie Murphy
{'Don Cheadle': 1.0, 'Chadwick Boseman': 0.90763265, 'Kevin Hart': 0.8967935, 'Eddie Murphy': 0.8967278}
Chadwick Boseman
Chadwick Boseman
Chadwick Boseman
John Boyega
{'Don Cheadle': 1.0, 'Chadwick Boseman': 0.90763265, 'Kevin Hart': 0.8967935, 'Eddie Murphy': 0.8967278, 'John Boyega': 0.8914113}
Kevin Hart
John Boyega
Chadwick Boseman
Chadwick Boseman
Idris Elba
{'Don Cheadle': 1.0, 'Chadwick Boseman': 0.90763265, 'Kevin Hart': 0.8967935, 'Eddie Murphy': 0.8967278, 'John Boyega': 0.8914113, 'Idris Elba': 0.88909835}
