In [1]:
import numpy as np
import pandas as pd
import glob
import json

## Data Pre-processing

In [2]:
# read in csv file generated by Amazon API
path = r'C:\Users\movin\Documents\GitHub\PersonalityFromPhoto\data'
allFiles = glob.glob(path + '/*.csv')
df = pd.DataFrame()
list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_, index_col=None, header=0)
    list_.append(df)
df = pd.concat(list_, ignore_index=True)

In [3]:
len(df.index)

15997

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,fileName,Labels,FaceDetails
0,0,Facebook/ENFJ/1000006876806379.jpg,"[{'Name': 'Human', 'Confidence': 99.2957839965...",[{'BoundingBox': {'Width': 0.11999999731779099...
1,1,Facebook/ENFJ/1001091583356351.jpg,"[{'Name': 'Art', 'Confidence': 52.856147766113...",[]
2,2,Facebook/ENFJ/1001328126675771.jpg,"[{'Name': 'People', 'Confidence': 99.012985229...","[{'BoundingBox': {'Width': 0.4146634638309479,..."
3,3,Facebook/ENFJ/10100163853254202.jpg,"[{'Name': 'Art', 'Confidence': 91.370552062988...",[]
4,4,Facebook/ENFJ/10100238407492499.jpg,"[{'Name': 'Human', 'Confidence': 99.3081130981...","[{'BoundingBox': {'Width': 0.6088888645172119,..."


In [5]:
# rename the unnamed column
df = df.rename(columns={'Unnamed: 0': 'GroupIndex'})

In [7]:
# add an index column
df['index'] = np.arange(len(df.index))

In [9]:
cols = df.columns.tolist()
cols

['GroupIndex', 'fileName', 'Labels', 'FaceDetails', 'index']

In [10]:
cols = cols[-1:] + cols[:-1]
cols

['index', 'GroupIndex', 'fileName', 'Labels', 'FaceDetails']

In [12]:
df = df[cols]

In [13]:
df.head()

Unnamed: 0,index,GroupIndex,fileName,Labels,FaceDetails
0,0,0,Facebook/ENFJ/1000006876806379.jpg,"[{'Name': 'Human', 'Confidence': 99.2957839965...",[{'BoundingBox': {'Width': 0.11999999731779099...
1,1,1,Facebook/ENFJ/1001091583356351.jpg,"[{'Name': 'Art', 'Confidence': 52.856147766113...",[]
2,2,2,Facebook/ENFJ/1001328126675771.jpg,"[{'Name': 'People', 'Confidence': 99.012985229...","[{'BoundingBox': {'Width': 0.4146634638309479,..."
3,3,3,Facebook/ENFJ/10100163853254202.jpg,"[{'Name': 'Art', 'Confidence': 91.370552062988...",[]
4,4,4,Facebook/ENFJ/10100238407492499.jpg,"[{'Name': 'Human', 'Confidence': 99.3081130981...","[{'BoundingBox': {'Width': 0.6088888645172119,..."


In [14]:
# get the personality for every photo
personality_list = []
for i in range(len(df.index)):
    personality_list.append(df['fileName'][i][9:13])
len(personality_list)

15997

In [15]:
personality_list[:10]

['ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ',
 'ENFJ']

In [16]:
personality_list[-10:]

['ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP',
 'ISTP']

In [17]:
# add the personality column to dataframe
se = pd.Series(personality_list)
df['Personality'] = se.values
df.head()

Unnamed: 0,index,GroupIndex,fileName,Labels,FaceDetails,Personality
0,0,0,Facebook/ENFJ/1000006876806379.jpg,"[{'Name': 'Human', 'Confidence': 99.2957839965...",[{'BoundingBox': {'Width': 0.11999999731779099...,ENFJ
1,1,1,Facebook/ENFJ/1001091583356351.jpg,"[{'Name': 'Art', 'Confidence': 52.856147766113...",[],ENFJ
2,2,2,Facebook/ENFJ/1001328126675771.jpg,"[{'Name': 'People', 'Confidence': 99.012985229...","[{'BoundingBox': {'Width': 0.4146634638309479,...",ENFJ
3,3,3,Facebook/ENFJ/10100163853254202.jpg,"[{'Name': 'Art', 'Confidence': 91.370552062988...",[],ENFJ
4,4,4,Facebook/ENFJ/10100238407492499.jpg,"[{'Name': 'Human', 'Confidence': 99.3081130981...","[{'BoundingBox': {'Width': 0.6088888645172119,...",ENFJ


In [18]:
df.tail()

Unnamed: 0,index,GroupIndex,fileName,Labels,FaceDetails,Personality
15992,15992,995,Facebook/ISTP/1522341501192227.jpg,"[{'Name': 'Glasses', 'Confidence': 91.36360931...","[{'BoundingBox': {'Width': 1.0522222518920898,...",ISTP
15993,15993,996,Facebook/ISTP/1522843721140384.jpg,"[{'Name': 'Human', 'Confidence': 99.3121795654...","[{'BoundingBox': {'Width': 0.6979812383651733,...",ISTP
15994,15994,997,Facebook/ISTP/1523030107783316.jpg,"[{'Name': 'People', 'Confidence': 99.251564025...",[{'BoundingBox': {'Width': 0.38461539149284363...,ISTP
15995,15995,998,Facebook/ISTP/152404242162230.jpg,"[{'Name': 'Emblem', 'Confidence': 99.133453369...",[],ISTP
15996,15996,999,Facebook/ISTP/1524525840928834.jpg,"[{'Name': 'People', 'Confidence': 99.210029602...","[{'BoundingBox': {'Width': 1.0099999904632568,...",ISTP


In [19]:
# empty face details number
count = 0
for i in df['FaceDetails']:
    if len(i)==2:
        count += 1

In [20]:
count

3409

In [22]:
# replace empty face detail with NaN
df = df.replace('[]', np.nan)

In [23]:
df.tail()

Unnamed: 0,index,GroupIndex,fileName,Labels,FaceDetails,Personality
15992,15992,995,Facebook/ISTP/1522341501192227.jpg,"[{'Name': 'Glasses', 'Confidence': 91.36360931...","[{'BoundingBox': {'Width': 1.0522222518920898,...",ISTP
15993,15993,996,Facebook/ISTP/1522843721140384.jpg,"[{'Name': 'Human', 'Confidence': 99.3121795654...","[{'BoundingBox': {'Width': 0.6979812383651733,...",ISTP
15994,15994,997,Facebook/ISTP/1523030107783316.jpg,"[{'Name': 'People', 'Confidence': 99.251564025...",[{'BoundingBox': {'Width': 0.38461539149284363...,ISTP
15995,15995,998,Facebook/ISTP/152404242162230.jpg,"[{'Name': 'Emblem', 'Confidence': 99.133453369...",,ISTP
15996,15996,999,Facebook/ISTP/1524525840928834.jpg,"[{'Name': 'People', 'Confidence': 99.210029602...","[{'BoundingBox': {'Width': 1.0099999904632568,...",ISTP


In [24]:
# drop empty value
df = df.dropna()

In [25]:
df.tail()

Unnamed: 0,index,GroupIndex,fileName,Labels,FaceDetails,Personality
15990,15990,993,Facebook/ISTP/1520202358072757.jpg,"[{'Name': 'Human', 'Confidence': 99.3152389526...","[{'BoundingBox': {'Width': 0.4788888990879059,...",ISTP
15992,15992,995,Facebook/ISTP/1522341501192227.jpg,"[{'Name': 'Glasses', 'Confidence': 91.36360931...","[{'BoundingBox': {'Width': 1.0522222518920898,...",ISTP
15993,15993,996,Facebook/ISTP/1522843721140384.jpg,"[{'Name': 'Human', 'Confidence': 99.3121795654...","[{'BoundingBox': {'Width': 0.6979812383651733,...",ISTP
15994,15994,997,Facebook/ISTP/1523030107783316.jpg,"[{'Name': 'People', 'Confidence': 99.251564025...",[{'BoundingBox': {'Width': 0.38461539149284363...,ISTP
15996,15996,999,Facebook/ISTP/1524525840928834.jpg,"[{'Name': 'People', 'Confidence': 99.210029602...","[{'BoundingBox': {'Width': 1.0099999904632568,...",ISTP


In [26]:
len(df)

12580

## Face Detail Value

In [27]:
df['FaceDetails'][15996]

"[{'BoundingBox': {'Width': 1.0099999904632568, 'Height': 1.0099999904632568, 'Left': -0.17888888716697693, 'Top': -0.10444444417953491}, 'AgeRange': {'Low': 15, 'High': 25}, 'Smile': {'Value': True, 'Confidence': 90.81520080566406}, 'Eyeglasses': {'Value': False, 'Confidence': 93.44303894042969}, 'Sunglasses': {'Value': False, 'Confidence': 99.92829895019531}, 'Gender': {'Value': 'Female', 'Confidence': 100.0}, 'Beard': {'Value': False, 'Confidence': 99.99625396728516}, 'Mustache': {'Value': False, 'Confidence': 99.999267578125}, 'EyesOpen': {'Value': True, 'Confidence': 99.99658203125}, 'MouthOpen': {'Value': False, 'Confidence': 99.9348373413086}, 'Emotions': [{'Type': 'CALM', 'Confidence': 58.72126388549805}, {'Type': 'HAPPY', 'Confidence': 55.22835159301758}, {'Type': 'CONFUSED', 'Confidence': 3.6207854747772217}], 'Landmarks': [{'Type': 'eyeLeft', 'X': 0.2096046805381775, 'Y': 0.35737723112106323}, {'Type': 'eyeRight', 'X': 0.5451221466064453, 'Y': 0.33994126319885254}, {'Type': 

In [28]:
teststring = df['FaceDetails'][15996]

In [29]:
import json

In [30]:

json_acceptable_string = teststring.replace("'", "\"")
json_acceptable_string = json_acceptable_string.replace("True", "\"True\"")
json_acceptable_string = json_acceptable_string.replace("False", "\"False\"")
json_acceptable_string = json_acceptable_string[1:-1]

In [31]:
json_acceptable_string

'{"BoundingBox": {"Width": 1.0099999904632568, "Height": 1.0099999904632568, "Left": -0.17888888716697693, "Top": -0.10444444417953491}, "AgeRange": {"Low": 15, "High": 25}, "Smile": {"Value": "True", "Confidence": 90.81520080566406}, "Eyeglasses": {"Value": "False", "Confidence": 93.44303894042969}, "Sunglasses": {"Value": "False", "Confidence": 99.92829895019531}, "Gender": {"Value": "Female", "Confidence": 100.0}, "Beard": {"Value": "False", "Confidence": 99.99625396728516}, "Mustache": {"Value": "False", "Confidence": 99.999267578125}, "EyesOpen": {"Value": "True", "Confidence": 99.99658203125}, "MouthOpen": {"Value": "False", "Confidence": 99.9348373413086}, "Emotions": [{"Type": "CALM", "Confidence": 58.72126388549805}, {"Type": "HAPPY", "Confidence": 55.22835159301758}, {"Type": "CONFUSED", "Confidence": 3.6207854747772217}], "Landmarks": [{"Type": "eyeLeft", "X": 0.2096046805381775, "Y": 0.35737723112106323}, {"Type": "eyeRight", "X": 0.5451221466064453, "Y": 0.3399412631988525

In [32]:
d = json.loads(json_acceptable_string)

In [33]:
d

{'AgeRange': {'High': 25, 'Low': 15},
 'Beard': {'Confidence': 99.99625396728516, 'Value': 'False'},
 'BoundingBox': {'Height': 1.0099999904632568,
  'Left': -0.17888888716697693,
  'Top': -0.10444444417953491,
  'Width': 1.0099999904632568},
 'Confidence': 99.99915313720703,
 'Emotions': [{'Confidence': 58.72126388549805, 'Type': 'CALM'},
  {'Confidence': 55.22835159301758, 'Type': 'HAPPY'},
  {'Confidence': 3.6207854747772217, 'Type': 'CONFUSED'}],
 'Eyeglasses': {'Confidence': 93.44303894042969, 'Value': 'False'},
 'EyesOpen': {'Confidence': 99.99658203125, 'Value': 'True'},
 'Gender': {'Confidence': 100.0, 'Value': 'Female'},
 'Landmarks': [{'Type': 'eyeLeft',
   'X': 0.2096046805381775,
   'Y': 0.35737723112106323},
  {'Type': 'eyeRight', 'X': 0.5451221466064453, 'Y': 0.33994126319885254},
  {'Type': 'nose', 'X': 0.4138149917125702, 'Y': 0.5268155932426453},
  {'Type': 'mouthLeft', 'X': 0.2641378939151764, 'Y': 0.6857045292854309},
  {'Type': 'mouthRight', 'X': 0.5286892652511597,

In [34]:
len(d)

15

In [35]:
d.keys()

dict_keys(['BoundingBox', 'AgeRange', 'Smile', 'Eyeglasses', 'Sunglasses', 'Gender', 'Beard', 'Mustache', 'EyesOpen', 'MouthOpen', 'Emotions', 'Landmarks', 'Pose', 'Quality', 'Confidence'])

In [36]:
d['AgeRange']

{'High': 25, 'Low': 15}

In [37]:
d['Smile']

{'Confidence': 90.81520080566406, 'Value': 'True'}

In [38]:
d['Eyeglasses']

{'Confidence': 93.44303894042969, 'Value': 'False'}

In [39]:
d['Sunglasses']

{'Confidence': 99.92829895019531, 'Value': 'False'}

In [40]:
d['Gender']

{'Confidence': 100.0, 'Value': 'Female'}

In [41]:
d['Beard']

{'Confidence': 99.99625396728516, 'Value': 'False'}

In [42]:
d['Mustache']

{'Confidence': 99.999267578125, 'Value': 'False'}

In [43]:
d['EyesOpen']

{'Confidence': 99.99658203125, 'Value': 'True'}

In [44]:
d['MouthOpen']

{'Confidence': 99.9348373413086, 'Value': 'False'}

In [45]:
d['Emotions']

[{'Confidence': 58.72126388549805, 'Type': 'CALM'},
 {'Confidence': 55.22835159301758, 'Type': 'HAPPY'},
 {'Confidence': 3.6207854747772217, 'Type': 'CONFUSED'}]

In [46]:
d['Pose']

{'Pitch': -4.012670993804932,
 'Roll': -3.856233596801758,
 'Yaw': 5.892911434173584}

In [47]:
d['Quality']

{'Brightness': 35.42747497558594, 'Sharpness': 99.9980239868164}

In [48]:
d['Confidence']

99.99915313720703

In [49]:
QualityDF = pd.DataFrame(data=d['Quality'], index=[0])
QualityDF

Unnamed: 0,Brightness,Sharpness
0,35.427475,99.998024


In [50]:
d['Smile'].keys()

dict_keys(['Value', 'Confidence'])

In [51]:
d['Smile']['Confidence']

90.81520080566406

In [54]:
d['Gender']

{'Confidence': 100.0, 'Value': 'Female'}

In [56]:
d['Gender']['Value']

'Female'

In [57]:
d['Beard']

{'Confidence': 99.99625396728516, 'Value': 'False'}

In [58]:
d['Mustache']

{'Confidence': 99.999267578125, 'Value': 'False'}

In [59]:
d['EyesOpen']

{'Confidence': 99.99658203125, 'Value': 'True'}

In [60]:
d['MouthOpen']

{'Confidence': 99.9348373413086, 'Value': 'False'}

In [61]:
d['Emotions']

[{'Confidence': 58.72126388549805, 'Type': 'CALM'},
 {'Confidence': 55.22835159301758, 'Type': 'HAPPY'},
 {'Confidence': 3.6207854747772217, 'Type': 'CONFUSED'}]

In [62]:
d['Emotions'][0]

{'Confidence': 58.72126388549805, 'Type': 'CALM'}

### write a function to extract features

In [74]:
def get_emotion(string):
    """input face detail string, output list of emotion types"""
    # change the string to json acceptable format
    json_string = string.replace("'", "\"")
    json_string = json_string.replace("True", "\"True\"")
    json_string = json_string.replace("False", "\"False\"")
    json_string = json_string[1:-1]
    
    # convert to python object
    d = json.loads(json_string)
    
    # get emotions
    EmotionList = []
    for i in d['Emotions']:
        EmotionList.append(i['Type'])
        
    return EmotionList

In [89]:
df['FaceDetails'][5][2580:2650]

"76599}, {'Type': 'mouthUp', 'X': 0.36307042837142944, 'Y': 0.648285090"

In [83]:
get_emotion(df['FaceDetails'][5])

JSONDecodeError: Extra data: line 1 column 2952 (char 2951)

In [76]:
for i in range(len(df['FaceDetails'])):
    EmotionList = get_emotion(df['FaceDetails'][i])
    print(i, 'is done.')
    EmotionSet = set()
    for emotion in EmotionList:
        EmotionSet.add(emotion)

EmotionSet

JSONDecodeError: Extra data: line 1 column 2944 (char 2943)

In [92]:
SmileList = []
GenderList = []
BeardList = []
MustacheList = []
EyesOpenList = []
MouthOpenList = []


In [93]:
for string in df['FaceDetails']:
    # change the string to json acceptable format
    json_string = string.replace("'", "\"")
    json_string = json_string.replace("True", "\"True\"")
    json_string = json_string.replace("False", "\"False\"")
    json_string = json_string[1:-1]
    
    # convert to python object
    d = json.loads(json_string)
    
    # add value to list
    SmileList.append(d['Smile']['Confidence'])
    GenderList.append(d['Gender']['Value'])
    BeardList.append(d['Beard']['Value'])
    MustacheList.append(d['Mustache']['Value'])
    EyesOpenList.append(d['EyesOpen']['Value'])
    MouthOpenList.append(d['MouthOpen']['Value'])

JSONDecodeError: Extra data: line 1 column 2944 (char 2943)

In [95]:
GenderList

[]