# Load files

In [1]:
import pandas as pd
import numpy as np

In [2]:
import pickle

# Load the pickled survey/metrics table.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# The context manager guarantees the file handle is closed once the data is read.
with open("image_survey_metrics.pickle", "rb") as pickle_file:
    image_survey_metrics = pickle.load(pickle_file)


* Create lookup tables between user IDs and image IDs (user_id <-> image_id)

In [3]:
# Distinct (user_id, image_id) pairs present in the survey/metrics table.
imgid_uid = image_survey_metrics[['user_id', 'image_id']].drop_duplicates()

# Lookup: image_id -> user_id.
get_uid_from_imgid = imgid_uid.set_index('image_id')['user_id'].to_dict()

# Lookup: user_id -> list of image_ids belonging to that user.
get_imgids_from_uid = dict(
    (uid, list(image_ids))
    for uid, image_ids in imgid_uid.groupby('user_id')['image_id']
)

In [4]:
# Remove fully duplicated rows. inplace=True mutates the loaded frame itself, so
# every later cell sees the de-duplicated table.
image_survey_metrics.drop_duplicates(inplace=True)

In [5]:
# Row count of the survey/metrics table.
len(image_survey_metrics)

459570

In [6]:
# Show the names of all available columns.
image_survey_metrics.columns

Index([u'image_id', u'image_height', u'image_width', u'image_filter',
       u'image_posted_time', u'data_memorability', u'user_id',
       u'user_followed_by', u'user_follows', u'user_posted_photos',
       u'anp_label', u'anp_sentiment', u'emotion_score', u'emotion_label',
       u'data_amz_label', u'data_amz_label_confidence', u'face_id',
       u'face_gender', u'face_gender_confidence', u'face_age_range_high',
       u'face_age_range_low', u'face_sunglasses', u'face_beard',
       u'face_beard_confidence', u'face_mustache', u'face_mustache_confidence',
       u'face_smile', u'face_smile_confidence', u'eyeglasses',
       u'eyeglasses_confidence', u'face_emo', u'emo_confidence', u'id',
       u'gender', u'born', u'education', u'employed', u'income', u'A_2',
       u'N_1', u'P_1', u'E_1', u'A_1', u'H_1', u'M_1', u'R_1', u'M_2', u'E_2',
       u'LON', u'H_2', u'P_2', u'N_2', u'A_3', u'N_3', u'E_3', u'H_3', u'R_2',
       u'M_3', u'R_3', u'P_3', u'HAP', u'insta_user_id', u'P', u'E', u'

In [7]:
# Print every column name as a quoted, comma-terminated literal so that a subset
# can be copy-pasted straight into a column-selection list.
for column_name in image_survey_metrics.columns:
    print("'" + column_name + "',")

'image_id',
'image_height',
'image_width',
'image_filter',
'image_posted_time',
'data_memorability',
'user_id',
'user_followed_by',
'user_follows',
'user_posted_photos',
'anp_label',
'anp_sentiment',
'emotion_score',
'emotion_label',
'data_amz_label',
'data_amz_label_confidence',
'face_id',
'face_gender',
'face_gender_confidence',
'face_age_range_high',
'face_age_range_low',
'face_sunglasses',
'face_beard',
'face_beard_confidence',
'face_mustache',
'face_mustache_confidence',
'face_smile',
'face_smile_confidence',
'eyeglasses',
'eyeglasses_confidence',
'face_emo',
'emo_confidence',
'id',
'gender',
'born',
'education',
'employed',
'income',
'A_2',
'N_1',
'P_1',
'E_1',
'A_1',
'H_1',
'M_1',
'R_1',
'M_2',
'E_2',
'LON',
'H_2',
'P_2',
'N_2',
'A_3',
'N_3',
'E_3',
'H_3',
'R_2',
'M_3',
'R_3',
'P_3',
'HAP',
'insta_user_id',
'P',
'E',
'R',
'M',
'A',
'PERMA',
'N_EMO',
'P_EMO',
'imagecount',
'comment_count',
'like_count',


# Construct the final ANP matrix (for each image), with the corresponding features extracted

* Load ANP dataset

In [11]:
# Alternative (disabled): load the ANP data from a feather file instead of the pickle.
# import feather
# anp_df = feather.api.read_dataframe('data_science_case/anp.feather')

In [12]:
# Build the ANP table from the pickled survey/metrics frame: keep only the ANP
# label, its sentiment and the emotion label/score, drop repeated rows, and index
# the result by image_id (several rows may share the same image_id).
anp_df = (
    image_survey_metrics[['image_id', 'anp_label', 'anp_sentiment',
                          'emotion_score', 'emotion_label']]
    .drop_duplicates()
    .set_index("image_id")
)
# Clear the index name by assignment. `del anp_df.index.name` raises AttributeError
# on pandas versions where Index.name is a property without a deleter.
anp_df.index.name = None

In [13]:
# Preview the first 15 ANP rows (one row per image/ANP combination).
anp_df.head(15)

Unnamed: 0,anp_label,anp_sentiment,emotion_score,emotion_label
1372870097060159201_53918317,haunted_house,0.036,0.1634,fear
1372870097060159201_53918317,beautiful_hair,0.488,0.1981,amazement
1372870097060159201_53918317,fake_fur,-0.916,0.0841,sadness
1372870097060159201_53918317,wicked_witch,0.156,0.2253,joy
1372870097060159201_53918317,funny_dog,0.156,0.2859,joy
1376341630843643565_53918317,cute_kids,0.285,0.2212,joy
1376341630843643565_53918317,great_adventure,0.791,0.3061,amazement
1376341630843643565_53918317,special_delivery,0.263,0.0978,surprise
1376341630843643565_53918317,personal_trainers,0.025,0.1396,amazement
1376341630843643565_53918317,plastic_bottles,0.042,0.0837,amazement


In [14]:
# Number of rows in the ANP table.
len(anp_df)

13275

### Create features based on ANPs for each image

In [15]:
from sklearn.preprocessing import label_binarize

# One indicator column per distinct emotion label found in the data
# (presumably Plutchik's emotion categories -- TODO confirm), one row per ANP row.
classes = anp_df['emotion_label'].unique()
anp_final = pd.DataFrame(
    label_binarize(anp_df['emotion_label'], classes=classes),
    index=anp_df.index,
    columns=classes,
)

# Replace each 1 in the indicator matrix with that row's emotion score, so every
# row carries its score in the column of its own emotion and 0 elsewhere.
anp_final = anp_final.mul(anp_df['emotion_score'], axis=0)
anp_final.head(20)

Unnamed: 0,fear,amazement,sadness,joy,surprise,interest,terror,anger,trust,ecstasy,serenity,pensiveness,grief,acceptance,rage,disgust,boredom,annoyance,distraction,anticipation
1372870097060159201_53918317,0.1634,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1372870097060159201_53918317,0.0,0.1981,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1372870097060159201_53918317,0.0,0.0,0.0841,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1372870097060159201_53918317,0.0,0.0,0.0,0.2253,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1372870097060159201_53918317,0.0,0.0,0.0,0.2859,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1376341630843643565_53918317,0.0,0.0,0.0,0.2212,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1376341630843643565_53918317,0.0,0.3061,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1376341630843643565_53918317,0.0,0.0,0.0,0.0,0.0978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1376341630843643565_53918317,0.0,0.1396,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1376341630843643565_53918317,0.0,0.0837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
# Collapse to one row per image: rows sharing the same index value (image id)
# are added together column by column.
anp_final = anp_final.groupby(level=0).sum()
anp_final.head(10)

Unnamed: 0,fear,amazement,sadness,joy,surprise,interest,terror,anger,trust,ecstasy,serenity,pensiveness,grief,acceptance,rage,disgust,boredom,annoyance,distraction,anticipation
1332664773053247354_22180590,0.0,0.4581,0.1618,0.188,0.0,0.1136,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332783950418129613_2123557533,0.0,0.5045,0.1281,0.2419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332822008853103927_52590715,0.0,0.9781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1078,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332854358136052130_50853245,0.0,0.4998,0.608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332881501499202562_3807589911,0.0,0.0,0.0,0.1191,0.0,0.2126,0.0,0.0,0.2522,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332895896828082520_703978203,0.0,0.0,0.1281,0.3598,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0867,0.0,0.0,0.0,0.0,0.0
1332910087207201057_246095675,0.0,0.5217,0.0,0.2453,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1332922180442768959_203407563,0.0,0.9658,0.0,0.0893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1333048316602121055_287562303,0.0,0.3673,0.0,0.0,0.1403,0.1526,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1406,0.0,0.0,0.0,0.0,0.0
1333396217719165291_703978203,0.0,1.0129,0.0,0.0,0.2026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Add the per-image total of the ANP sentiment values as an extra feature column;
# the assignment aligns on the image-id index.
anp_final['anp_sentiment'] = anp_df.groupby(level=0)['anp_sentiment'].sum()
anp_final.head(10)

Unnamed: 0,fear,amazement,sadness,joy,surprise,interest,terror,anger,trust,ecstasy,...,pensiveness,grief,acceptance,rage,disgust,boredom,annoyance,distraction,anticipation,anp_sentiment
1332664773053247354_22180590,0.0,0.4581,0.1618,0.188,0.0,0.1136,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.584
1332783950418129613_2123557533,0.0,0.5045,0.1281,0.2419,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.568
1332822008853103927_52590715,0.0,0.9781,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.462
1332854358136052130_50853245,0.0,0.4998,0.608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.524
1332881501499202562_3807589911,0.0,0.0,0.0,0.1191,0.0,0.2126,0.0,0.0,0.2522,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.514
1332895896828082520_703978203,0.0,0.0,0.1281,0.3598,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0867,0.0,0.0,0.0,0.0,0.0,-0.232
1332910087207201057_246095675,0.0,0.5217,0.0,0.2453,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.815
1332922180442768959_203407563,0.0,0.9658,0.0,0.0893,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.062
1333048316602121055_287562303,0.0,0.3673,0.0,0.0,0.1403,0.1526,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.1406,0.0,0.0,0.0,0.0,0.0,0.545
1333396217719165291_703978203,0.0,1.0129,0.0,0.0,0.2026,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.146


In [None]:
# TODO: decide whether the feature matrix should be normalized before export.

# Export matrix

In [18]:
# Persist the per-image ANP feature matrix to disk as a pickle file.
import pickle

with open('ANP_features.pickle', 'wb') as features_file:
    pickle.dump(anp_final, features_file)

# Correlate

### Load Y variable(s)

In [24]:
# List the available columns again to pick the target (Y) variables.
image_survey_metrics.columns

Index([u'image_id', u'image_height', u'image_width', u'image_filter',
       u'image_posted_time', u'data_memorability', u'user_id',
       u'user_followed_by', u'user_follows', u'user_posted_photos',
       u'anp_label', u'anp_sentiment', u'emotion_score', u'emotion_label',
       u'data_amz_label', u'data_amz_label_confidence', u'face_id',
       u'face_gender', u'face_gender_confidence', u'face_age_range_high',
       u'face_age_range_low', u'face_sunglasses', u'face_beard',
       u'face_beard_confidence', u'face_mustache', u'face_mustache_confidence',
       u'face_smile', u'face_smile_confidence', u'eyeglasses',
       u'eyeglasses_confidence', u'face_emo', u'emo_confidence', u'id',
       u'gender', u'born', u'education', u'employed', u'income', u'A_2',
       u'N_1', u'P_1', u'E_1', u'A_1', u'H_1', u'M_1', u'R_1', u'M_2', u'E_2',
       u'LON', u'H_2', u'P_2', u'N_2', u'A_3', u'N_3', u'E_3', u'H_3', u'R_2',
       u'M_3', u'R_3', u'P_3', u'HAP', u'insta_user_id', u'P', u'E', u'

In [22]:
# Target variables per image: the user's P, E, R, M, A and overall PERMA scores plus
# the image posting time, de-duplicated and indexed by image_id.
# The stray '' entry was removed from the column list: it is not a column of the
# frame, so selecting it raises KeyError.
PERMA_df = (
    image_survey_metrics[['image_id', 'user_id',
                          'P', 'E', 'R', 'M', 'A',
                          'PERMA',
                          'image_posted_time']]
    .drop_duplicates()
    .set_index('image_id')
)
# Clear the index name by assignment. `del PERMA_df.index.name` raises AttributeError
# on pandas versions where Index.name is a property without a deleter.
PERMA_df.index.name = None
# Single-argument print calls produce the same text under Python 2 and Python 3.
print('len(PERMA_df): %d' % len(PERMA_df))
print(' ')
PERMA_df.head(15)

len(PERMA_df): 2776
 


Unnamed: 0,user_id,P,E,R,M,A,PERMA
1372870097060159201_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1376341630843643565_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1373407612938533591_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1372871586851626025_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1376342482815974929_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1373465021300229307_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1390205097753265971_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1373589994983539906_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1379287198058577014_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1
1372785727494354721_53918317,53918317.0,1,7.0,8.0,5.0,5.0,1


In [21]:
# Display the per-image ANP feature matrix (emotion sums plus summed sentiment).
anp_final

Unnamed: 0,fear,amazement,sadness,joy,surprise,interest,terror,anger,trust,ecstasy,...,pensiveness,grief,acceptance,rage,disgust,boredom,annoyance,distraction,anticipation,anp_sentiment
1332664773053247354_22180590,0.0000,0.4581,0.1618,0.1880,0.0000,0.1136,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,3.584
1332783950418129613_2123557533,0.0000,0.5045,0.1281,0.2419,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,1.568
1332822008853103927_52590715,0.0000,0.9781,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.462
1332854358136052130_50853245,0.0000,0.4998,0.6080,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,-1.524
1332881501499202562_3807589911,0.0000,0.0000,0.0000,0.1191,0.0000,0.2126,0.0000,0.0000,0.2522,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.514
1332895896828082520_703978203,0.0000,0.0000,0.1281,0.3598,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0867,0.0000,0.0000,0.0000,0.0,0.0,-0.232
1332910087207201057_246095675,0.0000,0.5217,0.0000,0.2453,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.815
1332922180442768959_203407563,0.0000,0.9658,0.0000,0.0893,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,-0.062
1333048316602121055_287562303,0.0000,0.3673,0.0000,0.0000,0.1403,0.1526,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.1406,0.0000,0.0000,0.0000,0.0,0.0,0.545
1333396217719165291_703978203,0.0000,1.0129,0.0000,0.0000,0.2026,0.0000,0.0000,0.0000,0.0000,0.0000,...,0.0,0.0000,0.0,0.0000,0.0000,0.0000,0.0000,0.0,0.0,0.146


In [None]:
# Combine the ANP features with the PERMA targets, one row per shared image id.
# Both frames keep the image id in an unnamed index rather than in a column, so
# merging with on='image_id' raises KeyError; join on the indexes instead
# (inner join, which is merge's default).
anp_final.merge(PERMA_df, left_index=True, right_index=True)

In [None]:
import seaborn as sns

In [None]:
# Pairwise correlations between the ANP features and the PERMA targets.
# Both frames keep the image id in an unnamed index rather than in a column, so
# merging with on='image_id' raises KeyError; join on the indexes instead
# (inner join, which is merge's default).
corr_matrix = anp_final.merge(PERMA_df, left_index=True, right_index=True).corr()

In [None]:
# Preview the first rows of the correlation matrix.
corr_matrix.head()