In [6]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import json

To do:
- FaceNet would be good to look at whether an entity is male or female in different label types (+ race and other attributes)
- Link to a KG - though unfortunately entity links are to the paid Google KG
- NER the ents

To update:
- Finish faces analysis
- Different analysis of fundamental features

# Data

In [3]:
# Subtask 2a annotations. The labelled splits are tagged with the subset they
# came from; the unlabeled dev split is loaded but not tagged or combined.
anno_subtask2a_train = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\train.json'
).assign(subset='train')
anno_subtask2a_val = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\validation.json'
).assign(subset='val')
anno_subtask2a_dev = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2a\\dev_unlabeled.json'
)

# Train + validation stacked row-wise (original row indices are kept).
anno_subtask2a_combined = pd.concat([anno_subtask2a_train, anno_subtask2a_val])

# Subtask 2b annotations, handled the same way.
anno_subtask2b_train = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\train.json'
).assign(subset='train')
anno_subtask2b_val = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\val.json'
).assign(subset='val')
anno_subtask2b_dev = pd.read_json(
    'X:\\PhD\\SemEval Task4\\Data\\annotations\\data\\subtask2b\\dev_unlabeled.json'
)

anno_subtask2b_combined = pd.concat([anno_subtask2b_train, anno_subtask2b_val])

In [4]:
# Index every file found (recursively) under the subtask 2a image directory.
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images'
images = [
    os.path.join(dirpath, filename)
    for (dirpath, _, filenames) in os.walk(path)
    for filename in filenames
]
# The file name is the join key against the annotations' 'image' column.
# os.path.basename is used instead of splitting on a hard-coded '\\' so the
# key is extracted correctly whatever path separator the platform uses.
images_df = pd.DataFrame({
    'filepath': images,
    'image': [os.path.basename(filepath) for filepath in images],
})

# Default (inner) merge: only annotations with a matching image file remain.
subtask2a = pd.merge(anno_subtask2a_combined, images_df, on='image')

In [5]:
# Index every file found (recursively) under the subtask 2b image directory.
path = r'X:\PhD\SemEval Task4\Data\subtask2b_images'
images = [
    os.path.join(dirpath, filename)
    for (dirpath, _, filenames) in os.walk(path)
    for filename in filenames
]
# The file name is the join key against the annotations' 'image' column.
# os.path.basename is used instead of splitting on a hard-coded '\\' so the
# key is extracted correctly whatever path separator the platform uses.
images_df = pd.DataFrame({
    'filepath': images,
    'image': [os.path.basename(filepath) for filepath in images],
})

# Default (inner) merge: only annotations with a matching image file remain.
subtask2b = pd.merge(anno_subtask2b_combined, images_df, on='image')

# has a nan text field, so replace it with a single-space string
# (note: the fill value is ' ', not the empty string)
subtask2b['text'] = subtask2b['text'].fillna(' ')
subtask2b

Unnamed: 0,id,text,image,label,subset,filepath
0,35807,DONALD TRUMP: BARACK\nOBAMA AND JOE BIDEN\nWIL...,prop_meme_6570.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
1,30562,00\n10% FOR\nTHE BIG GUY\nNANCY'S\nCUT\n@ImMem...,prop_meme_8346.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
2,44163,"To much political posting online\nthese days, ...",prop_meme_24378.png,non_propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
3,24224,I DON'T THINK\nYOU UNDERSTOOD\nWHAT I SAID.\nY...,prop_meme_2594.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
4,31611,ⒸSergey Mihailicenko/Anadolu Agency via Getty ...,prop_meme_7654.png,propagandistic,train,X:\PhD\SemEval Task4\Data\subtask2b_images\tra...
...,...,...,...,...,...,...
1345,44900,197\nNazi ain't got no humanity\nThey're the f...,prop_meme_19869.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1346,12635,HANG ONTHAVE\nA MEME\nFOR THIS\n,prop_meme_641.png,non_propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1347,12740,"HE GAVE HIS BLOOD, SWEAT AND TEARS\nFOR THE AM...",prop_meme_746.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...
1348,46086,BUT I WANTED NORTH\nKOREA TO NUKE US\nTO MAKE ...,prop_meme_18775.png,propagandistic,val,X:\PhD\SemEval Task4\Data\subtask2b_images\val...


# Entity Analysis

In [65]:
# Saved Google Vision responses: face detection and web entities.
path_f = r'X:\PhD\SemEval Task4\Code\GoogleVision\vision_face_detect.json'
path_w = r'X:\PhD\SemEval Task4\Code\GoogleVision\web_entities.json'

# Parse each file into plain Python objects; the handles are closed on exit.
with open(path_f) as faces_fh:
    faces = json.load(faces_fh)

with open(path_w) as web_fh:
    web_ents = json.load(web_fh)

The response files from Google Vision are not well structured, so the best approach is to explode the columns of interest into a dataframe rather than deal with mismatched files. Setting the image ID as the index also helps with this. There is probably a better method....

The entities are extracted for all images (Subtask2a and Subtask2b) so we also want to join the labels from those subtasks.

In [103]:
# function for reshaping the JSON file into something more sensible to work with
# col is a key in the Response: e.g., webEntities, fullMatchingImages, etc. (check the json)

def explode_frame(json_file, col):
    """Flatten one list-valued field of the responses into a row-per-item frame.

    Parameters
    ----------
    json_file
        Parsed JSON records accepted by ``pd.json_normalize``. Once normalized
        they must provide an ``'Image ID'`` column and a ``'Response.<col>'``
        column.
    col : str
        Key inside ``Response`` to expand (e.g. ``'webEntities'``).

    Returns
    -------
    pandas.DataFrame
        One row per element of ``Response.<col>``, with the element's fields
        as columns and the originating image ID as the index (repeated when
        an image has several elements).
    """
    by_image = pd.json_normalize(json_file).set_index('Image ID')
    # One entry per list element; the image ID index is repeated accordingly.
    items = by_image['Response.' + col].explode()
    # json_normalize builds a fresh frame, so put the image IDs back as index.
    return pd.json_normalize(items).set_index(items.index)

## Web Entities

In [99]:
# Rebinds `web_ents` from the parsed JSON to the flattened per-entity frame,
# so the JSON has to be reloaded before this cell can be run again.
web_ents = explode_frame(json_file=web_ents, col='webEntities')

In [109]:
# Stack the 2a and 2b label columns into one lookup keyed by image name
# (each row has only the label column of the subtask it came from), then
# attach it to the web entities, whose index holds the image ID.
labels = pd.concat(
    [subtask2a[['image', 'labels']], subtask2b[['image', 'label']]]
).rename(columns={'labels': '2a_label', 'label': '2b_label'})
web_ents = pd.merge(web_ents, labels, left_on=web_ents.index, right_on='image')

### Subtask2b
Start with the easiest....

In [126]:
# Keep only rows that carry a subtask 2b label, then count non-null values
# per entity description and show the 50 descriptions with most entity IDs.
web_ents2b = web_ents.dropna(subset=['2b_label'])
(
    web_ents2b
    .groupby('description')
    .count()
    .sort_values('entityId', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,entityId,score,id,image,2a_label,2b_label
description,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Text,813,813,813,813,0,813
Photo caption,534,534,534,534,0,534
Meter,477,477,477,477,0,477
Font,380,380,380,380,0,380
Image,359,359,359,359,0,359
Meme,342,342,342,342,0,342
Politics,251,251,251,251,0,251
Internet meme,222,222,222,222,0,222
funny,218,218,218,218,0,218
Funny meme,214,214,214,214,0,214


In [133]:
# Same per-description counts, restricted to the non-propagandistic rows.
is_non_prop = web_ents2b['2b_label'].eq('non_propagandistic')
(
    web_ents2b[is_non_prop]
    .groupby(['description', '2b_label'])
    .count()
    .sort_values('entityId', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,entityId,score,id,image,2a_label
description,2b_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Text,non_propagandistic,164,164,164,164,0
Meme,non_propagandistic,122,122,122,122,0
Image,non_propagandistic,121,121,121,121,0
funny,non_propagandistic,102,102,102,102,0
Funny meme,non_propagandistic,100,100,100,100,0
Meter,non_propagandistic,92,92,92,92,0
Font,non_propagandistic,91,91,91,91,0
Internet meme,non_propagandistic,88,88,88,88,0
Product,non_propagandistic,66,66,66,66,0
Brand,non_propagandistic,62,62,62,62,0


In [134]:
# Same per-description counts, restricted to the propagandistic rows.
is_prop = web_ents2b['2b_label'].eq('propagandistic')
(
    web_ents2b[is_prop]
    .groupby(['description', '2b_label'])
    .count()
    .sort_values('entityId', ascending=False)
    .head(50)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,entityId,score,id,image,2a_label
description,2b_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Text,propagandistic,649,649,649,649,0
Photo caption,propagandistic,485,485,485,485,0
Meter,propagandistic,385,385,385,385,0
Font,propagandistic,289,289,289,289,0
Image,propagandistic,238,238,238,238,0
Politics,propagandistic,227,227,227,227,0
Meme,propagandistic,220,220,220,220,0
Human,propagandistic,173,172,173,173,0
United States,propagandistic,170,170,170,170,0
US President,propagandistic,160,160,160,160,0


### Subtask2a
Since we probably want to look at the different top entities appearing per label, we'll do the largest counts per label in a loop.

In [136]:
# Rows that have a subtask 2a label list, expanded to one row per label.
web_ents2a = web_ents.dropna(subset=['2a_label']).explode('2a_label')

In [157]:
# Distinct 2a labels. explode() leaves NaN where a label list was empty, so
# drop those first; otherwise the loop prints an empty table headed "nan"
# (NaN never compares equal, so the filter below matches no rows).
labels = web_ents2a['2a_label'].dropna().unique()

for label in labels:
    print(f'Top entities for "{label}"')
    label_rows = web_ents2a[web_ents2a['2a_label'] == label]
    # Non-null counts per description, 20 largest by entityId count.
    print(label_rows.groupby(['description']).count().sort_values(['entityId'], ascending=False).head(20))
    print('\n---------')

Top entities for "Black-and-white Fallacy/Dictatorship"
                entityId  score   id  image  2a_label  2b_label
description                                                    
Text                 444    444  444    444       444         0
Photo caption        305    305  305    305       305         0
Meter                269    269  269    269       269         0
Font                 233    233  233    233       233         0
Meme                 150    150  150    150       150         0
Image                129    129  129    129       129         0
Human                128    127  128    128       128         0
Brand                118    118  118    118       118         0
Human behavior       115    115  115    115       115         0
Politics             113    113  113    113       113         0
Product              104    104  104    104       104         0
United States         95     95   95     95        95         0
US President          91     91   91     91     

## Faces

Info from API response:

CLASS LIKELIHOOD
Values: UNKNOWN (0): Unknown likelihood. VERY_UNLIKELY (1): It is very unlikely. UNLIKELY (2): It is unlikely. POSSIBLE (3): It is possible. LIKELY (4): It is likely. VERY_LIKELY (5): It is very likely.

See more: https://cloud.google.com/python/docs/reference/vision/latest/google.cloud.vision_v1.types.FaceAnnotation

Probably for now, most interested in the headwear annotations and emotions.

In [182]:
# The face response is exploded directly from the 'Response' column (there is
# no 'Response.<col>' sub-key), so explode_frame does not apply here; the same
# steps are done inline instead.
face_rows = pd.json_normalize(faces).set_index('Image ID')['Response'].explode()
# One row per exploded element, indexed by the image ID it came from.
df = pd.json_normalize(face_rows).set_index(face_rows.index)

headwear = df[['headwear_likelihood']]
emotion_cols = ['joy_likelihood', 'sorrow_likelihood', 'anger_likelihood', 'surprise_likelihood']
emotions = df[emotion_cols]

In [184]:
# Rebuild the image -> label lookup (2a and 2b stacked) and attach it to the
# face-derived frames, whose index holds the image ID.
labels = pd.concat(
    [subtask2a[['image', 'labels']], subtask2b[['image', 'label']]]
).rename(columns={'labels': '2a_label', 'label': '2b_label'})
headwear = pd.merge(headwear, labels, left_on=headwear.index, right_on='image')
emotions = pd.merge(emotions, labels, left_on=emotions.index, right_on='image')

In [194]:
# Rows whose anger likelihood is above 2 (UNLIKELY), i.e. POSSIBLE or higher.
emotions.loc[emotions['anger_likelihood'].gt(2.0)]

Unnamed: 0,joy_likelihood,sorrow_likelihood,anger_likelihood,surprise_likelihood,image,2a_label,2b_label
512,1.0,1.0,4.0,1.0,prop_meme_12478.png,[Black-and-white Fallacy/Dictatorship],
513,1.0,1.0,4.0,1.0,prop_meme_12478.png,[Black-and-white Fallacy/Dictatorship],
514,1.0,1.0,4.0,1.0,prop_meme_12478.png,[Black-and-white Fallacy/Dictatorship],
515,1.0,1.0,4.0,1.0,prop_meme_12478.png,[Black-and-white Fallacy/Dictatorship],
986,1.0,1.0,4.0,3.0,prop_meme_16753.png,"[Loaded Language, Transfer, Smears]",
...,...,...,...,...,...,...,...
18995,1.0,2.0,3.0,1.0,prop_meme_9814.png,"[Presenting Irrelevant Data (Red Herring), Sme...",
19138,1.0,1.0,3.0,4.0,prop_meme_12951.png,,propagandistic
19140,1.0,1.0,3.0,1.0,prop_meme_12967.png,,propagandistic
19774,1.0,1.0,4.0,1.0,prop_meme_3648.png,,propagandistic


In [195]:
# stopped here, will update later