In [1]:
import os
import json
import pandas as pd
import numpy as np

#### Helper functions

In [1]:
def convert_jsonl_pd(filepath):
  """Read a JSON Lines file into a pandas DataFrame.

  Every non-blank line of the file is parsed as one JSON document and
  becomes one row of the result.

  Args:
    filepath: Path to the JSON Lines (``.jsonl``) file.

  Returns:
    A ``pd.DataFrame`` with one row per parsed line (empty if the file
    contains no records).

  Raises:
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
  with open(filepath, 'r', encoding='utf-8') as json_file:
    # Skip blank / whitespace-only lines (e.g. an empty line at the end of
    # the file); json.loads would otherwise raise on them.
    records = [json.loads(line) for line in json_file if line.strip()]

  return pd.DataFrame(records)



In [28]:
import spacy
from profanity_filter import ProfanityFilter

def add_profanity(dat):
  """Flag each row's text as profane or not using ``profanity_filter``.

  Args:
    dat: DataFrame with a ``text`` column; each value is run through the
      spaCy pipeline.

  Returns:
    A new DataFrame with the columns of ``dat`` plus a ``profanity`` column
    holding ``doc._.is_profane`` for each row. ``dat`` itself is left
    unmodified, so re-running a cell that calls this function is idempotent.
  """
  # NOTE(review): the 'en' shortcut name presumably requires spaCy 2.x — confirm
  # against the installed version.
  nlp = spacy.load('en')
  profanity_filter = ProfanityFilter(nlps={'en': nlp})  # reuse spacy Language (optional)
  # Append the filter as the last pipeline step; `_.is_profane` is read from
  # each processed Doc below.
  nlp.add_pipe(profanity_filter.spacy_component, last=True)

  flags = [nlp(text)._.is_profane for text in dat.text]

  # assign() returns a new frame instead of writing the column into the
  # caller's DataFrame.
  return dat.assign(profanity=flags)

#### Identify profanity in each dataset

In [29]:
# Load '/content/dev_seen.jsonl' into a DataFrame and preview the first rows.
dev_seen = convert_jsonl_pd('/content/dev_seen.jsonl')
dev_seen.head()

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime


In [30]:
# Add a `profanity` indicator column (one flag per row of text) and preview the result.
dev_seen1 = add_profanity(dev_seen)
dev_seen1.head()

Unnamed: 0,id,img,label,text,profanity
0,8291,img/08291.png,1,white people is this a shooting range,False
1,46971,img/46971.png,1,bravery at its finest,False
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...,False
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...,False
4,80243,img/80243.png,1,mississippi wind chime,False


In [31]:
# Count rows for each combination of profanity flag (rows) and label (columns).
pd.crosstab(dev_seen1['profanity'], dev_seen1['label'])

label,0,1
profanity,Unnamed: 1_level_1,Unnamed: 2_level_1
False,237,217
True,16,30


#### Build a data dictionary of keywords for each of the following categories:


*   Racism 
*   Nationality
*   Pregnancy/abortion
*   Disability
*   Religion



In [34]:
# Among training memes whose text contains the word 'asian', compute the mean
# of `label` (i.e. the fraction of those memes with label 1).
train = convert_jsonl_pd('/content/train.jsonl')
has_word = train.text.map(lambda caption: 'asian' in caption)
train.loc[has_word, 'label'].mean()

0.7560975609756098

In [35]:
# Keyword set for the nationality category.
# NOTE(review): several entries ('black', 'white', 'jewish', 'jew') look like
# race / religion terms rather than nationalities — confirm the intended grouping.
nationality = {'asian', 'native american', 'aussie', 'indian', 'black', 'african', 'caucasian', 'white', 'jewish', 'jew', 'european', 'mexican'}

In [9]:
# Additional keywords kept in a single, uncategorised set.
# NOTE(review): presumably these are to be split into the religion / pregnancy /
# disability groups named in this section's heading — confirm.
keywords = {'muslim', 'mohammed', 'sex', 'gay', 'shooter', 'straight', 'diaper', 'blind', 'baby', 'israel', 'lesbian', 'pregnant'}