In [1]:
import os
import json
import pandas as pd
import numpy as np
from google.colab import drive
# Mount Google Drive into the Colab runtime at /content/gdrive.
# NOTE(review): the data files loaded in the visible cells of this notebook are
# read from /content/ directly, not from the mounted drive — confirm the mount
# is still needed.
drive.mount("/content/gdrive")

In [2]:
#install profanity filter
!pip install profanity-filter

Collecting profanity-filter
  Downloading profanity_filter-1.3.3-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.7 MB/s 
Collecting pydantic<2.0,>=1.3
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 4.5 MB/s 
[?25hCollecting poetry-version<0.2.0,>=0.1.3
  Downloading poetry_version-0.1.5-py2.py3-none-any.whl (13 kB)
Collecting ordered-set<4.0,>=3.0
  Downloading ordered-set-3.1.1.tar.gz (10 kB)
Collecting ordered-set-stubs<0.2.0,>=0.1.3
  Downloading ordered_set_stubs-0.1.3-py2.py3-none-any.whl (4.8 kB)
Collecting ruamel.yaml<0.16.0,>=0.15.89
  Downloading ruamel.yaml-0.15.100-cp37-cp37m-manylinux1_x86_64.whl (654 kB)
[K     |████████████████████████████████| 654 kB 49.8 MB/s 
Collecting redis<4.0,>=3.2
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 483 kB/s 
Collecting tomlkit<0.6.0,>=0.4.6
  Downloading tomlkit-0.5.11-py2

#### Helper functions

In [27]:
def convert_jsonl_pd(filepath):
  """Read a JSON Lines file into a pandas DataFrame.

  Each non-blank line of the file is parsed as one JSON document and becomes
  one row; the keys of the parsed objects become the columns.

  Args:
    filepath: Path to the JSON Lines file.

  Returns:
    A ``pd.DataFrame`` with one row per parsed line, in file order. A file
    with no non-blank lines yields an empty DataFrame.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  # JSON text is UTF-8; do not depend on the platform's default encoding.
  with open(filepath, 'r', encoding='utf-8') as json_file:
    # Skip blank lines (e.g. an extra newline at the end of the file) instead
    # of handing them to json.loads, which raises on empty input.
    records = [json.loads(line) for line in json_file if line.strip()]

  return pd.DataFrame(records)



In [28]:
import spacy
from profanity_filter import ProfanityFilter

def add_profanity(dat):
  """Add a ``profanity`` column flagging rows whose ``text`` is profane.

  A spaCy pipeline is loaded on every call, the profanity filter's spaCy
  component is appended as its last stage, and each value of ``dat.text`` is
  run through it. The per-document ``is_profane`` extension attribute is
  collected into the new column.

  Args:
    dat: DataFrame with a ``text`` column.

  Returns:
    The same ``dat`` object that was passed in. The ``profanity`` column is
    written onto it in place, so the caller's frame is modified too.
  """
  pipeline = spacy.load('en')
  # Hand the already-loaded Language object to the filter so it is reused
  # (optional, per the original author's note).
  pf = ProfanityFilter(nlps={'en': pipeline})
  pipeline.add_pipe(pf.spacy_component, last=True)

  flags = [pipeline(text)._.is_profane for text in dat.text]

  dat['profanity'] = flags
  return dat

#### Identify profanity in each dataset

In [29]:
# Load the dev_seen split from its JSON Lines file and preview the first rows.
dev_seen = convert_jsonl_pd('/content/dev_seen.jsonl')
dev_seen.head()

Unnamed: 0,id,img,label,text
0,8291,img/08291.png,1,white people is this a shooting range
1,46971,img/46971.png,1,bravery at its finest
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...
4,80243,img/80243.png,1,mississippi wind chime


In [30]:
# Add a `profanity` indicator column for each row's text.
# Note: add_profanity writes the column onto the frame it is given and returns
# that same frame, so dev_seen1 and dev_seen refer to one object.
dev_seen1 = add_profanity(dev_seen)
dev_seen1.head()

Unnamed: 0,id,img,label,text,profanity
0,8291,img/08291.png,1,white people is this a shooting range,False
1,46971,img/46971.png,1,bravery at its finest,False
2,3745,img/03745.png,1,your order comes to $37.50 and your white priv...,False
3,83745,img/83745.png,1,it is time.. to send these parasites back to t...,False
4,80243,img/80243.png,1,mississippi wind chime,False


In [31]:
# Contingency table for dev_seen1: rows are the profanity flag, columns are the
# label, and each cell is the number of rows with that combination.
pd.crosstab(dev_seen1['profanity'], dev_seen1['label'])

label,0,1
profanity,Unnamed: 1_level_1,Unnamed: 2_level_1
False,237,217
True,16,30


#### Build a data dictionary of keywords for the following categories:


*   Racism 
*   Nationality
*   Pregnancy/abortion
*   Disability
*   Religion



In [34]:
# Among training memes whose text contains a given word, compute the mean of
# `label` (i.e. the share of those memes labelled 1, which the original note
# calls "hateful").
# NOTE(review): this is a plain substring test, so 'asian' also matches inside
# longer words such as 'caucasian' — confirm that is intended.
train = convert_jsonl_pd('/content/train.jsonl')
contains_word = train.text.apply(lambda caption: 'asian' in caption)
np.mean(train.label[contains_word])

0.7560975609756098

In [35]:
# Lower-case group terms to look for in meme text.
# NOTE(review): despite the name, the set mixes nationality, race/ethnicity and
# religion terms — confirm the intended category.
nationality = {
  'asian',
  'native american',
  'aussie',
  'indian',
  'black',
  'african',
  'caucasian',
  'white',
  'jewish',
  'jew',
  'european',
  'mexican',
}

In [9]:
# Additional lower-case keywords to look for in meme text.
keywords = {
  'muslim',
  'mohammed',
  'sex',
  'gay',
  'shooter',
  'straight',
  'diaper',
  'blind',
  'baby',
  'israel',
  'lesbian',
  'pregnant',
}