In [None]:
import os
import json
import pandas as pd
import numpy as np
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
#install profanity filter
!pip install profanity-filter

Collecting profanity-filter
  Downloading profanity_filter-1.3.3-py3-none-any.whl (45 kB)
[K     |████████████████████████████████| 45 kB 2.5 MB/s 
[?25hCollecting poetry-version<0.2.0,>=0.1.3
  Downloading poetry_version-0.1.5-py2.py3-none-any.whl (13 kB)
Collecting pydantic<2.0,>=1.3
  Downloading pydantic-1.8.2-cp37-cp37m-manylinux2014_x86_64.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 16.4 MB/s 
[?25hCollecting ordered-set<4.0,>=3.0
  Downloading ordered-set-3.1.1.tar.gz (10 kB)
Collecting redis<4.0,>=3.2
  Downloading redis-3.5.3-py2.py3-none-any.whl (72 kB)
[K     |████████████████████████████████| 72 kB 542 kB/s 
[?25hCollecting ruamel.yaml<0.16.0,>=0.15.89
  Downloading ruamel.yaml-0.15.100-cp37-cp37m-manylinux1_x86_64.whl (654 kB)
[K     |████████████████████████████████| 654 kB 32.6 MB/s 
[?25hCollecting ordered-set-stubs<0.2.0,>=0.1.3
  Downloading ordered_set_stubs-0.1.3-py2.py3-none-any.whl (4.8 kB)
Collecting tomlkit<0.6.0,>=0.4.6
  Downloading

#### Helper functions

In [None]:
def convert_jsonl_pd(filepath):
  """Load a JSON Lines file into a pandas DataFrame.

  Each non-empty line of the file is parsed as one JSON document and becomes
  one row of the result.

  Args:
    filepath: Path of the .jsonl file to read.

  Returns:
    A DataFrame built from the parsed records, in file order.

  Raises:
    json.JSONDecodeError: If a non-empty line is not valid JSON.
  """
  records = []
  # JSON text is UTF-8 by specification; do not depend on the platform default.
  with open(filepath, 'r', encoding='utf-8') as json_file:
    # Stream the file line by line instead of materialising it first.
    for line in json_file:
      line = line.strip()
      # Blank lines (e.g. a trailing newline at end of file) carry no record
      # and would make json.loads raise.
      if not line:
        continue
      records.append(json.loads(line))

  return pd.DataFrame(records)



In [None]:
import spacy
from profanity_filter import ProfanityFilter

def add_profanity(dat):
  """Add a `profanity` column to `dat`, one flag per entry of `dat.text`.

  A spaCy 'en' pipeline is loaded and the ProfanityFilter component is appended
  as its last stage; every text is then run through the pipeline and the
  document's `is_profane` extension value is recorded.

  The column is written onto the frame that was passed in, and that same
  frame is returned.
  """
  nlp = spacy.load('en')
  # Hand the already-loaded Language object to the filter so it is reused.
  profanity_filter = ProfanityFilter(nlps={'en': nlp})
  nlp.add_pipe(profanity_filter.spacy_component, last=True)

  dat['profanity'] = [nlp(caption)._.is_profane for caption in dat.text]
  return dat

#### Identify profanity in each dataset

In [None]:
# Read the training split (one JSON record per line) into a DataFrame and
# preview the first rows.
train = convert_jsonl_pd('/content/train.jsonl')
train.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [None]:
# Add the `profanity` column: the profanity filter's is_profane flag for each
# caption in `train.text`.
train = add_profanity(train)
train.head()

Unnamed: 0,id,img,label,text,profanity
0,42953,img/42953.png,0,its their character not their color that matters,False
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...,False
2,13894,img/13894.png,0,putting bows on your pet,False
3,37408,img/37408.png,0,i love everything and everybody! except for sq...,False
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h...",False


In [None]:
# Count captions by profanity flag (rows) against `label` (columns).
pd.crosstab(train['profanity'], train['label'])

label,0,1
profanity,Unnamed: 1_level_1,Unnamed: 2_level_1
False,4732,2399
True,749,620


#### Build data dictionary with keywords for


*   Racism 
*   Nationality
*   Pregnancy/abortion
*   Disability
*   Religion
*   Gender



In [None]:
# Keyword lexicon for nationality: country names, demonyms, languages and a few
# broad ethnic/regional terms, all lower-case. Stored as a set, so repeated
# entries are harmless.
#
# Entries that had been fused or comma-joined into a single string
# ("argentineargentinian", "dutchmandutchwoman", "welshmanwelshwoman",
# "english, setswana", "arabic, kurdish", "english, maltese", "urdu, english")
# are split into their individual terms; as one string they could not match
# either term in a caption.
# NOTE(review): a few spellings look like typos ("laotain", "nambian",
# "arabiv", "costrican", "ukranian", "frenchm"); they are left untouched here.
nationality = [
    "afghanistan", "albania", "algeria", "argentina", "australia",
    "austria", "bangladesh", "belgium", "bolivia", "botswana", "brazil",
    "bulgaria", "cambodia", "cameroon", "canada", "chile", "china",
    "colombia", "costa rica", "croatia", "cuba", "czech republic",
    "denmark", "dominican republic", "ecuador", "egypt", "el salvador",
    "england", "estonia", "ethiopia", "fiji", "finland", "france",
    "germany", "ghana", "greece", "guatemala", "haiti", "honduras",
    "hungary", "iceland", "india", "indonesia", "iran", "iraq", "ireland",
    "israel", "italy", "jamaica", "japan", "jordan", "kenya", "kuwait",
    "laos", "latvia", "lebanon", "libya", "lithuania", "madagascar",
    "malaysia", "mali", "malta", "mexico", "mongolia", "morocco",
    "mozambique", "namibia", "nepal", "netherlands", "new zealand",
    "nicaragua", "nigeria", "norway", "pakistan", "panama", "paraguay",
    "peru", "philippines", "poland", "portugal", "romania", "russia",
    "saudi arabia", "scotland", "senegal", "serbia", "singapore",
    "slovakia", "south africa", "south korea", "spain", "sri lanka",
    "sudan", "sweden", "switzerland", "syria", "taiwan", "tajikistan",
    "thailand", "tonga", "tunisia", "turkey", "ukraine", "united arab emirates",
    "united kingdom", "united states", "uruguay", "venezuela",
    "vietnam", "wales", "zambia", "zimbabwe", "afghan", "albanian",
    "algerian", "argentine", "argentinian", "australian", "austrian",
    "bangladeshi", "belgian", "bolivian", "batswana", "brazilian",
    "bulgarian", "cambodian", "cameroonian", "canadian", "chilean",
    "chinese", "colombian", "costa rican", "croatian", "cuban", "czech",
    "danish", "dominican", "ecuadorian", "egyptian", "salvadorian",
    "english", "estonian", "ethiopian", "fijian", "finnish", "french",
    "german", "ghanaian", "greek", "guatemalan", "haitian", "honduran",
    "hungarian", "icelandic", "indian", "indonesian", "iranian",
    "iraqi", "irish", "israeli", "italian", "jamaican", "japanese",
    "jordanian", "kenyan", "kuwaiti", "lao", "latvian", "lebanese",
    "libyan", "lithuanian", "malagasy", "malaysian", "malian", "maltese",
    "mexican", "mongolian", "moroccan", "mozambican", "namibian",
    "nepalese", "dutch", "nicaraguan", "nigerian", "norwegian", "pakistani",
    "panamanian", "paraguayan", "peruvian", "philippine", "polish",
    "portuguese", "romanian", "russian", "saudi", "scottish", "senegalese",
    "serbian", "singaporean", "slovak", "south african", "korean",
    "spanish", "sri lankan", "sudanese", "swedish", "swiss", "syrian",
    "taiwanese", "tajikistani", "thai", "tongan", "tunisian", "turkish",
    "ukrainian", "emirati", "british", "american", "uruguayan",
    "venezuelan", "vietnamese", "welsh", "zambian", "zimbabwean",
    "botswanan", "costrican", "croat", "dane", "salvadoran", "englishman", "englishwoman",
    "finn", "frenchm", "frenchwoman", "icelander", "irishman", "irishwoman",
    "laotain", "nambian", "dutchman", "dutchwoman", "new zealander",
    "filipino", "pole", "saudi", "scot", "spaniard", "swede",
    "turk", "ukranian", "brit", "american", "welshman", "welshwoman",
    "dari", "persian", "pashto", "arabic", "bengali", "flemish",
    "english", "setswana", "french",
    "amharic", "fijian", "creole", "hindi",
    "persian", "arabic", "kurdish", "irish", "hebrew", "swahili",
    "arabiv", "laotian", "malagasy", "malay", "malaysian",
    "english", "maltese", "arabic", "nepali", "maori",
    "urdu", "english", "tagalog", "filipino", "malay", "mandarin", "tamil",
    "afrikaans", "sinhala", "tamil", "arabic",
    "romansh", "tajik", "tongan",
    "welsh", "asian", 'native american', 'aussie', 'african', 'jewish', 'european', 'caucasian',
]
nationality = set(nationality)

In [None]:
# For every caption, collect the nationality keywords that occur in it as a
# substring and store them as one comma-separated string ('' when none match).
#
# `nationality` is a set, and the iteration order of a set of strings can change
# from one kernel start to the next (string hash randomisation), so the keywords
# are visited in sorted order to make the joined labels reproducible.
# NOTE(review): substring matching also fires inside longer words (for example
# "mali" or "iran" inside an unrelated word); confirm this is acceptable.
nationality_terms = sorted(nationality)
out = []
for text in train.text:
  matched = [item for item in nationality_terms if item in text]
  out.append(','.join(matched))

  

In [None]:
# Store the comma-separated keyword matches per caption ('' = no keyword found).
train['nationality'] = out

In [None]:
# Rows: True = the caption matched no nationality keyword (empty string),
# False = at least one keyword matched. Columns: `label`.
pd.crosstab(train['nationality'] == '', train['label'])

label,0,1
nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
False,328,451
True,5153,2568


### Racism

In [None]:
# Download the Wikipedia list of regional nicknames.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it were
# the article.
page = requests.get("https://en.wikipedia.org/wiki/List_of_regional_nicknames", timeout=30)
page.raise_for_status()

# Parse the downloaded HTML
soup = BeautifulSoup(page.content, 'html.parser')

# Find every <dt> element in the page (the printed result still includes the
# HTML tags)
print(soup.find_all('dt'))

[<dt>Arkansawyer</dt>, <dt>Arkie/Arky</dt>, <dt>Appler</dt>, <dt><a href="/wiki/Aussie" title="Aussie">Aussie</a></dt>, <dt>Banker (or Outer Banker)</dt>, <dt>Banana bender</dt>, <dt><a class="mw-redirect" href="/wiki/Puerto_Rican_people#Boricua" title="Puerto Rican people">Boricua</a></dt>, <dt>Bluenose, Bluenoser</dt>, <dt><a class="mw-redirect" href="/wiki/Bonacker" title="Bonacker">Bonacker</a></dt>, <dt><a class="mw-redirect" href="/wiki/Brummie" title="Brummie">Brummie</a></dt>, <dt><a class="mw-redirect" href="/wiki/Buckeye_(nickname)" title="Buckeye (nickname)">Buckeye</a></dt>, <dt>Cabbage patcher</dt>, <dt><a class="mw-redirect" href="/wiki/Cajun" title="Cajun">Cajun</a></dt>, <dt><a href="/wiki/Canuck" title="Canuck">Canuck</a></dt>, <dt>Capixaba</dt>, <dt><a href="/wiki/Carioca" title="Carioca">Carioca</a></dt>, <dt>Candango</dt>, <dt><a class="mw-redirect" href="/wiki/Catracho" title="Catracho">Catracho</a></dt>, <dt><a class="extiw" href="https://en.wiktionary.org/wiki/ch

In [None]:
# Lower-cased text content of every <dt> element on the scraped page.
out = [entry.get_text().lower() for entry in soup.find_all('dt')]

In [None]:
# Display the lower-cased <dt> texts collected from the page.
out

['arkansawyer',
 'arkie/arky',
 'appler',
 'aussie',
 'banker (or outer banker)',
 'banana bender',
 'boricua',
 'bluenose, bluenoser',
 'bonacker',
 'brummie',
 'buckeye',
 'cabbage patcher',
 'cajun',
 'canuck',
 'capixaba',
 'carioca',
 'candango',
 'catracho',
 'chamo',
 'cheesehead',
 'chilango, defeño, capitalino',
 'cockney',
 'cockroach',
 'cohee',
 'conch',
 'croweater',
 'culchie',
 'dallie or dally',
 'foolio',
 'fudgie',
 'gaúcho',
 'geordie',
 'gult, gulti, gulte',
 'hidrocálido',
 'hillbilly',
 'hoosier',
 'jackeen',
 'jafa, jafa',
 'swansea jack',
 'janner',
 'jarocho',
 'kaaskop',
 'kiwi',
 'kraut',
 'leodesian, loiner',
 'mackem',
 'madrasi',
 'michigander',
 'monkey hanger',
 'moonrakers',
 'newfie, newfie, newf',
 'knickerbocker',
 'nutmegger',
 'nipper',
 'okie',
 'ossi',
 'pikey',
 'pinoy',
 'polentone\xa0[it]',
 'pom',
 'porteño',
 'polak',
 'regio, regiomontano',
 'rhode islander',
 'saffa / saffer',
 'sandgroper',
 'sandlapper',
 'scouser',
 'sooner',
 'spud isl

In [None]:
# Keyword lexicon for race/ethnicity-related terms and regional nicknames, all
# lower-case. Stored as a set.
#
# Changes relative to the previous literal:
#   * scraped entries that still held several comma-separated nicknames in one
#     string ("bluenose, bluenoser", "chilango, defeño, capitalino",
#     "gult, gulti, gulte", "leodesian, loiner", "newfie, newfie, newf",
#     "regio, regiomontano", "taswegian, tassie") are split into individual
#     terms, as was already done for e.g. "arkie"/"arky" and "saffa"/"saffer";
#   * the copy-pasted repeated runs are removed (the set discarded them anyway).
# NOTE(review): several terms are very short ("hun", "pom", "yat", "regio",
# "newf"); with substring matching they will also fire inside unrelated words.
racism = [
    # general terms
    'black', 'canuck', 'colored', 'coloured', 'coolie', 'coon', 'cracker', 'dago',
    'eskimo', 'frog', 'gipsy', 'gook', 'goy', 'gringo', 'gypsy', 'half-breed', 'half-caste',
    'yid', 'yankee', 'yank', 'wop', 'wog', 'whitey', 'wetback', 'wasp', 'uncle tom', 'taffy',
    'squaw', 'spade', 'savage', 'redskin', 'red indian', 'pygmy', 'primitive', 'pommy', 'polack',
    'pickaninny', 'paleface', 'pakeha', 'paddy', 'oriental', 'nonwhite', 'nigger', 'negro', 'negress',
    'native', 'mulatto', 'mick', 'mammy', 'makwerekwere', 'kraut', 'kafir', 'kaffir', 'jock', 'jerry',
    'injun', 'honky', 'haole', "blacks", "abeed", "mixed", "beaner", "cholo", "buckra",
    "east asians", "banana", "ah beng", "jap",
    "gaoli bangzi", "american-born confused desi", "bong",
    "chinki", "paki", "ang mo", "šiptar", "limey", "cheesehead",
    "chukhna", "cheese-eating surrender monkeys", "hun", "grecoman",
    "fenian", "moskal", "shkije", "khokhol",
    "bulgarophiles", "rafida", "christ killer", "blackfella", "kebab",
    "didicoy", "ajam",
    'abcd', 'white', 'apache', 'bohemian', 'bugger', 'cannibal', 'cohee',
    'goth', 'gyp', 'mongol', 'philistine', 'sherpa', 'tartar', 'vandal',
    # regional nicknames
    "arkansawyer", "arkie", "arky", "appler", "aussie", "banker",
    "banana bender", "boricua", "bluenose", "bluenoser", "bonacker",
    "brummie", "buckeye", "cabbage patcher", "cajun", "capixaba",
    "carioca", "candango", "catracho", "chamo", "chilango", "defeño", "capitalino",
    "cockney", "cockroach", "conch", "croweater", "culchie",
    "dallie", "dally", "foolio", "fudgie", "gaúcho", "geordie",
    "gult", "gulti", "gulte", "hidrocálido", "hillbilly", "hoosier",
    "jackeen", "jafa", "swansea jack", "janner", "jarocho",
    "kaaskop", "kiwi", "leodesian", "loiner", "mackem", "madrasi",
    "michigander", "monkey hanger", "moonrakers", "newfie", "newf",
    "knickerbocker", "nutmegger", "nipper", "okie", "ossi", "pikey",
    "pinoy", "polentone", "pom", "porteño", "polak", "regio", "regiomontano",
    "rhode islander", "saffa", "saffer", "sandgroper", "sandlapper",
    "scouser", "sooner", "spud islander", "stubblejumper",
    "tar heel", "taswegian", "tassie", "tapatío", "terrone", "tico",
    "tripeiro", "trolls", "tuckahoe", "tyke", "wessi", "yat", "yellowbelly", "copthorne",
    "lincolnshire", "yinzer", "yooper", "zimbo", "carcamano",
    "coastie", "eurotrash", "flatlander", "gaucho", "goober",
    "guajiro", "redneck", "swamp yankee", "teuchter",
    "westie", "westy", "woollyback", "yardie", "yokel",
]
racism = set(racism)

In [None]:
# For every caption, collect the `racism` keywords that occur in it as a
# substring and store them as one comma-separated string ('' when none match).
#
# `racism` is a set, and the iteration order of a set of strings can change from
# one kernel start to the next (string hash randomisation), so the keywords are
# visited in sorted order to make the joined labels reproducible.
# NOTE(review): substring matching also fires inside longer words (for example
# "hun" or "jap" inside an unrelated word); confirm this is acceptable.
racism_terms = sorted(racism)
out = []
for text in train.text:
  matched = [item for item in racism_terms if item in text]
  out.append(','.join(matched))


In [None]:
# Store the comma-separated keyword matches per caption ('' = no keyword found).
train['racism'] = out
# Rows: True = the caption matched no `racism` keyword, False = at least one
# keyword matched. Columns: `label`.
pd.crosstab(train['racism'] == '', train['label'])

label,0,1
racism,Unnamed: 1_level_1,Unnamed: 2_level_1
False,312,584
True,5169,2435


### Religion

In [None]:
# Keyword lexicon for religion-related terms, all lower-case (kept as a list).
#
# Scraped entries that still held two comma-separated terms in one string
# ("isai, saai", "rice christian, rice bag", "cow piss drinker, piss drinker")
# are split into individual terms; as one string they only matched a caption
# containing that exact comma-joined text.
# NOTE(review): "general" looks like a section heading picked up while scraping
# rather than a religious term, and it will match ordinary uses of the word;
# confirm whether it should stay.
religion = [
    "bible beater", "bible basher", "bible thumper",
    "cafeteria christian", "chuhra", "fundie", "isai", "saai", "rice christian", "rice bag",
    "campbellite", "holy roller", "jaffa", "prod", "russellite",
    "shaker", "soup-taker", "left-footer", "fenian", "mackerel snapper",
    "mick", "papist", "red letter tribe", "redneck", "roman catholic",
    "shaveling", "taig", "mormon", "molly mormon", "jack mormon",
    "abbie", "heeb", "hymie", "ikey", "itzig", "jewboy", "kike",
    "mocky", "moch", "red sea pedestrian", "sheeny", "shylock", "yakubian",
    "yid", "zhyd", "raghead", "osama", "muzzie", "qadiani", "kadrun",
    "cow piss drinker", "piss drinker", "dothead", "malaun",
    "buddhists", "christians", "general", "catholics", "protestants",
    "hindus", "jews", "reformers", "cryptos", "muslim", 'muslims', 'mohammed',
    "ahmadis", "isma'ilis", "sufis", "shias", "sunnis", "salafis",
    "non-believers", "non-muslims", "non-jewish", "zoroastrians",
]

In [None]:
# Tag every caption with the religion keywords it contains (substring match,
# in list order); matches are comma-joined and captions without a match get ''.
out = [
  ','.join(item for item in religion if item in text)
  for text in train.text
]

In [None]:
# Store the comma-separated keyword matches per caption ('' = no keyword found).
train['religion'] = out
# Rows: True = the caption matched no religion keyword, False = at least one
# keyword matched. Columns: `label`.
pd.crosstab(train['religion'] == '', train['label'])

label,0,1
religion,Unnamed: 1_level_1,Unnamed: 2_level_1
False,68,248
True,5413,2771


### Gender

In [None]:
# Download the Wikipedia list of LGBT slang terms.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it were
# the article.
page = requests.get("https://en.wikipedia.org/wiki/List_of_LGBT_slang_terms", timeout=30)
page.raise_for_status()

# Parse the downloaded HTML
soup = BeautifulSoup(page.content, 'html.parser')

# Stripped, lower-cased text of every <li> inside every <ul> on the page.
# This takes all list items, so the result is a raw candidate list to be
# curated by hand rather than a clean set of slang terms.
out = [
  elem.get_text().strip().lower()
  for ul in soup.find_all('ul')
  for elem in ul.find_all('li')
]
print(out)

['lesbian', 'gay', 'bisexual', 'transgender', 'homosexuality', 'bisexuality\npansexuality', 'pansexuality', 'asexuality\ngray asexuality', 'gray asexuality', 'queer', 'sexual identity', 'demographics', 'biology', 'environment', 'gender identity', 'gender role', 'gender variance', 'non-binary gender', 'queer heterosexuality', 'sex and gender distinction', 'trans man', 'trans woman', 'transgender', 'transsexual', 'pansexuality', 'gray asexuality', 'timeline', 'social movements', 'gay liberation', 'stonewall riots', 'lgbt and intersex', 'coming out', 'community\nafrican-american', 'african-american', 'dyke march', 'events\nlargest events', 'largest events', 'gay village', 'homosocialization', 'media\nfilms\nnew queer cinema\nperiodicals', 'films', 'new queer cinema', 'periodicals', 'pride\nparade', 'parade', 'queer art', 'same-sex relationship', 'slang\nlist', 'list', 'symbols', 'takatāpui', 'african-american', 'largest events', 'films', 'new queer cinema', 'periodicals', 'parade', 'list'

In [None]:
# Keyword lexicon for gender / sexual-orientation terms, all lower-case (kept
# as a list).
#
# Changes relative to the previous literal:
#   * "lipstick lesbian," and "bicon:" carried stray punctuation from the
#     scraped text and therefore could not equal a caption token; the
#     punctuation is removed;
#   * "gray asexuality" and "queer" were listed twice, which made a matching
#     caption receive the keyword twice in its joined label; each now appears
#     once.
gender = [
    "bean flicker", "butch", "carpet muncher", "dyke", "diesel dyke",
    "drag dyke", "kiki", "kitty puncher", "pussy puncher", "lezzie",
    "lesbo", "leso", "les", "leb", "lipstick lesbian", "muff diver",
    "the game of flats", "anal assassin", "arse bandit", "ass bandit",
    "backgammon player", "bear", "bent", "bentshot", "bender", "bone smuggler",
    "brownie king", "brown piper", "bufter", "bufty", "booty buffer",
    "bugger", "switch hitter", "bicon", "gillette blade", "unicorn",
    "hot bi babe", "hbb", "futanari", "hermie", "cuntboy", "dickgirl",
    "egg", "enby", "lady boy", "shemale", "t-girl", "tranny", "transbian",
    "molly", "tommy", "skoliosexual", "cissy", "cishet", "chaser",
    "fetishist", "lesbian", "gay", "bisexual", "transgender", "homosexuality",
    "bisexuality", "pansexuality", "asexuality", "gray asexuality",
    "queer", "bum boy", "bum chum", "bum robber",
    "bum-driller", "bumhole engineer", "butt pirate", "butt boy",
    "butt rider", "butt pilot", "butt rustler", "chi chi man", "cockstruction",
    "cockpipe cosmonaut", "crafty butcher", "daffodil", "daffy",
    "donut puncher", "donut muncher", "faggot", "fairy", "femboy",
    "finocchio", "flamer", "flit", "flower", "friend of dorothy",
    "fruit", "fudge packer", "gaysian", "gym bunny", "homo", "light in the loafers",
    "light in the pants", "light in the fedora", "limp wristed",
    "meat masseuse", "muscle mary", "ogay", "oklahomo", "pansy",
    "payaso", "peterpuffer", "bean queen", "taco queen", "salsa queen",
    "brownie queen", "chicken queen", "grey queen", "potato queen",
    "rice queen", "ring raider", "sissy", "sod", "twink",
    "woolly", "homophobic",
]

In [None]:
# Tag every caption with the `gender` keywords it contains as whole,
# space-delimited words or phrases; matches are comma-joined ('' = no match).
#
# The caption and each keyword are padded with a space on both sides before the
# containment test. For single-word keywords this is the same test as looking
# the keyword up in text.split(' '), but it also lets multi-word keywords such
# as "bum boy" match, which a lookup among single tokens never can.
out = []
for text in train.text:
  padded_text = ' ' + text + ' '
  matched = [item for item in gender if ' ' + item + ' ' in padded_text]
  out.append(','.join(matched))
train['gender'] = out
# Rows: True = the caption matched no gender keyword, False = at least one
# keyword matched. Columns: `label`.
pd.crosstab(train['gender'] == '', train['label'])

label,0,1
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
False,52,161
True,5429,2858


In [None]:
# check % of hateful memes with specific word
#train = convert_jsonl_pd('/content/train.jsonl')
# Mean of `label` over the captions that contain the substring 'fat'.
contains_fat = train.text.apply(lambda caption: 'fat' in caption)
np.mean(train.label[contains_fat])

0.5833333333333334

### Pregnancy

In [None]:
# Keyword lexicon for pregnancy-related slang, all lower-case (kept as a list).
pregnancy = [
    "bun in the oven", "knocked up", "bat in the cave", "in the family way",
    "the rabbit died", "in the pudding club", "up the duff", "tin roof",
    "preggers", "knock up", "prego", "food baby", "preggy", "jamie lynn spears",
    "up the spout", "preggo", "chola", "pregnophile", "teen pregnancy",
    "tivo", "regnant", "mpreg", "pregnancy", "twit", "stuffed", "maiesiophilia",
    "preggie", "baby bump", "for shiz up the spout", "hermione",
    "duff", "juno", "project twins", "carlos", "sarah palin",
    "cooking up a baby", "tebo", "preg", "pregny", "heifer", "geni",
    "pregers", "pwilf", "up the pole", "selling life", "preggophile",
    "pregxy", "fupa", "mung", "pregnaphobia", "chubby-bunny", "infertile",
    "egg basketed", "internally fluffed", "pupa", "pregnot", "pregolicios",
    "catch a baby",
]

### Disability

In [None]:
# Download the glossary page of ableist words and terms.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops us from parsing an HTTP error page as if it were
# the glossary.
page = requests.get("https://www.autistichoya.com/p/ableist-words-and-terms-to-avoid.html", timeout=30)
page.raise_for_status()

# Parse the downloaded HTML
soup = BeautifulSoup(page.content, 'html.parser')

# Find every <b> element in the page (the printed result still includes the
# HTML tags)
print(soup.find_all('b'))

[<b><span style="font-size: x-large;">BEFORE YOU CONTINUE:</span></b>, <b><u>some</u></b>, <b><u>not</u> generally considered slurs, and in fact, <u>may not actually be</u> hurtful, upsetting, retraumatizing, or offensive to many disabled people</b>, <b>ableist</b>, <b>Violence in Language: Circling Back to Linguistic Ableism</b>, <b><u>Ableism is not a list of bad words.</u> Language is *one* tool of an oppressive system. Being aware of language -- for those of us who have the privilege of being able to change our language -- can help us understand how pervasive ableism is. Ableism is systematic, institutional devaluing of bodies and minds deemed deviant, abnormal, defective, subhuman, less than. <u>Ableism is *violence.*</u></b>, <b><u><br/></u></b>, <b>Glossary of Ableist Phrases</b>, <b>not </b>, <b>One important note</b>, <b><u>Generally ableist terms/phrases (some are slurs, some not)</u></b>, <b>Blind to ____ / turn a blind eye to </b>, <b>____ / blinded by ignorance/bigotry/etc

In [None]:
# Lower-cased text content of every <b> element on the scraped page.
out = [tag.get_text().lower() for tag in soup.find_all('b')]
print(out)

['before you continue:', 'some', 'not\xa0generally considered slurs, and in fact, may not actually be hurtful, upsetting, retraumatizing, or offensive to many disabled people', 'ableist', 'violence in language: circling back to linguistic ableism', 'ableism is not a list of bad words. language is *one* tool of an oppressive system. being aware of language -- for those of us who have the privilege of being able to change our language -- can help us understand how pervasive ableism is. ableism is systematic, institutional devaluing of bodies and minds deemed deviant, abnormal, defective, subhuman, less than. ableism is *violence.*', '', 'glossary of ableist phrases', 'not ', 'one important note', 'generally ableist terms/phrases (some are slurs, some not)', 'blind to ____ / turn a blind eye to\xa0', '____ / blinded by ignorance/bigotry/etc. ', 'double-blind review', 'bonkers', 'bound to a wheelchair (wheelchair bound)', 'burn victim', 'confined to a wheelchair', 'crazy\xa0', 'cripple/cri

In [None]:
# Keyword lexicon for disability-related / ableist terms, all lower-case (kept
# as a list).
disability = [
    "lame", "dumb", "retarded", "blind", "deaf", "imbecile",
    "psycho", "spaz", "barren", "cretin", "cripple", "crippled",
    "daft", "deaf-mute", "derp", "diffability", "differently abled",
    "feeble-minded", "handicap", "handicapable", "harelip", "hearing-impaired",
    "loony", "loony bin", "lunatic", "madhouse", "madman",
    "maniac", "mental", "mental case", "mental defective", "mongoloid",
    "moron", "moronic", "psychopathic", "psychotic", "short-bus",
    "simpleton", "spazzed", "specially abled", "special needs", "wacko",
    "whacko", "stupor", "blind to", "blind eye", "bigotry",
    "bonkers", "bound to a wheelchair", "wheelchair bound", "burn victim",
    "confined to a wheelchair", "cuckoo", "deaf to", "deaf ear",
    "deformed", "deformity", "deranged", "herp-derp", "der", "durr",
    "duh", "doy", "different abilities", "handicapped", "hermaphrodite",
    "autism", "nuthouse", "midget", "morbidly obese", "obese", "mouth breather",
    "nutcase", "nutter", "libtard", "fucktard", "albino", "autistic",
    "bipolar", "borderline", "deluded", "delusional", "freaky", "impaired", "impairment", "manic",
    "multiple personalities", "narcissistic", "ocd", "phobic",
    "islamophobic", "schizo", "schizophrenic", "the wheelchair",
    "claustrophobic", "anorexic", "spastic", "invalid",
]

In [None]:
# Tag every caption with the `disability` keywords it contains as whole,
# space-delimited words or phrases; matches are comma-joined ('' = no match).
#
# The caption and each keyword are padded with a space on both sides before the
# containment test. For single-word keywords this is the same test as looking
# the keyword up in text.split(' '), but it also lets multi-word keywords such
# as "special needs" match, which a lookup among single tokens never can.
out = []
for text in train.text:
  padded_text = ' ' + text + ' '
  matched = [item for item in disability if ' ' + item + ' ' in padded_text]
  out.append(','.join(matched))
train['disability'] = out
# Rows: True = the caption matched no disability keyword, False = at least one
# keyword matched. Columns: `label`.
pd.crosstab(train['disability'] == '', train['label'])

label,0,1
disability,Unnamed: 1_level_1,Unnamed: 2_level_1
False,39,87
True,5442,2932


In [None]:
# Frequency of each distinct comma-joined match string ('' = no keyword matched).
# NOTE(review): the saved output of this cell lists terms such as 'stupid',
# 'mad', 'idiot' and 'nuts' that are not in the `disability` list defined in
# this notebook, so the output appears to come from a different version of the
# list; re-run from a fresh kernel to confirm.
train.disability.value_counts()

                     8244
stupid                 85
mental                 31
retarded               25
mad                    23
blind                  20
dumb                   11
idiot                  11
autistic                9
nuts                    8
deaf                    6
midget                  6
spastic                 4
autism                  4
handicap                2
mental,moron            1
deformity               1
delusional              1
mental,delusional       1
maniac                  1
dumb,stupid             1
moron                   1
idiotic                 1
depressed               1
freak                   1
invalid                 1
Name: disability, dtype: int64

In [None]:
other