In [None]:
# Download the word embeddings from https://github.com/tolga-b/debiaswe
# and place them under ./embeddings/ so the loading cells below can find them:
#   embeddings/GoogleNews-vectors-negative300-hard-debiased.bin
#   embeddings/GoogleNews-vectors-negative300.bin

In [7]:
from __future__ import print_function, division
%matplotlib inline
from matplotlib import pyplot as plt
import json
import random
import numpy as np

import debiaswe as dwe
from debiaswe.we import WordEmbedding
from debiaswe.data import load_professions

In [5]:
# load google news word2vec
# Reads the binary embedding file from ./embeddings/ into a WordEmbedding.
# The loader output below reports 3,000,000 words of dimension 300, so this
# is a large file; the resulting E_gnews is reused by the later cells.
E_gnews = WordEmbedding("./embeddings/GoogleNews-vectors-negative300.bin")

*** Reading data from ./embeddings/GoogleNews-vectors-negative300.bin
(3000000, 300)
3000000 words of dimension 300 : </s>, in, for, that, ..., Bim_Skala_Bim, Mezze_Cafe, pulverizes_boulders, snowcapped_Caucasus
3000000 words of dimension 300 : </s>, in, for, that, ..., Bim_Skala_Bim, Mezze_Cafe, pulverizes_boulders, snowcapped_Caucasus
3000000 words of dimension 300 : </s>, in, for, that, ..., Bim_Skala_Bim, Mezze_Cafe, pulverizes_boulders, snowcapped_Caucasus


In [10]:
# load professions
# Per the format printed by the loader, each entry starts with the word,
# followed by a definitional and a stereotypical gender score, both on a
# scale from -1.0 (female) to 1.0 (male).
professions = load_professions()
# Keep only the word, i.e. the first field of every entry.
profession_words = [entry[0] for entry in professions]

Loaded professions
Format:
word,
definitional female -1.0 -> definitional male 1.0
stereotypical female -1.0 -> stereotypical male 1.0


In [12]:
# Display the profession words (first field of each loaded entry) as a
# visual sanity check of what was loaded.
profession_words

[u'accountant',
 u'acquaintance',
 u'actor',
 u'actress',
 u'adjunct_professor',
 u'administrator',
 u'adventurer',
 u'advocate',
 u'aide',
 u'alderman',
 u'alter_ego',
 u'ambassador',
 u'analyst',
 u'anthropologist',
 u'archaeologist',
 u'archbishop',
 u'architect',
 u'artist',
 u'artiste',
 u'assassin',
 u'assistant_professor',
 u'associate_dean',
 u'associate_professor',
 u'astronaut',
 u'astronomer',
 u'athlete',
 u'athletic_director',
 u'attorney',
 u'author',
 u'baker',
 u'ballerina',
 u'ballplayer',
 u'banker',
 u'barber',
 u'baron',
 u'barrister',
 u'bartender',
 u'biologist',
 u'bishop',
 u'bodyguard',
 u'bookkeeper',
 u'boss',
 u'boxer',
 u'broadcaster',
 u'broker',
 u'bureaucrat',
 u'businessman',
 u'businesswoman',
 u'butcher',
 u'butler',
 u'cab_driver',
 u'cabbie',
 u'cameraman',
 u'campaigner',
 u'captain',
 u'cardiologist',
 u'caretaker',
 u'carpenter',
 u'cartoonist',
 u'cellist',
 u'chancellor',
 u'chaplain',
 u'character',
 u'chef',
 u'chemist',
 u'choreographer',
 u

In [13]:
# gender direction
# Built from the word pair ('she', 'he') via WordEmbedding.diff.
# NOTE(review): presumably the normalized difference vec('she') - vec('he');
# confirm against debiaswe.we. Later cells project words onto this vector.
v_gender = E_gnews.diff('she', 'he')

In [8]:
# analogies gender
# NOTE(review): this cell is repeated verbatim in the next cell, and the saved
# output below reports "Reading data from w2v_paper.txt", which does not match
# the path used here -- the output looks stale. Consider keeping only one copy.
# Load the embedding that the analogy search runs on.
E = WordEmbedding('./embeddings/w2v_paper.txt')

# Analogy candidates along the gender direction v_gender.
a_gender = E.best_analogies_dist_thresh(v_gender)

# Each result is unpacked as a 3-tuple; only the first two items (the word
# pair) are printed, the third is ignored.
for (a,b,c) in a_gender:
    print(a+"-"+b)

*** Reading data from w2v_paper.txt
(26379, 300)
(26379, 'words of dimension', 300, ':', u'in, for, that, is, ..., indelible, electricians, flag_icon_below, foolishly')
Computing neighbors
('Mean:', 10.219492778346412)
('Median:', 7.0)
she-he
herself-himself
her-his
woman-man
daughter-son
businesswoman-businessman
girl-boy
actress-actor
chairwoman-chairman
heroine-hero
mother-father
spokeswoman-spokesman
sister-brother
girls-boys
sisters-brothers
queen-king
niece-nephew
councilwoman-councilman
motherhood-fatherhood
women-men
petite-lanky
ovarian_cancer-prostate_cancer
schoolgirl-schoolboy
granddaughter-grandson
aunt-uncle
matriarch-patriarch
twin_sister-twin_brother
mom-dad
Mary-John
lesbian-gay
husband-younger_brother
gal-dude
lady-gentleman
sorority-fraternity
mothers-fathers
grandmother-grandfather
blouse-shirt
soprano-baritone
queens-kings
daughters-sons
grandma-grandpa
volleyball-football
diva-superstar
mommy-kid
hairdresser-barber
softball-baseball
goddess-god
waitress-waiter
pri

In [14]:
# analogies gender
# Load the embedding that the analogy search runs on; `E` is also used by the
# racial-analogies cell further down.
E = WordEmbedding('./embeddings/w2v_paper.txt')

# Analogy candidates along the gender direction.
a_gender = E.best_analogies_dist_thresh(v_gender)

# Every result has three items; print only the word pair as "first-second".
for first, second, _rest in a_gender:
    print(first + "-" + second)

*** Reading data from ./embeddings/w2v_paper.txt
(26379, 300)
26379 words of dimension 300 : in, for, that, is, ..., indelible, electricians, flag_icon_below, foolishly
Computing neighbors
Mean: 10.2194927783
Median: 7.0
she-he
herself-himself
her-his
woman-man
daughter-son
businesswoman-businessman
girl-boy
actress-actor
chairwoman-chairman
heroine-hero
mother-father
spokeswoman-spokesman
sister-brother
girls-boys
sisters-brothers
queen-king
niece-nephew
councilwoman-councilman
motherhood-fatherhood
women-men
petite-lanky
ovarian_cancer-prostate_cancer
schoolgirl-schoolboy
granddaughter-grandson
aunt-uncle
matriarch-patriarch
twin_sister-twin_brother
mom-dad
Mary-John
lesbian-gay
husband-younger_brother
gal-dude
lady-gentleman
sorority-fraternity
mothers-fathers
grandmother-grandfather
blouse-shirt
soprano-baritone
queens-kings
daughters-sons
grandma-grandpa
volleyball-football
diva-superstar
mommy-kid
hairdresser-barber
softball-baseball
goddess-god
waitress-waiter
princess-prince
fi

In [16]:
# profession analysis gender
# Score each profession word by the dot product of its vector with v_gender,
# then order the (score, word) pairs from lowest to highest score.
sp = [(E_gnews.v(word).dot(v_gender), word) for word in profession_words]
sp.sort()

# Show both extremes: the 20 lowest and the 20 highest projections.
(sp[:20], sp[-20:])

([(-0.23798448, u'maestro'),
  (-0.21665449, u'statesman'),
  (-0.2075869, u'skipper'),
  (-0.20267184, u'protege'),
  (-0.20206775, u'businessman'),
  (-0.1949237, u'sportsman'),
  (-0.18836346, u'philosopher'),
  (-0.18073653, u'marksman'),
  (-0.17289846, u'captain'),
  (-0.16785535, u'architect'),
  (-0.16702051, u'financier'),
  (-0.16313635, u'warrior'),
  (-0.1528085, u'major_leaguer'),
  (-0.15001449, u'trumpeter'),
  (-0.14718857, u'broadcaster'),
  (-0.14637242, u'magician'),
  (-0.14401685, u'fighter_pilot'),
  (-0.13782264, u'boss'),
  (-0.13718198, u'industrialist'),
  (-0.13684872, u'pundit')],
 [(0.1971423, u'interior_designer'),
  (0.20833443, u'housekeeper'),
  (0.21560365, u'stylist'),
  (0.22363187, u'bookkeeper'),
  (0.23776132, u'maid'),
  (0.24125952, u'nun'),
  (0.24782585, u'nanny'),
  (0.24929325, u'hairdresser'),
  (0.24946186, u'paralegal'),
  (0.25276455, u'ballerina'),
  (0.25718826, u'socialite'),
  (0.26647121, u'librarian'),
  (0.27317649, u'receptionist

In [17]:
# First names, interleaved: even positions go to white_names and odd
# positions go to black_names.
names = [
    "Emily", "Aisha", "Anne", "Keisha", "Jill", "Tamika",
    "Allison", "Lakisha", "Laurie", "Tanisha", "Sarah", "Latoya",
    "Meredith", "Kenya", "Carrie", "Latonya", "Kristen", "Ebony",
    "Todd", "Rasheed", "Neil", "Tremayne", "Geoffrey", "Kareem",
    "Brett", "Darnell", "Brendan", "Tyrone", "Greg", "Hakim",
    "Matthew", "Jamal", "Jay", "Leroy", "Brad", "Jermaine",
]
# The list has an even number of entries, so stride-2 slices split it into
# two equally sized groups.
white_names = names[0::2]
black_names = names[1::2]

In [18]:
# racial direction
# Sum the word vectors of each name group and scale each sum to unit length;
# vs[0] belongs to black_names and vs[1] to white_names.
# The outer loop variable is deliberately not called `names`: under Python 2,
# list-comprehension variables leak into the enclosing scope, so that spelling
# would rebind the notebook-level `names` list to `white_names`.
vs = [sum(E_gnews.v(w) for w in group) for group in (black_names, white_names)]
vs = [v / np.linalg.norm(v) for v in vs]

# Unit vector pointing from the black_names centroid to the white_names one.
v_racial = vs[1] - vs[0]
v_racial = v_racial / np.linalg.norm(v_racial)

In [19]:
# racial analogies
# Runs on `E`, the embedding loaded in the gender-analogies cell, so that cell
# must have been executed first.
a_racial = E.best_analogies_dist_thresh(v_racial)

# Every result has three items; print only the word pair as "first-second".
for first, second, _rest in a_racial:
    print(first + "-" + second)

defensemen-cornerbacks
hipster-hip_hop
punter-cornerback
singer_songwriter-rapper
defenseman-defensive_tackle
pole_vault-triple_jump
musicians-artistes
musician-artiste
catcher-wide_receiver
rock_n_roll-reggae
kicker-kick_returner
tavern-barbershop
freestyle_relay-meter_hurdles
lefthander-swingman
bacon-fried_chicken
artists-rappers
equipment-equipments
hockey-basketball
wool-cotton
unassisted_goal-layup
chocolates-sweets
buddy-cousin
priest-preacher
blue-black
medley_relay-meter_dash
quirky-funky
rabbi-imam
grapes-mango
telecommunications-telecommunication
pitchers-defensive_linemen
passages-verses
er-o
acoustic-soulful
punting-punt_returns
thefts-armed_robbery
bar-nightclub
digs-rebounds
cellist-saxophonist
smarts-quickness
puck-halfcourt
quarterback-tailback
fox-leopard
pedophiles-rapists
potatoes-flour
en-el
infrastructure-infrastructural
evangelism-gospel
fiance-aunt
pointers-dunks
baseman-defensive_lineman
pedophile-rapist
joked-smiled
beer-soft_drink
guitarist-singer
election-el

In [20]:
# profession analysis racial
# Score each profession word by the dot product of its vector with v_racial,
# then order the (score, word) pairs from lowest to highest score.
sp = [(E_gnews.v(word).dot(v_racial), word) for word in profession_words]
sp.sort()

# Show both extremes: the 20 lowest and the 20 highest projections.
(sp[:20], sp[-20:])

([(-0.31546238, u'artiste'),
  (-0.27369621, u'shopkeeper'),
  (-0.27285585, u'taxi_driver'),
  (-0.24248739, u'cab_driver'),
  (-0.23096199, u'preacher'),
  (-0.21709053, u'boxer'),
  (-0.20973529, u'laborer'),
  (-0.2036168, u'barber'),
  (-0.19625022, u'cleric'),
  (-0.18273094, u'bodyguard'),
  (-0.18250424, u'gangster'),
  (-0.18162957, u'singer'),
  (-0.1687707, u'maid'),
  (-0.16871037, u'entertainer'),
  (-0.1619755, u'cabbie'),
  (-0.15332888, u'housewife'),
  (-0.14839582, u'civil_servant'),
  (-0.14115782, u'policeman'),
  (-0.13648963, u'minister'),
  (-0.13296556, u'drug_addict')],
 [(0.087792613, u'organist'),
  (0.090074636, u'philanthropist'),
  (0.091352984, u'cinematographer'),
  (0.093180194, u'manager'),
  (0.093583986, u'investment_banker'),
  (0.096878372, u'professor_emeritus'),
  (0.097828835, u'curator'),
  (0.098648578, u'freelance_writer'),
  (0.09917143, u'programmer'),
  (0.10142039, u'screenwriter'),
  (0.10198854, u'author'),
  (0.10438655, u'inventor'),
