# GloVe Word Embedding Association Test

### First the imports

In [1]:
import gensim
from gensim.matutils import softcossim 
from gensim import corpora
import gensim.downloader as api
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from gensim.utils import simple_preprocess
print(gensim.__version__)

3.8.1


## Sample data - Insects and Flowers

In [5]:
flowers = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus', 'iris', 'orchid', 
           'rose', 'bluebell', 'daffodil', 'lilac', 'pansy', 'tulip', 'buttercup', 'daisy', 'lily', 
           'peony', 'violet', 'carnation', 'geranium','magnolia', 'petunia', 'zinnia']  

insects = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede', 'fly', 'maggot', 
           'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito', 'termite', 'beetle', 'cricket', 
           'hornet', 'moth', 'wasp', 'blackfly','dragonfly', 'horsefly', 'roach', 'weevil']

pleasant = ['caress', 'freedom', 'health', 'love', 'peace', 'cheer', 'friend','heaven', 'loyal',
            'pleasure', 'diamond', 'gentle', 'honest', 'lucky', 'rainbow', 'diploma', 'gift', 
            'honor', 'miracle', 'sunrise', 'family', 'happy', 'laughter', 'paradise', 'vacation']

unpleasant = ['abuse', 'crash', 'filth', 'murder', 'sickness', 'accident', 'death', 
              'grief', 'poison', 'stink',  'assault', 'disaster', 'hatred', 'pollute', 'tragedy', 
              'divorce', 'jail', 'poverty', 'ugly', 'cancer', 'kill', 'rotten', 'vomit', 'agony', 'prison']

## Load the GloVe pre-trained embeddings

From the Common Crawl Corpus 2.2 million unique words, 300-dimensional vector

In [3]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.840B.300d.txt' # glove.840B.300d
word2vec_output_file = 'glove840b.300d.w2v'
#glove2word2vec(glove_input_file, word2vec_output_file)

### now load it

In [4]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(word2vec_output_file)

### now convert data to vectors

In [20]:
nurse = ["care", "patient", "nurse", "nursing", "patients", "health", "hospital", "RN", "staff", "therapy"

         "healthcare", "provider", "practitioner", "staffing", "outpatient", "family", "famiies", "home", "clinic", 

male = ['man', 'men', 'brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]

female = ['woman', 'women', "sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

25 25


## Now, we need to convert the formulas from the paper

In [55]:
def mean_sim(word, attributes):
    cos = [cosine_similarity([word], [a]) for a in attributes]
    cos2 = [x[0][0] for x in cos]
    return (np.mean(cos2))

def testWord(w, A, B):
    # this gives us the s(w, A, B) from the original paper
    print("males: %f   female %f" % (mean_sim(w, A) , mean_sim(w, B)))
    return mean_sim(w, A) - mean_sim(w, B)

def numerator(X, Y, A, B):
    return np.mean([testWord(x, A, B) for x in X]) - np.mean([testWord(y, A, B) for y in Y])
    
#[ x[0][0] for x in mean_sim(target1[0], attribute1) ]   
#testWord(target1[0], attribute1, attribute2)
numerator(target1, target2, attribute1, attribute2)

males: 0.307276   female 0.345356
males: 0.264841   female 0.322488
males: 0.296172   female 0.448483
males: 0.210416   female 0.327193
males: 0.168572   female 0.232252
males: 0.214436   female 0.263055
males: 0.326970   female 0.366775
males: 0.001115   female 0.095459
males: 0.226772   female 0.210137
males: 0.151517   female 0.224009
males: 0.127146   female 0.159022
males: 0.101282   female 0.123084
males: 0.185828   female 0.213213
males: 0.027250   female 0.051616
males: 0.060516   female 0.112707
males: 0.452772   female 0.466966
males: -0.102539   female -0.086660
males: 0.378629   female 0.379099
males: 0.226569   female 0.281714
males: 0.091845   female 0.143824
males: 0.292008   female 0.166306
males: 0.150269   female 0.126159
males: 0.052183   female 0.058905
males: 0.078372   female 0.092080
males: 0.106126   female 0.074782
males: 0.113220   female 0.086201
males: -0.059521   female -0.078201
males: -0.014253   female -0.053625
males: 0.162285   female 0.138887
males: 0

-0.07319996

### now the standard deviation from the paper

In [14]:
def denominator(X, Y, A, B):
    full = np.concatenate((X,Y))
    foo = [testWord(x, A, B) for x in full]
    return(np.std(foo))
    
print(denominator(target1, target2, attribute1, attribute2) )   


0.051646188


## finally the Cohen's D

In [15]:
def cohenD(t1, t2, a1, a2):
    return numerator(t1, t2, a1, a2) / denominator(t1, t2,  a1, a2)

In [17]:
cohenD(target1, target2, attribute2, attribute1)

1.383523

### Results
GloVe results on flowers / insects Cohen's D of 1.5339754

In [39]:
len(np.concatenate((attribute1, attribute2)))

50

# More sample data

### WEAT Science Arts / male female

In [14]:
science = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
arts = ['poetry', "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
male = ['brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

In [15]:
target1 = [model[w] for w in science]
target2 = [model[w] for w in arts]
attribute1 = [model[w] for w in male]
attribute2 = [model[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.278036

#### Cohen D for Science Arts 1.278036

### WEAT Career, Family / male, female


In [16]:
career = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
family = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
maleNames = ["John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill"]
femaleNames = ["Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna"]

In [17]:
target1 = [model[w] for w in career]
target2 = [model[w] for w in family]
attribute1 = [model[w] for w in maleNames]
attribute2 = [model[w] for w in femaleNamesWEAT]
cohenD(target1, target2, attribute1, attribute2)

1.8607559

#### Cohen's D for Career Family / male female names  1.8607559

### WEAT Instruments  Weapons / Pleasant Unpleasant

In [4]:
instruments = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica", "mandolin",
"trumpet", "bassoon", "drum", "harp", "oboe", "tuba", "bell", "fiddle", "harpsichord", "piano", "viola", "bongo",
"flute", "horn", "saxophone", "violin"]

weapons = ["arrow", "club", "gun", "missile", "spear", "axe", "dagger", "harpoon", "pistol", "sword", "blade",
"dynamite", "hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun", "teargas", "cannon", "grenade",
"mace", "slingshot", "whip"]

In [18]:
glove.6B.300d

1.5495626

#### Cohen D for instruments  weapons  1.5495626

##  Profession Related

In [26]:
professions1 = ['programmer', 'engineer', 'scientist', "developer", "mechanic", "Javascript", "Google", "aerospace" ]
professions2 = ['nurse', 'teacher', 'librarian', "clerk", "volunteer", "community", "bookkeeper", "cashier" ]
#MAN	HE	MEN	HIM	BOY	HIS	
male = ['man', "men", "his", 'he', 'him']
female = ['woman', "women", 'she', 'her', "hers"]

model['Javascript']

In [27]:
nurse = ["care", "patient", "nurse", "nursing", "patients", "health", "hospital", "RN", "staff", "therapy"

         "healthcare", "provider", "practitioner", "staffing", "outpatient", "family", "famiies", "home", "clinic", 

male = ['man', 'men', 'brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]

female = ['woman', 'women', "sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]target1 = [model[w] for w in professions1]
target2 = [model[w] for w in professions2]
attribute1 = [model[w] for w in male]
attribute2 = [model[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.5763113

## Profession Related cohen 1.57
next try to find the most common words in resumes


# FastText  Model

In [28]:
glove.6B.300dimport pickle
fasttext = pickle.load(open("fasttext.p", 'rb'))

## Flowers Insects

In [29]:
flowers = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus', 'iris', 'orchid', 
           'rose', 'bluebell', 'daffodil', 'lilac', 'pansy', 'tulip', 'buttercup', 'daisy', 'lily', 
           'peony', 'violet', 'carnation', 'geranium','magnolia', 'petunia', 'zinnia']  

insects = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede', 'fly', 'maggot', 
           'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito', 'termite', 'beetle', 'cricket', 
           'hornet', 'moth', 'wasp', 'blackfly','dragonfly', 'horsefly', 'roach', 'weevil']

pleasant = ['caress', 'freedom', 'health', 'love', 'peace', 'cheer', 'friend','heaven', 'loyal',
            'pleasure', 'diamond', 'gentle', 'honest', 'lucky', 'rainbow', 'diploma', 'gift', 
            'honor', 'miracle', 'sunrise', 'family', 'happy', 'laughter', 'paradise', 'vacation']

unpleasant = ['abuse', 'crash', 'filth', 'murder', 'sickness', 'accident', 'death', 
              'grief', 'poison', 'stink',  'assault', 'disaster', 'hatred', 'pollute', 'tragedy', 
              'divorce', 'jail', 'poverty', 'ugly', 'cancer', 'kill', 'rotten', 'vomit', 'agony', 'prison']

In [30]:
target1 = [fasttext[w] for w in flowers]
target2 = [fasttext[w] for w in insects]
attribute1 = [fasttext[w] for w in pleasant]
attribute2 = [fasttext[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.5240668

#### FastText Flower Insect Cohen D 1,5240668

## Science Arts

In [31]:
science = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
arts = ['poetry', "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
male = ['brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

In [32]:
target1 = [fasttext[w] for w in science]
target2 = [fasttext[w] for w in arts]
attribute1 = [fasttext[w] for w in male]
attribute2 = [fasttext[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.008413

#### FastText Science Arts Cohen D 1.008413

## Career Family

In [34]:
glove.6B.300dtarget1 = [fasttext[w] for w in career]
target2 = [fasttext[w] for w in family]
attribute1 = [fasttext[w] for w in maleNames]
attribute2 = [fasttext[w] for w in femaleNames]
cohenD(target1, target2, attribute1, attribute2)

1.7501867

#### FastText Career Family  Cohen D 1.7501867

## Instruments Weapons

In [35]:
target1 = [fasttext[w] for w in instruments]
target2 = [fasttext[w] for w in weapons]
attribute1 = [fasttext[w] for w in pleasant]
attribute2 = [fasttext[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)glove.6B.300d

1.6313261

#### FastText Instruments Weapons Cohen D 1.6313261

In [1]:
!ls


AssociationTestFasttext.ipynb  fasttext.p	      resumes
GloVe.ipynb		       fasttext_test.ipynb    text8.zip
Untitled.ipynb		       glove.840B.300d.txt    w2v.p
Untitled1.ipynb		       glove840b.300d.w2v     word2vec-small.ipynb
crawl-300d-2M-subword.bin      iat.ipynb	      word2vec-v1.ipynb
crawl-300d-2M-subword.vec      keras_word2vec.py      word2vec.model
data1.txt		       linkedInData_1000.zip  word2vec.py
data1.zip		       out


# GloVe Wikipedia Corpus

## Load the GloVe pre-trained embeddings

From the Wikepedia Corpus, 300-dimensional vector

In [3]:
from gensim.scripts.glove2word2vec import glove2word2vec
glove_input_file = 'glove.6B.300d.txt' # glove.840B.300d
word2vec_output_file = 'glove6b.300d.w2v'
#glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 300)

## Now load it

In [6]:
from gensim.models import KeyedVectors
model = KeyedVectors.load_word2vec_format(word2vec_output_file)

In [7]:
target1 = [model[w] for w in flowers]
target2 = [model[w] for w in insects]
attribute1 = [model[w] for w in pleasant]
attribute2 = [model[w] for w in unpleasant]
print(len(target1), len(target2))glove.6B.300d

25 25


In [11]:
cohenD(target1, target2, attribute1, attribute2)

1.4024032

#### GloVE Wikipedia - flowers insects 1.4024032

In [16]:
science = ["science", "technology", "physics", "chemistry",  "space", "experiment", "astronomy", "theoretical"]
arts = ['poetry', "art", "song6495955", "dance", "literature", "novel", "symphony", "drama"]
male = ['brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]
target1 = [model[w] for w in science]
target2 = [model[w] for w in arts]
attribute1 = [model[w] for w in male]
attribute2 = [model[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.5329299

#### GloVE Wikipedia - science arts 1.5329299 - Enstein NASA out of vocabulary

In [17]:
instruments = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica", "mandolin",
"trumpet", "bassoon", "drum", "harp", "oboe", "tuba", "bell", "fiddle", "harpsichord", "piano", "viola", "bongo",
"flute", "horn", "saxophone", "violin"]

weapons = ["arrow", "club", "gun", "missile", "spear", "axe", "dagger", "harpoon", "pistol", "sword", "blade",
"dynamite", "hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun", "teargas", "cannon", "grenade",
"mace", "slingshot", "whip"]

target1 = [model[w] for w in instruments]
target2 = [model[w] for w in weapons]
attribute1 = [model[w] for w in pleasant]
attribute2 = [model[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.4501393

#### GloVE Wikipedia instruments / weapons 1.45

In [18]:
career = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
family = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
maleNames = ["John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill"]
femaleNames = ["Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna"]
target1 = [model[w] for w in career]
target2 = [model[w] for w in family]
attribute1 = [model[w] for w in maleNames]
attribute2 = [model[w] for w in femaleNamesWEAT]
cohenD(target1, target2, attribute1, attribute2)

KeyError: "word 'John' not in vocabulary"

# word2vec

In [19]:
w2v = api.load('word2vec-google-news-300')

## Flowers Insects - Cohen D 1.555256

In [20]:
target1 = [w2v[w] for w in flowers]
target2 = [w2v[w] for w in insects]
attribute1 = [w2v[w] for w in pleasant]
attribute2 = [w2v[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.5552562

## Musical Instruments Weapons - 1.6495955  -> axe -> ax

In [22]:
instruments = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica", "mandolin",
"trumpet", "bassoon", "drum", "harp", "oboe", "tuba", "bell", "fiddle", "harpsichord", "piano", "viola", "bongo",
"flute", "horn", "saxophone", "violin"]

weapons = ["arrow", "club", "gun", "missile", "spear", "ax", "dagger", "harpoon", "pistol", "sword", "blade",
"dynamite", "hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun", "teargas", "cannon", "grenade",
"mace", "slingshot", "whip"]
target1 = [w2v[w] for w in instruments]
target2 = [w2v[w] for w in weapons]
attribute1 = [w2v[w] for w in pleasant]
attribute2 = [w2v[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.6495955

## Career Family - 1.7738411

In [23]:
career = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
family = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
maleNames = ["John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill"]
femaleNames = ["Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna"]

target1 = [w2v[w] for w in career]
target2 = [w2v[w] for w in family]
attribute1 = [w2v[w] for w in maleNames]
attribute2 = [w2v[w] for w in femaleNames]
cohenD(target1, target2, attribute1, attribute2)

1.7738411

## Science and Arts - 1.284648

In [24]:
science = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
arts = ['poetry', "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
male = ['brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

target1 = [w2v[w] for w in science]
target2 = [w2v[w] for w in arts]
attribute1 = [w2v[w] for w in male]
attribute2 = [w2v[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.284648

In [2]:
# Large FastText

In [5]:
model = gensim.models.KeyedVectors.load_word2vec_format('crawl-300d-2M-subword.vec')  

In [6]:
flowers = ['aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus', 'iris', 'orchid', 
           'rose', 'bluebell', 'daffodil', 'lilac', 'pansy', 'tulip', 'buttercup', 'daisy', 'lily', 
           'peony', 'violet', 'carnation', 'geranium','magnolia', 'petunia', 'zinnia']  

insects = ['ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede', 'fly', 'maggot', 
           'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito', 'termite', 'beetle', 'cricket', 
           'hornet', 'moth', 'wasp', 'blackfly','dragonfly', 'horsefly', 'roach', 'weevil']

pleasant = ['caress', 'freedom', 'health', 'love', 'peace', 'cheer', 'friend','heaven', 'loyal',
            'pleasure', 'diamond', 'gentle', 'honest', 'lucky', 'rainbow', 'diploma', 'gift', 
            'honor', 'miracle', 'sunrise', 'family', 'happy', 'laughter', 'paradise', 'vacation']

unpleasant = ['abuse', 'crash', 'filth', 'murder', 'sickness', 'accident', 'death', 
              'grief', 'poison', 'stink',  'assault', 'disaster', 'hatred', 'pollute', 'tragedy', 
              'divorce', 'jail', 'poverty', 'ugly', 'cancer', 'kill', 'rotten', 'vomit', 'agony', 'prison']

### Flowers 1.605917

In [11]:
target1 = [model[w] for w in flowers]
target2 = [model[w] for w in insects]
attribute1 = [model[w] for w in pleasant]
attribute2 = [model[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.605917

In [12]:
science = ["science", "technology", "physics", "chemistry", "Einstein", "NASA", "experiment", "astronomy"]
arts = ['poetry', "art", "Shakespeare", "dance", "literature", "novel", "symphony", "drama"]
male = ['brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ["sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

target1 = [model[w] for w in science]
target2 = [model[w] for w in arts]
attribute1 = [model[w] for w in male]
attribute2 = [model[w] for w in female]
cohenD(target1, target2, attribute1, attribute2)

1.1989039

In [14]:
nurse = ["care", "patient", "nurse", "nursing", "patients", "health", "hospital", "RN", "staff", "therapy"

         "healthcare", "provider", "practitioner", "staffing", "outpatient", "family", "famiies", "home", "clinic", 

male = ['man', 'men', 'brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]

female = ['woman', 'women', "sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]career = ["executive", "management", "professional", "corporation", "salary", "office", "business", "career"]
family = ["home", "parents", "children", "family", "cousins", "marriage", "wedding", "relatives"]
maleNames = ["John", "Paul", "Mike", "Kevin", "Steve", "Greg", "Jeff", "Bill"]
femaleNames = ["Amy", "Joan", "Lisa", "Sarah", "Diana", "Kate", "Ann", "Donna"]
target1 = [model[w] for w in career]
target2 = [model[w] for w in family]
attribute1 = [model[w] for w in maleNames]
attribute2 = [model[w] for w in femaleNames]
cohenD(target1, target2, attribute1, attribute2)

1.7742442

In [15]:
instruments = ["bagpipe", "cello", "guitar", "lute", "trombone", "banjo", "clarinet", "harmonica", "mandolin",
"trumpet", "bassoon", "drum", "harp", "oboe", "tuba", "bell", "fiddle", "harpsichord", "piano", "viola", "bongo",
"flute", "horn", "saxophone", "violin"]

weapons = ["arrow", "club", "gun", "missile", "spear", "axe", "dagger", "harpoon", "pistol", "sword", "blade",
"dynamite", "hatchet", "rifle", "tank", "bomb", "firearm", "knife", "shotgun", "teargas", "cannon", "grenade",
"mace", "slingshot", "whip"]
target1 = [model[w] for w in instruments]
target2 = [model[w] for w in weapons]
attribute1 = [model[w] for w in pleasant]
attribute2 = [model[w] for w in unpleasant]
cohenD(target1, target2, attribute1, attribute2)

1.4734423

#### misc 1.38

In [77]:
#nurse = ["clinical", "care", "patient", "nurse", "nursing", "patients", "health", "hospital", "RN", "staff", "therapy",
#         "healthcare", "provider", "practitioner", "staffing", "outpatient", "family", "famiies", "home", "clinic", "clinical"]
engineer = ["engineer", "development", "data", "products", "tools", "technology", "Javascript", "JavaScript", 'expertise', "Amazon", "code",
           "Google", "technical", "computer", "industry", "design", "technologies", "web", "software", "Science", "science" ]
nurse = ["care", "patient", "Nurse", "nursing", 'patients', "clinical", "health", "staff", "Health", "Nursing", "Care", 'hospital' ]
male = ['man', 'men', 'brother', "father", "uncle", "grandfather", "son", "he", "his", "him"]
female = ['woman', 'women', "sister", "mother", "aunt", "grandmother", "daughter", "she", "hers", "her"]

In [78]:
target1 = [model[w] for w in nurse]
target2 = [model[w] for w in engineer]
attribute2 = [model[w] for w in male]
attribute1 = [model[w] for w in female]
print(len(target1), len(target2))

12 21


In [82]:
def mean_sim(word, attributes):
    cos = [cosine_similarity([word], [a]) for a in attributes]
    cos2 = [x[0][0] for x in cos]
    return (np.mean(cos2))

def testWord(w, A, B):
    # this gives us the s(w, A, B) from the original paper
    print("females: %f   males %f" % (mean_sim(w, A) , mean_sim(w, B)))
    return mean_sim(w, A) - mean_sim(w, B)

In [83]:
testWord(target1[3], attribute1, attribute2)

females: 0.327193   males 0.210416


0.11677688

In [84]:
cohenD(target1, target2, attribute1, attribute2)

females: 0.345356   males 0.307276
females: 0.322488   males 0.264841
females: 0.162548   males 0.028264
females: 0.327193   males 0.210416
females: 0.232252   males 0.168572
females: 0.143824   males 0.091845
females: 0.263055   males 0.214436
females: 0.210137   males 0.226772
females: 0.070028   males 0.010425
females: 0.101862   males -0.017706
females: 0.051313   males 0.008487
females: 0.366775   males 0.326970
females: 0.166306   males 0.292008
females: 0.126159   males 0.150269
females: 0.058905   males 0.052183
females: 0.092080   males 0.078372
females: 0.074782   males 0.106126
females: 0.086201   males 0.113220
females: -0.078201   males -0.059521
females: -0.053625   males -0.014253
females: 0.138887   males 0.162285
females: 0.058699   males 0.042382
females: 0.065541   males 0.087401
females: 0.051500   males 0.096820
females: 0.054945   males 0.121904
females: 0.194768   males 0.206341
females: 0.130289   males 0.141997
females: 0.138995   males 0.130078
females: 0.0229

1.6188601

In [51]:
nurse[4]

'patients'