# Operations on word vectors

1. 认识 word embeddings： word vectors 长啥样
2. cosine similarity
3. word analogy 单词类比
4. Debiasing word vectors， 去除性别/种族偏差

## 1. word embeddings

使用已经训练好的 word embeddings： glove.6B.50d.txt

下载地址： https://nlp.stanford.edu/projects/glove/  ， [glove.6B.zip](http://nlp.stanford.edu/data/glove.6B.zip)

**注意**： Jupyter Notebook 默认的数据速率限制（`iopub_data_rate_limit`）很低，只有 1,000,000 bytes/sec，也就是约 1 MB/s。这个文件有 100 多 MB，在默认设置下要等很久。

启动的时候设置： `jupyter notebook --NotebookApp.iopub_data_rate_limit=10000000000`  ， 10G/s 应该不是瓶颈了

In [1]:
import numpy as np

# Read the embeddings into a set of words and a word -> vector dictionary.
def read_glove_vecs(glove_file):
    """Load GloVe-style text embeddings from ``glove_file``.

    Every non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.

    Args:
        glove_file: Path of the embeddings text file.

    Returns:
        A ``(words, word_to_vec_map)`` tuple: the set of tokens read and a
        dict mapping each token to a 1-D ``np.float64`` array.
    """
    words = set()
    word_to_vec_map = {}

    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which is not UTF-8 everywhere and breaks on non-ASCII tokens.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no vector;
                # skip them rather than failing on fields[0].
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)

    return words, word_to_vec_map


# Load the pre-trained 50-dimensional GloVe embeddings; the path is relative
# to the notebook's working directory.
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')

In [2]:
# Vocabulary size.
print(len(words))

# Show one (word, vector) entry to see what an embedding looks like; the
# loop stops after the first item.
for word, vector in word_to_vec_map.items():
    print(word)
    print(vector)
    break

400000
the
[  4.18000000e-01   2.49680000e-01  -4.12420000e-01   1.21700000e-01
   3.45270000e-01  -4.44570000e-02  -4.96880000e-01  -1.78620000e-01
  -6.60230000e-04  -6.56600000e-01   2.78430000e-01  -1.47670000e-01
  -5.56770000e-01   1.46580000e-01  -9.50950000e-03   1.16580000e-02
   1.02040000e-01  -1.27920000e-01  -8.44300000e-01  -1.21810000e-01
  -1.68010000e-02  -3.32790000e-01  -1.55200000e-01  -2.31310000e-01
  -1.91810000e-01  -1.88230000e+00  -7.67460000e-01   9.90510000e-02
  -4.21250000e-01  -1.95260000e-01   4.00710000e+00  -1.85940000e-01
  -5.22870000e-01  -3.16810000e-01   5.92130000e-04   7.44490000e-03
   1.77780000e-01  -1.58970000e-01   1.20410000e-02  -5.42230000e-02
  -2.98710000e-01  -1.57490000e-01  -3.47580000e-01  -4.56370000e-02
  -4.42510000e-01   1.87850000e-01   2.78490000e-03  -1.84110000e-01
  -1.15140000e-01  -7.85810000e-01]


## 2 Cosine similarity

$$\text{CosineSimilarity}(u, v) = \frac{u \cdot v}{||u||_2 \, ||v||_2} = \cos(\theta) \tag{1}$$

<img src="images/cosine_sim.png" style="width:800px;height:250px;">
<caption><center> **Figure 1**: The cosine of the angle between two vectors is a measure of how similar they are</center></caption>


In [3]:
from scipy.spatial.distance import cosine #  cosine = 1 - cosine_similarity

def cosine_similarity(u, v):
    """Return the cosine of the angle between vectors ``u`` and ``v``.

    Implements ``(u . v) / (||u||_2 * ||v||_2)``.

    Args:
        u: NumPy array holding the first vector.
        v: NumPy array holding the second vector.

    Returns:
        The cosine similarity, in [-1, 1] up to rounding. No special
        handling is done for zero-norm inputs, so the result is not finite
        in that case.
    """
    dot = np.dot(u.T, v)
    norm_u = np.linalg.norm(u, 2)
    norm_v = np.linalg.norm(v, 2)

    # Keep the two sequential divisions so results stay bit-identical to the
    # values printed elsewhere in this notebook.
    return dot / norm_u / norm_v


In [4]:
# Fetch the embeddings that are compared below.
father, mother = word_to_vec_map["father"], word_to_vec_map["mother"]
ball, crocodile = word_to_vec_map["ball"], word_to_vec_map["crocodile"]
france, italy = word_to_vec_map["france"], word_to_vec_map["italy"]
paris, rome = word_to_vec_map["paris"], word_to_vec_map["rome"]

# A closely related pair, an unrelated pair, and two difference vectors
# (country - capital versus capital - country).
print("cosine_similarity(father, mother) = ",
      cosine_similarity(father, mother))
print("cosine_similarity(ball, crocodile) = ",
      cosine_similarity(ball, crocodile))
print("cosine_similarity(france - paris, rome - italy) = ",
      cosine_similarity(france - paris, rome - italy))

cosine_similarity(father, mother) =  0.890903844289
cosine_similarity(ball, crocodile) =  0.274392462614
cosine_similarity(france - paris, rome - italy) =  -0.675147930817


**Expected Output**:

<table>
    <tr>
        <td>
            **cosine_similarity(father, mother)** =
        </td>
        <td>
         0.890903844289
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(ball, crocodile)** =
        </td>
        <td>
         0.274392462614
        </td>
    </tr>
        <tr>
        <td>
            **cosine_similarity(france - paris, rome - italy)** =
        </td>
        <td>
         -0.675147930817
        </td>
    </tr>
</table>

## 3 Word analogy task


In [5]:
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """Complete the analogy "word_a is to word_b as word_c is to ____".

    Scans every word in ``word_to_vec_map`` and returns the one whose offset
    from ``word_c`` has the highest cosine similarity with the offset from
    ``word_a`` to ``word_b``.

    Args:
        word_a: First word of the reference pair (lower-cased before lookup).
        word_b: Second word of the reference pair (lower-cased before lookup).
        word_c: Word whose counterpart is searched (lower-cased before lookup).
        word_to_vec_map: Mapping from words to their embedding vectors.

    Returns:
        The best-matching word, or ``None`` if the map holds no candidate
        other than the three input words.

    Raises:
        KeyError: If ``word_to_vec_map`` is a dict and one of the three
            lower-cased words is missing from it.
    """
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # The a -> b offset is identical for every candidate: compute it once
    # instead of once per vocabulary entry.
    reference_offset = e_b - e_a
    # The input words themselves are never valid answers.
    excluded = {word_a, word_b, word_c}

    max_cosine_sim = -100              # lower than any possible cosine value
    best_word = None                   # best candidate found so far

    # loop over the whole word vector set
    for w, e_w in word_to_vec_map.items():
        if w in excluded:
            continue
        cosine_sim = cosine_similarity(reference_offset, e_w - e_c)

        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w
    return best_word

In [6]:
# Each triad (a, b, c) asks: a is to b as c is to ...?
triads_to_try = [
    ('italy', 'italian', 'spain'),
    ('india', 'delhi', 'japan'),
    ('man', 'woman', 'boy'),
    ('small', 'smaller', 'large'),
]
for triad in triads_to_try:
    best = complete_analogy(*triad, word_to_vec_map)
    print('{} -> {} :: {} -> {}'.format(*triad, best))

italy -> italian :: spain -> spanish
india -> delhi :: japan -> tokyo
man -> woman :: boy -> girl
small -> smaller :: large -> larger


**Expected Output**:

<table>
    <tr>
        <td>
            **italy -> italian** ::
        </td>
        <td>
         spain -> spanish
        </td>
    </tr>
        <tr>
        <td>
            **india -> delhi** ::
        </td>
        <td>
         japan -> tokyo
        </td>
    </tr>
        <tr>
        <td>
            **man -> woman** ::
        </td>
        <td>
         boy -> girl
        </td>
    </tr>
        <tr>
        <td>
            **small -> smaller** ::
        </td>
        <td>
         large -> larger
        </td>
    </tr>
</table>

In [7]:
# man is to woman as king is to ...?
triad = ('man', 'woman', 'king')
best = complete_analogy(*triad, word_to_vec_map)
print('{} -> {} :: {} -> {}'.format(*triad, best))

man -> woman :: king -> princess


In [8]:
# man is to woman as computer is to ...?
triad = ('man', 'woman', 'computer')
best = complete_analogy(*triad, word_to_vec_map)
print('{} -> {} :: {} -> {}'.format(*triad, best))

man -> woman :: computer -> 816-822-8448


In [9]:
# good is to bad as money is to ...?
triad = ('good', 'bad', 'money')
best = complete_analogy(*triad, word_to_vec_map)
print('{} -> {} :: {} -> {}'.format(*triad, best))

good -> bad :: money -> subprime


In [10]:
# lovely is to money as evil is to ...?
triad = ('lovely', 'money', 'evil')
best = complete_analogy(*triad, word_to_vec_map)
print('{} -> {} :: {} -> {}'.format(*triad, best))

lovely -> money :: evil -> funds


In [20]:
# yang is to dong as macbook is to ...?
triad = ('yang', 'dong', 'macbook')
best = complete_analogy(*triad, word_to_vec_map)
print('{} -> {} :: {} -> {}'.format(*triad, best))

yang -> dong :: macbook -> vlcc


## 4 - Debiasing word vectors 

In [13]:
# Candidate gender direction: the offset from "man" to "woman".
woman_vec = word_to_vec_map['woman']
man_vec = word_to_vec_map['man']
bias_woman_man = woman_vec - man_vec
print(bias_woman_man)

[-0.087144    0.2182     -0.40986    -0.03922    -0.1032      0.94165
 -0.06042     0.32988     0.46144    -0.35962     0.31102    -0.86824
  0.96006     0.01073     0.24337     0.08193    -1.02722    -0.21122
  0.695044   -0.00222     0.29106     0.5053     -0.099454    0.40445
  0.30181     0.1355     -0.0606     -0.07131    -0.19245    -0.06115
 -0.3204      0.07165    -0.13337    -0.25068714 -0.14293    -0.224957
 -0.149       0.048882    0.12191    -0.27362    -0.165476   -0.20426
  0.54376    -0.271425   -0.10245    -0.32108     0.2516     -0.33455
 -0.04371     0.01258   ]


一些名字是具有性别特征的，这可以从名字向量与 `bias_woman_man` 矢量（轴）的夹角余弦（cosine）看出来：

In [21]:
print('List of names and their similarities with constructed vector:')

# Girls' and boys' names. A positive similarity means the name points along
# the woman - man direction, a negative one against it.
name_list = ['john', 'marie', 'sophie', 'ronaldo', 'priya',
             'rahul', 'danielle', 'reza', 'katy', 'yasmin']

for w in name_list:
    similarity = cosine_similarity(word_to_vec_map[w], bias_woman_man)
    print(w, similarity)

List of names and their similarities with constructed vector:
john -0.23163356146
marie 0.315597935396
sophie 0.318687898594
ronaldo -0.312447968503
priya 0.17632041839
rahul -0.169154710392
danielle 0.243932992163
reza -0.079304296722
katy 0.283106865957
yasmin 0.233138577679



不幸（？）的是，职业等普通词汇也带有性别特征：

In [27]:
print('Other words and their similarities:')

# Everyday words (objects, fields, occupations) measured against the same
# woman - man direction.
word_list = [
    'lipstick', 'guns', 'science', 'arts', 'literature', 'warrior',
    'doctor', 'tree', 'receptionist', 'technology', 'fashion', 'teacher',
    'engineer', 'pilot', 'computer', 'singer',
]
row_format = '%12s  %10.4f'   # right-aligned word, similarity to 4 decimals
for w in word_list:
    similarity = cosine_similarity(word_to_vec_map[w], bias_woman_man)
    print(row_format % (w, similarity))

Other words and their similarities:
    lipstick      0.2769
        guns     -0.1888
     science     -0.0608
        arts      0.0082
  literature      0.0647
     warrior     -0.2092
      doctor      0.1190
        tree     -0.0709
receptionist      0.3308
  technology     -0.1319
     fashion      0.0356
     teacher      0.1792
    engineer     -0.0804
       pilot      0.0011
    computer     -0.1033
      singer      0.1850



### 4.1 - Neutralize bias for non-gender specific words 


<img src="images/neutral.png" style="width:800px;height:300px;">
<caption><center> **Figure 2**: The word vector for "receptionist" represented before and after applying the neutralize operation. </center></caption>


$$e^{bias\_component} = \frac{e \cdot g}{||g||_2^2} * g\tag{2}$$
$$e^{debiased} = e - e^{bias\_component}\tag{3}$$


In [None]:
def neutralize(word, bias_vector, word_to_vec_map):
    """Remove the bias-axis component from the embedding of ``word``.

    The embedding is projected onto the normalized ``bias_vector`` and that
    projection is subtracted, so the result is orthogonal to the bias axis.
    ``word_to_vec_map`` is not modified.

    Args:
        word: Key of the embedding to neutralize.
        bias_vector: NumPy array giving the bias direction (any length).
        word_to_vec_map: Mapping from words to their embedding vectors.

    Returns:
        The debiased embedding as a new array.
    """
    # Unit vector along the bias axis.
    axis = bias_vector / np.linalg.norm(bias_vector, 2)

    embedding = word_to_vec_map[word]
    # Projection of the embedding onto the bias axis.
    along_axis = np.dot(embedding, axis) * axis

    return embedding - along_axis

In [None]:
# Build the bias direction here: no earlier cell defines `g`, so without
# this line the cell raises NameError when the notebook runs top to bottom.
g = word_to_vec_map['woman'] - word_to_vec_map['man']

e = "receptionist"
print("cosine similarity between " + e + " and g, before neutralizing: ", cosine_similarity(word_to_vec_map[e], g))

e_debiased = neutralize(e, g, word_to_vec_map)
print("cosine similarity between " + e + " and g, after neutralizing: ", cosine_similarity(e_debiased, g))

**Expected Output**: The second result is essentially 0, up to numerical roundoff (on the order of $10^{-17}$).


<table>
    <tr>
        <td>
            **cosine similarity between receptionist and g, before neutralizing:** :
        </td>
        <td>
         0.330779417506
        </td>
    </tr>
        <tr>
        <td>
            **cosine similarity between receptionist and g, after neutralizing:** :
        </td>
        <td>
         -3.26732746085e-17
        </td>
    </tr>
</table>

### 4.2 - Equalization algorithm for gender-specific words

<img src="images/equalize10.png" style="width:800px;height:400px;">


The derivation of the linear algebra to do this is a bit more complex. (See Bolukbasi et al., 2016 for details.) But the key equations are: 

$$ \mu = \frac{e_{w1} + e_{w2}}{2}\tag{4}$$ 

$$ \mu_{B} = (\mu \cdot \text{bias_unit_vector}) *\text{bias_unit_vector}  \tag{5}$$ 

$$\mu_{\perp} = \mu - \mu_{B} \tag{6}$$

$$ e_{w1B} = (e_{w1} \cdot \text{bias_unit_vector}) *\text{bias_unit_vector}
\tag{7}$$ 
$$ e_{w2B} = (e_{w2} \cdot \text{bias_unit_vector}) *\text{bias_unit_vector}
\tag{8}$$


$$e_{w1B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w1B}} - \mu_B} {||e_{w1B} - \mu_B||} \tag{9}$$


$$e_{w2B}^{corrected} = \sqrt{ |{1 - ||\mu_{\perp} ||^2_2} |} * \frac{e_{\text{w2B}} - \mu_B} {||e_{w2B} - \mu_B||} \tag{10}$$

$$e_1 = e_{w1B}^{corrected} + \mu_{\perp} \tag{11}$$
$$e_2 = e_{w2B}^{corrected} + \mu_{\perp} \tag{12}$$


**Exercise**: Implement the function below. Use the equations above to get the final equalized version of the pair of words. Good luck!

In [None]:
def equalize(pair, bias_vector, word_to_vec_map):
    """Equalize two words so they differ only along the bias axis.

    Follows equations (4)-(12): both words keep the off-axis part of their
    mean, and each receives a rescaled on-axis part taken from its own
    projection relative to the mean's projection.

    Args:
        pair: Two-item sequence with the words to equalize.
        bias_vector: NumPy array giving the bias direction (any length).
        word_to_vec_map: Mapping from words to their embedding vectors.

    Returns:
        A tuple ``(e1, e2)`` with the equalized vectors of the first and
        second word of ``pair``.
    """
    first_word, second_word = pair
    e_w1 = word_to_vec_map[first_word]
    e_w2 = word_to_vec_map[second_word]

    # Unit vector along the bias axis.
    axis = bias_vector / np.linalg.norm(bias_vector, 2)

    def project(vec):
        # Component of `vec` along the bias axis (equations 5, 7, 8).
        return np.dot(vec, axis) * axis

    # Mean of the pair, split into on-axis and off-axis parts (4)-(6).
    mu = (e_w1 + e_w2) / 2
    mu_B = project(mu)
    mu_orth = mu - mu_B

    # Common length of the corrected on-axis parts (9)-(10).
    scale = np.sqrt(np.abs(1 - np.sum(mu_orth * mu_orth)))

    equalized = []
    for e_w in (e_w1, e_w2):
        offset = project(e_w) - mu_B
        # (11)-(12): shared off-axis part plus the rescaled on-axis part.
        equalized.append(mu_orth + scale / np.linalg.norm(offset, 2) * offset)

    return equalized[0], equalized[1]

In [None]:
# Bias direction used for this experiment.
g = word_to_vec_map['woman'] - word_to_vec_map['man']

print("!!!!!!bias = woman - man!!!!!!!!")

# Similarity of the raw embeddings with the bias direction.
print("cosine similarities before equalizing:")
print('cosine_similarity(word_to_vec_map["boy"], gender) = ',
      cosine_similarity(word_to_vec_map["boy"], g))
print('cosine_similarity(word_to_vec_map["girl"], gender) = ',
      cosine_similarity(word_to_vec_map["girl"], g))
print()

# Same measurement after equalizing the pair.
e1, e2 = equalize(("boy", "girl"), g, word_to_vec_map)
print("cosine similarities after equalizing:")
print("cosine_similarity(e1, gender) = ",
      cosine_similarity(e1, g))
print("cosine_similarity(e2, gender) = ",
      cosine_similarity(e2, g))

In [None]:
# Three candidate gender directions built from different word pairs.
g1 = word_to_vec_map['woman'] - word_to_vec_map['man']
g2 = word_to_vec_map['mother'] - word_to_vec_map['father']
g3 = word_to_vec_map['girl'] - word_to_vec_map['boy']

# Compare the woman - man direction with each of the other two.
for other in (g2, g3):
    print(cosine_similarity(g1, other))

**References**:
- The debiasing algorithm is from Bolukbasi et al., 2016, [Man is to Computer Programmer as Woman is to
Homemaker? Debiasing Word Embeddings](https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf)
- The GloVe word embeddings are due to Jeffrey Pennington, Richard Socher, and Christopher D. Manning. (https://nlp.stanford.edu/projects/glove/)
