# Unsupervised Language Learning, Lab1 - Word Similarity

## Adriaan de Vries (10795227), Verna Dankers (10761225)

Before being able to run this code, please import the following libraries and set the following paths to the datasets. Afterwards, the code should run without issues.

In [None]:
# Requirements
from tqdm import tqdm
from pprint import pprint
from collections import defaultdict, Counter
from scipy.stats import spearmanr, pearsonr
from tabulate import tabulate
from gensim.models import KeyedVectors
from gensim.corpora.dictionary import Dictionary
from gensim.scripts.glove2word2vec import glove2word2vec
import numpy as np
import os
import itertools
import matplotlib.pyplot as plt

# Paths to datasets
# Word-embedding files; each one is converted with glove2word2vec and loaded
# as KeyedVectors in the "Read in word embeddings" cell.
bow2_filename = "data/bow2.words"
bow5_filename = "data/bow5.words"
deps_filename = "data/deps.words"
# Gold-standard similarity data sets that the embeddings are scored against.
simlex_filename = "data/SimLex-999.txt"
men_filename = "data/men/MEN_dataset_natural_form_full"
# NOTE(review): the two paths below are not referenced in the cells visible
# here; presumably they are used by later parts of the lab.
analogy_filename = "data/questions-words.txt"
common_words_filename = "data/common_words.words"
# Gates the plotting cell and the cells that print the result tables; with
# False those cells compute nothing visible.
create_figures = False

In [9]:
# Read in word embeddings
def _load_embeddings(filename):
    """Convert an embeddings file to word2vec text format and load it.

    The converted copy is written next to the original file, with the final
    extension replaced by ``.txt``.  ``os.path.splitext`` strips only that
    final extension; splitting on the first "." instead would mangle paths
    such as ``./data/bow2.words`` or directories whose name contains a dot.

    Args:
        filename: path of the embeddings file passed to ``glove2word2vec``.

    Returns:
        The ``KeyedVectors`` loaded from the converted file, after
        ``init_sims(replace=True)`` has been called on it.
    """
    word2vec_filename = os.path.splitext(filename)[0] + ".txt"
    glove2word2vec(filename, word2vec_filename)
    vectors = KeyedVectors.load_word2vec_format(word2vec_filename, binary=False)
    # NOTE(review): per the gensim docs, replace=True keeps only the
    # L2-normalised vectors; confirm against the installed gensim version.
    vectors.init_sims(replace=True)
    return vectors


bow2 = _load_embeddings(bow2_filename)
bow5 = _load_embeddings(bow5_filename)
deps = _load_embeddings(deps_filename)

## Quantitative Analysis

To evaluate the performance on the word similarity task quantitatively, we examine the strength of the correlation between the gold-standard SimLex and MEN scores and the Cosine similarity of the word embeddings for the same word pairs. First, we visualize this relation. In SimLex, both words of a pair have the same POS tag. We used this fact to also visualize the relation per POS tag. We did not do this for the MEN dataset, because it contains pairs with mixed POS tags.

<img src="SimLex.png" />
<img src="MEN.png" />

Second, we quantify the strength using Pearson's correlation coefficient and Spearman's correlation coefficient.

Correlation Coefficients for all pairs in the data
<table>
<thead>
<tr><th>Embeddings  </th><th>Gold standard  </th><th style="text-align: right;">  Spearman's r</th><th style="text-align: right;">  Spearman p-value</th><th style="text-align: right;">  Pearson's r</th><th style="text-align: right;">  Pearson p-value</th></tr>
</thead>
<tbody>
<tr><td>bow2        </td><td>MEN            </td><td style="text-align: right;">      0.699905</td><td style="text-align: right;">      0           </td><td style="text-align: right;">     0.677698</td><td style="text-align: right;">     0           </td></tr>
<tr><td>bow2        </td><td>SimLex         </td><td style="text-align: right;">      0.414146</td><td style="text-align: right;">      1.22681e-42 </td><td style="text-align: right;">     0.428459</td><td style="text-align: right;">     7.993e-46   </td></tr>
<tr><td>bow5        </td><td>MEN            </td><td style="text-align: right;">      0.723169</td><td style="text-align: right;">      0           </td><td style="text-align: right;">     0.708236</td><td style="text-align: right;">     0           </td></tr>
<tr><td>bow5        </td><td>SimLex         </td><td style="text-align: right;">      0.367396</td><td style="text-align: right;">      2.97758e-33 </td><td style="text-align: right;">     0.375601</td><td style="text-align: right;">     8.60741e-35 </td></tr>
<tr><td>deps        </td><td>MEN            </td><td style="text-align: right;">      0.617823</td><td style="text-align: right;">      2.37026e-315</td><td style="text-align: right;">     0.597402</td><td style="text-align: right;">     1.01838e-289</td></tr>
<tr><td>deps        </td><td>SimLex         </td><td style="text-align: right;">      0.445641</td><td style="text-align: right;">      7.4143e-50  </td><td style="text-align: right;">     0.461901</td><td style="text-align: right;">     6.83893e-54 </td></tr>
</tbody>
</table>
Correlation Coefficients per POS tag
<table>
<thead>
<tr><th>Embeddings  </th><th>Gold standard  </th><th style="text-align: right;">  Spearman's r</th><th style="text-align: right;">  Spearman p-value</th><th style="text-align: right;">  Pearson's r</th><th style="text-align: right;">  Pearson p-value</th></tr>
</thead>
<tbody>
<tr><td>bow2        </td><td>SimLex + N     </td><td style="text-align: right;">      0.436845</td><td style="text-align: right;">       2.07111e-32</td><td style="text-align: right;">     0.437518</td><td style="text-align: right;">      1.62424e-32</td></tr>
<tr><td>bow2        </td><td>SimLex + V     </td><td style="text-align: right;">      0.322356</td><td style="text-align: right;">       9.76449e-07</td><td style="text-align: right;">     0.360907</td><td style="text-align: right;">      3.35007e-08</td></tr>
<tr><td>bow2        </td><td>SimLex + A     </td><td style="text-align: right;">      0.538252</td><td style="text-align: right;">       1.1097e-09 </td><td style="text-align: right;">     0.523061</td><td style="text-align: right;">      3.87401e-09</td></tr>
<tr><td>bow5        </td><td>SimLex + N     </td><td style="text-align: right;">      0.380137</td><td style="text-align: right;">       2.53128e-24</td><td style="text-align: right;">     0.385199</td><td style="text-align: right;">      5.53728e-25</td></tr>
<tr><td>bow5        </td><td>SimLex + V     </td><td style="text-align: right;">      0.273831</td><td style="text-align: right;">       3.67757e-05</td><td style="text-align: right;">     0.300255</td><td style="text-align: right;">      5.53094e-06</td></tr>
<tr><td>bow5        </td><td>SimLex + A     </td><td style="text-align: right;">      0.48118 </td><td style="text-align: right;">       9.00169e-08</td><td style="text-align: right;">     0.463548</td><td style="text-align: right;">      2.99891e-07</td></tr>
<tr><td>deps        </td><td>SimLex + N     </td><td style="text-align: right;">      0.474313</td><td style="text-align: right;">       1.16325e-38</td><td style="text-align: right;">     0.486374</td><td style="text-align: right;">      7.63785e-41</td></tr>
<tr><td>deps        </td><td>SimLex + V     </td><td style="text-align: right;">      0.384972</td><td style="text-align: right;">       3.21298e-09</td><td style="text-align: right;">     0.416133</td><td style="text-align: right;">      1.14898e-10</td></tr>
<tr><td>deps        </td><td>SimLex + A     </td><td style="text-align: right;">      0.496927</td><td style="text-align: right;">       2.89754e-08</td><td style="text-align: right;">     0.48534 </td><td style="text-align: right;">      6.70912e-08</td></tr>
</tbody>
</table>

#### 1. Read in the data

In [11]:
# Read in simlex and men data
# simlex maps (word1, word2) -> {column name: value}; men maps
# (word1, word2) -> gold score as a float.
simlex = dict()
men = dict()

with open(simlex_filename, 'r', encoding='utf-8') as f:
    # Drop the two word columns from the header; headers[0] then names the
    # only non-numeric column (stored as a string below) and headers[1:]
    # name the numeric score columns.
    headers = f.readline().split()[2:]
    for line in f:
        fields = line.split()
        if not fields:
            # A blank (e.g. trailing) line would otherwise raise IndexError.
            continue
        entry = {header: float(score)
                 for header, score in zip(headers[1:], fields[3:])}
        entry[headers[0]] = fields[2]
        simlex[(fields[0], fields[1])] = entry

with open(men_filename, 'r', encoding='utf-8') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on fields[0].
            continue
        men[(fields[0], fields[1])] = float(fields[2])

#### 2. Create graphs showing the Cosine similarity for word pairs vs. the SimLex or MEN score

In [12]:
def score_men(men, embeddings):
    """Pair the embeddings' similarity with the MEN score for every word pair.

    Args:
        men: mapping from a word pair (indexable, two words) to its MEN score.
        embeddings: object with a ``similarity(word1, word2)`` method; pairs
            for which that call raises ``KeyError`` are skipped.

    Returns:
        A 4-tuple ``(similarities, gold_scores, bad, best)``.  The first two
        are aligned lists.  ``bad`` lists formatted pairs on which the two
        scores disagree strongly (similarity > 0.75 with MEN < 12.5, or
        similarity < 0.25 with MEN > 37.5); ``best`` lists formatted pairs
        where both are high (similarity > 0.75 and MEN > 37.5).
    """
    similarities, gold_scores = [], []
    bad, best = [], []
    for pair, gold in men.items():
        first, second = pair[0], pair[1]
        try:
            cosine = embeddings.similarity(first, second)
        except KeyError:
            # No similarity available for this pair: leave it out entirely.
            continue
        similarities.append(cosine)
        gold_scores.append(gold)

        description = "({}, {}), {:.2f}, {:.2f}".format(first, second, cosine, gold)
        overestimated = cosine > 0.75 and gold < 12.5
        underestimated = cosine < 0.25 and gold > 37.5
        if overestimated or underestimated:
            bad.append(description)
        if cosine > 0.75 and gold > 37.5:
            best.append(description)
    return (similarities, gold_scores, bad, best)

def score_simlex(simlex, name, embeddings):
    """Pair the embeddings' similarity with a SimLex column for every word pair.

    Args:
        simlex: mapping from a word pair (indexable, two words) to a dict of
            per-pair values.
        name: key of the gold score to read from each per-pair dict.
        embeddings: object with a ``similarity(word1, word2)`` method.

    Returns:
        A 4-tuple ``(similarities, gold_scores, bad, best)``.  The first two
        are aligned lists.  ``bad`` lists formatted pairs on which the two
        scores disagree strongly (similarity > 0.75 with gold < 2.5, or
        similarity < 0.25 with gold > 7.5); ``best`` lists formatted pairs
        where both are high (similarity > 0.75 and gold > 7.5).

    A pair is skipped when either lookup raises ``KeyError``.  Both values
    are fetched before anything is appended, so a failing gold lookup can no
    longer leave a similarity without its matching gold score.
    """
    similarities, gold_scores = [], []
    bad, best = [], []
    for pair in simlex:
        try:
            cosine = embeddings.similarity(pair[0], pair[1])
            gold = simlex[pair][name]
        except KeyError:
            continue
        similarities.append(cosine)
        gold_scores.append(gold)

        description = "({}, {}), {:.2f}, {:.2f}".format(pair[0], pair[1], cosine, gold)
        if (cosine > 0.75 and gold < 2.5) or (cosine < 0.25 and gold > 7.5):
            bad.append(description)
        if cosine > 0.75 and gold > 7.5:
            best.append(description)
    return (similarities, gold_scores, bad, best)

def score_simlex_pos(simlex, name, embeddings):
    """Group similarity and gold scores of SimLex pairs by their "POS" value.

    Args:
        simlex: mapping from a word pair (indexable, two words) to a dict of
            per-pair values, including a "POS" entry.
        name: key of the gold score to read from each per-pair dict.
        embeddings: object with a ``similarity(word1, word2)`` method.

    Returns:
        A 2-tuple ``(similarities_by_pos, gold_by_pos)`` of
        ``defaultdict(list)`` objects keyed by POS value; the lists stored
        under the same key are aligned.

    A pair is skipped when any lookup raises ``KeyError``.  All values are
    fetched before either dict is touched, so a skipped pair neither creates
    an empty POS group nor leaves the two groups with different lengths.
    """
    similarities_by_pos = defaultdict(list)
    gold_by_pos = defaultdict(list)
    for pair, values in simlex.items():
        try:
            cosine = embeddings.similarity(pair[0], pair[1])
            pos = values["POS"]
            gold = values[name]
        except KeyError:
            continue
        similarities_by_pos[pos].append(cosine)
        gold_by_pos[pos].append(gold)
    return (similarities_by_pos, gold_by_pos)

In [23]:
if create_figures:
    # Each embedding model gets its own subplot in both figures.
    figure_models = [(bow2, 'bow2'), (bow5, 'bow5'), (deps, 'deps')]

    # Figure 1: SimLex999 score vs. Cosine similarity, one scatter series
    # (and legend entry) per POS value.
    figure = plt.figure()
    figure.set_size_inches(15, 5)
    for i, (data, name) in enumerate(figure_models):
        # score_simlex_pos returns (similarities, gold); gold goes on the x-axis.
        yplot, xplot = score_simlex_pos(simlex, "SimLex999", data)

        plt.subplot(1, 3, i+1)
        for pos in xplot:
            plt.scatter(xplot[pos], yplot[pos], alpha=0.3, label=pos)
        plt.legend()
        plt.xlabel("SimLex999")
        plt.ylabel("Cosine Similarity")
        plt.title(name)
    plt.show()

    # Figure 2: MEN score vs. Cosine similarity.
    figure = plt.figure()
    figure.set_size_inches(15, 5)
    for i, (data, name) in enumerate(figure_models):
        # Only the two score lists are plotted; the bad/best sample lists
        # returned by score_men are not needed here.
        yplot, xplot = score_men(men, data)[:2]
        plt.subplot(1, 3, i+1)
        plt.scatter(xplot, yplot, alpha=0.3)
        plt.xlabel("MEN")
        plt.ylabel("Cosine Similarity")
        plt.title(name)
    plt.show()

#### 3. Pearson's $r$ and Spearman's $\rho$

In [52]:
def _correlations(embed_results, gold):
    """Return (Spearman coefficient, Spearman p-value, Pearson coefficient,
    Pearson p-value) for two aligned score sequences."""
    spearman = spearmanr(embed_results, gold)
    pearson = pearsonr(embed_results, gold)
    return (spearman[0], spearman[1], pearson[0], pearson[1])


# One row per (embeddings, gold standard) combination, and per POS value.
results = []
results_pos = []
# Bad/best sample lists per embedding name, used by the qualitative analysis.
all_men_bad_samples = dict()
all_simlex_bad_samples = dict()
all_men_best_samples = dict()
all_simlex_best_samples = dict()
for data, name in [(bow2, 'bow2'), (bow5, 'bow5'), (deps, 'deps')]:
    # MEN
    embed_results, gold, men_bad_samples, men_best_samples = score_men(men, data)
    all_men_bad_samples[name] = men_bad_samples
    all_men_best_samples[name] = men_best_samples
    results.append((name, "MEN") + _correlations(embed_results, gold))

    # SIMLEX
    embed_results, gold, simlex_bad_samples, simlex_best_samples = score_simlex(simlex, "SimLex999", data)
    all_simlex_bad_samples[name] = simlex_bad_samples
    all_simlex_best_samples[name] = simlex_best_samples
    results.append((name, "SimLex") + _correlations(embed_results, gold))

    # SIMLEX per POS tag
    embed_results, gold = score_simlex_pos(simlex, "SimLex999", data)
    for pos in embed_results:
        results_pos.append(
            (name, "SimLex + {}".format(pos)) + _correlations(embed_results[pos], gold[pos])
        )

if create_figures:
    # Both tables share the same column layout.
    headers = ['Embeddings', 'Gold standard', 'Spearman\'s r',
               'Spearman p-value', 'Pearson\'s r', 'Pearson p-value']

    print("Correlation Coefficients for all pairs in the data")
    print(tabulate(results, headers=headers, tablefmt="html"))

    print("Correlation Coefficients per POS tag")
    print(tabulate(results_pos, headers=headers, tablefmt="html"))

## Qualitative Analysis

To perform a qualitative analysis, we first take a look at pairs for which either the ground truth gives a high score and the embeddings give a low score, or the ground truth gives a low score and the embeddings give a high score. High has been defined as > 7.5 for SimLex, > 37.5 for MEN and > 0.75 for the Cosine similarity. Low has been defined as < 2.5 for SimLex, < 12.5 for MEN and < 0.25 for Cosine similarity. Second, we also check the best samples: samples for which the embeddings and MEN or SimLex mostly agree (> 7.5, > 37.5, > 0.75). Below the tables are listed, first for the bad samples and second for the best samples.

We list our findings about these samples:
1. Upon comparing the SimLex and MEN results for bad samples, we noticed that the set of wrongly scored samples compared to the SimLex scores contains many antonyms (e.g. _encourage / discourage_, _reject / accept_, _summer / winter_), whereas the bad samples retrieved for MEN do not contain antonyms. The antonym pairs are assigned a relatively high similarity score according to the embeddings, but a low score according to SimLex. This can be explained by taking into account how the SimLex score was determined, by asking participants to judge how similar two words are. Although humans might judge that antonyms are related, they may not be judged similar, because they are actually opposites. In text corpora however, it can be difficult to capture this subtle difference. Antonyms may seem similar in corpora because their contexts are often very similar. (_'After the speech I felt encouraged / discouraged'_, _'The company accepted / rejected the proposal.'_, _'I love the summer / winter weather.'_)
2. For the MEN bad samples, all three types of embeddings only underestimate the scores. Although it does not hold for all pairs, we do notice that quite a few pairs consist of words with different part-of-speech tags (e.g. (bed, relaxed), (automobile, parking), (morning, sunny)). In contrast, the best samples only contain pairs of words that have roughly the same part-of-speech tag. Possibly, words within the same domain but with a different part of speech are farther apart in the embedding space than words with the same part of speech. For SimLex we cannot verify this, because the two words of every pair in that data set share the same tag.
3. For the SimLex bad samples, Bow2 and Bow5 both show underestimations and overestimations, but Deps only gives overestimations. The overestimations are mainly antonyms (discussed in 1.). The underestimations are mainly pairs of words that behave similarly in a sentence, but do not necessarily belong to one topic or domain. The fact that the Deps embeddings do not show such underestimated samples may be because they are better at capturing functional similarity (Levy and Goldberg, 2014).

MEN - bad samples

<table>
<thead>
<tr><th>Bow2                              </th><th>Bow5                               </th><th>Deps                          </th></tr>
</thead>
<tbody>
<tr><td>(day, sunset), 0.24, 43.00        </td><td>(day, sunset), 0.23, 43.00         </td><td>(bed, relaxed), 0.24, 39.00   </td></tr>
<tr><td>(guitar, rock), 0.24, 39.00       </td><td>(handwriting, written), 0.22, 44.00</td><td>(leaf, nature), 0.18, 43.00   </td></tr>
<tr><td>(day, sunshine), 0.23, 42.00      </td><td>(automobile, parking), 0.24, 41.00 </td><td>(petals, rose), 0.21, 42.00   </td></tr>
<tr><td>(automobile, parking), 0.24, 41.00</td><td>(ivy, plant), 0.17, 45.00          </td><td>(pregnant, women), 0.22, 41.00</td></tr>
<tr><td>(ivy, plant), 0.18, 45.00         </td><td>(van, vehicle), 0.14, 40.00        </td><td>(parrot, wing), 0.20, 38.00   </td></tr>
<tr><td>(van, vehicle), 0.23, 40.00       </td><td>(interior, room), 0.21, 39.00      </td><td>                              </td></tr>
<tr><td>(shore, water), 0.24, 45.00       </td><td>(shore, water), 0.22, 45.00        </td><td>                              </td></tr>
<tr><td>(morning, sunny), 0.22, 40.00     </td><td>(morning, sunny), 0.23, 40.00      </td><td>                              </td></tr>
<tr><td>(bed, relaxed), 0.21, 39.00       </td><td>(bed, relaxed), 0.12, 39.00        </td><td>                              </td></tr>
<tr><td>(clothes, jean), 0.16, 39.00      </td><td>(clothes, jean), 0.09, 39.00       </td><td>                              </td></tr>
</tbody>
</table>

SimLex-999 - bad samples
<table>
<thead>
<tr><th>Bow2                               </th><th>Bow5                          </th><th>Deps                               </th></tr>
</thead>
<tbody>
<tr><td>(encourage, discourage), 0.77, 1.58</td><td>(dog, cat), 0.77, 1.75        </td><td>(dog, cat), 0.78, 1.75             </td></tr>
<tr><td>(buddy, companion), 0.16, 8.65     </td><td>(keep, possess), 0.15, 8.27   </td><td>(accept, reject), 0.77, 0.83       </td></tr>
<tr><td>(south, north), 0.83, 2.20         </td><td>(certain, sure), 0.20, 8.42   </td><td>(encourage, discourage), 0.84, 1.58</td></tr>
<tr><td>(winter, summer), 0.76, 2.38       </td><td>(satisfy, please), 0.17, 7.67 </td><td>(south, north), 0.90, 2.20         </td></tr>
<tr><td>                                   </td><td>(come, attend), 0.21, 8.10    </td><td>(cat, rabbit), 0.78, 2.37          </td></tr>
<tr><td>                                   </td><td>(begin, originate), 0.19, 8.20</td><td>(winter, summer), 0.84, 2.38       </td></tr>
<tr><td>                                   </td><td>(buddy, companion), 0.14, 8.65</td><td>(lawyer, banker), 0.77, 1.88       </td></tr>
<tr><td>                                   </td><td>(racket, noise), 0.18, 8.10   </td><td>                                   </td></tr>
<tr><td>                                   </td><td>(south, north), 0.85, 2.20    </td><td>                                   </td></tr>
<tr><td>                                   </td><td>(winter, summer), 0.84, 2.38  </td><td>                                   </td></tr>
</tbody>
</table>

MEN - best samples
<table>
<thead>
<tr><th>Bow2                               </th><th>Bow5                               </th><th>Deps                             </th></tr>
</thead>
<tbody>
<tr><td>(gold, silver), 0.79, 40.00        </td><td>(autumn, spring), 0.76, 44.00      </td><td>(guitar, piano), 0.78, 43.00     </td></tr>
<tr><td>(beef, meat), 0.77, 43.00          </td><td>(gold, silver), 0.84, 40.00        </td><td>(beetles, insects), 0.75, 40.00  </td></tr>
<tr><td>(bicycle, bike), 0.81, 45.00       </td><td>(beef, meat), 0.84, 43.00          </td><td>(gold, silver), 0.83, 40.00      </td></tr>
<tr><td>(amphibians, reptiles), 0.87, 41.00</td><td>(beef, chicken), 0.80, 38.00       </td><td>(cafe, restaurant), 0.79, 42.00  </td></tr>
<tr><td>(cattle, sheep), 0.83, 38.00       </td><td>(bicycle, bike), 0.79, 45.00       </td><td>(aircraft, airplane), 0.76, 46.00</td></tr>
<tr><td>(carrots, potatoes), 0.81, 39.00   </td><td>(amphibians, reptiles), 0.84, 41.00</td><td>(harbor, harbour), 0.80, 40.00   </td></tr>
<tr><td>                                   </td><td>(cattle, sheep), 0.87, 38.00       </td><td>(chapel, church), 0.76, 45.00    </td></tr>
<tr><td>                                   </td><td>(daughter, son), 0.83, 41.00       </td><td>(bus, tram), 0.78, 38.00         </td></tr>
<tr><td>                                   </td><td>(pink, purple), 0.75, 38.00        </td><td>(town, village), 0.79, 43.00     </td></tr>
<tr><td>                                   </td><td>(coffee, tea), 0.76, 45.00         </td><td>(bicycle, bike), 0.83, 45.00     </td></tr>
</tbody>
</table>

SimLex-999 - best samples
<table>
<thead>
<tr><th>Bow2                              </th><th>Bow5                           </th><th>Deps                               </th></tr>
</thead>
<tbody>
<tr><td>(appoint, elect), 0.76, 8.17      </td><td>(stupid, dumb), 0.75, 9.58     </td><td>(inform, notify), 0.82, 9.25       </td></tr>
<tr><td>(investigate, examine), 0.77, 8.10</td><td>(movie, film), 0.82, 8.87      </td><td>(orthodontist, dentist), 0.78, 8.27</td></tr>
<tr><td>(vanish, disappear), 0.75, 9.80   </td><td>(analyze, evaluate), 0.76, 8.03</td><td>(clarify, explain), 0.79, 8.33     </td></tr>
<tr><td>(protect, defend), 0.76, 9.13     </td><td>(large, huge), 0.76, 9.47      </td><td>(attorney, lawyer), 0.79, 9.35     </td></tr>
<tr><td>(acquire, obtain), 0.76, 8.57     </td><td>                               </td><td>(investigate, examine), 0.81, 8.10 </td></tr>
<tr><td>(movie, film), 0.82, 8.87         </td><td>                               </td><td>(stupid, dumb), 0.80, 9.58         </td></tr>
<tr><td>(analyze, evaluate), 0.80, 8.03   </td><td>                               </td><td>(achieve, accomplish), 0.82, 8.57  </td></tr>
<tr><td>(large, huge), 0.78, 9.47         </td><td>                               </td><td>(crucial, important), 0.76, 8.82   </td></tr>
<tr><td>                                  </td><td>                               </td><td>(vanish, disappear), 0.84, 9.80    </td></tr>
<tr><td>                                  </td><td>                               </td><td>(protect, defend), 0.78, 9.13      </td></tr>
</tbody>
</table>

In [56]:
if create_figures:
    # Number of rows shown per table.
    # NOTE(review): the tables pasted into the report above show up to 10
    # rows; confirm which limit is intended.
    max_rows = 5
    sample_tables = [
        ("MEN - bad samples", all_men_bad_samples),
        ("SimLex-999 - bad samples", all_simlex_bad_samples),
        ("MEN - best samples", all_men_best_samples),
        ("SimLex-999 - best samples", all_simlex_best_samples),
    ]
    for title, samples in sample_tables:
        print(title)
        # One column per embedding model; zip_longest pads the shorter
        # columns with None so that every row has three cells.
        table = list(itertools.zip_longest(samples["bow2"],
                     samples["bow5"], samples["deps"]))[:max_rows]
        # 'fancy' is not a tabulate format name; "html" matches the
        # correlation tables printed earlier in this notebook.
        print(tabulate(table,
                       headers=['Bow2', 'Bow5', 'Deps'], tablefmt='html'))