# Unsupervised Language Learning, Lab1

## Adriaan de Vries (10795227), Verna Dankers (10761225)

A short write-up about the first assignment goes here.

Before running this code, please install and import the following libraries and set the paths below to point to the datasets. Afterwards, the code should run without issues.

In [None]:
# Requirements
import os
from collections import defaultdict, Counter
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
from gensim.corpora.dictionary import Dictionary
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from scipy.cluster.vq import kmeans, whiten
from scipy.stats import spearmanr, pearsonr
from tabulate import tabulate
from tqdm import tqdm

# Paths to datasets
bow2_filename = "data/bow2.words"
bow5_filename = "data/bow5.words"
deps_filename = "data/deps.words"
simlex_filename = "data/SimLex-999.txt"
men_filename = "data/men/MEN_dataset_natural_form_full"
analogy_filename = "data/questions-words.txt"
common_words_filename = "data/common_words.words"


def _load_embeddings(words_filename):
    """Convert a ``.words`` vector file to word2vec text format and load it.

    The converted file is written next to the input, with the final
    extension replaced by ``.txt``.

    Args:
        words_filename: path to the vector file to convert.

    Returns:
        The loaded ``KeyedVectors`` after ``init_sims(replace=True)``.
    """
    # splitext only strips the last extension, so dots elsewhere in the path
    # (e.g. "../data/bow2.words") do not truncate the output filename.
    word2vec_filename = os.path.splitext(words_filename)[0] + ".txt"
    glove2word2vec(words_filename, word2vec_filename)
    vectors = KeyedVectors.load_word2vec_format(word2vec_filename, binary=False)
    # NOTE(review): per the gensim docs this normalises the vectors in place;
    # confirm against the installed gensim version.
    vectors.init_sims(replace=True)
    return vectors


bow2 = _load_embeddings(bow2_filename)
bow5 = _load_embeddings(bow5_filename)
deps = _load_embeddings(deps_filename)

### 1. Collect and examine the word embeddings

In [None]:
# Print the top-ranked neighbour of a few probe words: first one distinct word
# per embedding model, then the same word ('sudoku') for all three models.
probes = [
    (deps, 'reddish'),
    (bow5, 'cop'),
    (bow2, 'tissue'),
    (deps, 'sudoku'),
    (bow5, 'sudoku'),
    (bow2, 'sudoku'),
]
for model, word in probes:
    # Only the first entry of the returned neighbour list is shown.
    print(model.most_similar(positive=[word])[0])

### 2. Collect the SimLex and MEN data to evaluate the quality of the word embeddings

In [None]:
def score_men(men, embeddings):
    """Pair embedding similarities with the MEN gold ratings.

    Args:
        men: mapping from a word pair (indexable, two words) to its gold score.
        embeddings: object exposing ``similarity(word1, word2)``. Pairs for
            which that call raises ``KeyError`` are skipped.

    Returns:
        tuple: (list of embedding similarities, list of gold scores), aligned
        by index and ordered like the iteration order of ``men``.
    """
    predicted = []
    gold = []
    for pair in men:
        try:
            similarity = embeddings.similarity(pair[0], pair[1])
        except KeyError:
            # At least one word of the pair is unknown to the embeddings.
            continue
        else:
            predicted.append(similarity)
            gold.append(men[pair])
    return (predicted, gold)

def score_simlex(simlex, name, embeddings):
    """Pair embedding similarities with one SimLex score column.

    Args:
        simlex: mapping from a word pair to a dict of per-pair annotations.
        name: key of the annotation to use as the gold score.
        embeddings: object exposing ``similarity(word1, word2)``.

    Returns:
        tuple: (list of embedding similarities, list of gold scores). The two
        lists are always the same length and aligned by index. A pair is
        skipped entirely when either the similarity or the gold lookup raises
        ``KeyError``.
    """
    predicted = []
    gold = []
    for pair in simlex:
        try:
            # Resolve both values before appending anything, so a failed gold
            # lookup cannot leave an orphan similarity behind.
            similarity = embeddings.similarity(pair[0], pair[1])
            rating = simlex[pair][name]
        except KeyError:
            continue
        predicted.append(similarity)
        gold.append(rating)
    return (predicted, gold)

def score_simlex_pos(simlex, name, embeddings):
    """Pair embedding similarities with SimLex scores, grouped by POS tag.

    Args:
        simlex: mapping from a word pair to a dict of per-pair annotations,
            including a ``"POS"`` entry.
        name: key of the annotation to use as the gold score.
        embeddings: object exposing ``similarity(word1, word2)``.

    Returns:
        tuple: (similarities, gold), two ``defaultdict(list)`` objects keyed
        by POS tag. Both always hold the same keys with index-aligned lists.
        A pair is skipped when any lookup raises ``KeyError``, and a POS tag
        appears only if at least one of its pairs was scored.
    """
    predicted = defaultdict(list)
    gold = defaultdict(list)
    for pair in simlex:
        try:
            # Look everything up first. Indexing a defaultdict creates the
            # key, so appending inline would leave an empty list behind for a
            # tag whose pairs are all missing from the embeddings.
            pos = simlex[pair]["POS"]
            similarity = embeddings.similarity(pair[0], pair[1])
            rating = simlex[pair][name]
        except KeyError:
            continue
        predicted[pos].append(similarity)
        gold[pos].append(rating)
    return (predicted, gold)

simlex = dict()
men = dict()

# SimLex: one header line, then one whitespace-separated row per word pair.
with open(simlex_filename, 'r') as f:
    # Drop the two word columns. headers[0] then names the third column, which
    # is kept as a string; the remaining headers name the numeric columns.
    headers = f.readline().split()[2:]
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        annotations = {
            header: float(score)
            for header, score in zip(headers[1:], fields[3:])
        }
        annotations[headers[0]] = fields[2]
        simlex[(fields[0], fields[1])] = annotations

# MEN: rows of "word1 word2 score" without a header line.
with open(men_filename, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            continue
        men[(fields[0], fields[1])] = float(fields[2])

In [None]:
# One scatter plot per embedding model: SimLex999 gold score (x) against
# cosine similarity (y), with one point series per POS tag.
figure = plt.figure()
figure.set_size_inches(15, 5)
models = [(bow2, 'bow2'), (bow5, 'bow5'), (deps, 'deps')]
for panel, (data, name) in enumerate(models, start=1):
    # score_simlex_pos returns (similarities, gold), both keyed by POS tag.
    yplot, xplot = score_simlex_pos(simlex, "SimLex999", data)

    plt.subplot(1, 3, panel)
    for pos in xplot:
        # Colours are left to matplotlib's default cycle.
        plt.scatter(xplot[pos], yplot[pos], alpha=0.3, label=pos)
    plt.legend()
    plt.xlabel("SimLex999")
    plt.ylabel("Cosine Similarity")
    plt.title(name)
plt.show()
    

In [None]:
# One scatter plot per embedding model: MEN gold score (x) against the
# embedding similarity (y).
figure = plt.figure()
figure.set_size_inches(15, 5)
men_models = [(bow2, 'bow2'), (bow5, 'bow5'), (deps, 'deps')]
for panel, (data, name) in enumerate(men_models, start=1):
    # score_men returns (similarities, gold scores).
    similarities, gold_scores = score_men(men, data)
    plt.subplot(1, 3, panel)
    plt.scatter(gold_scores, similarities, alpha=0.3)
    plt.xlabel("MEN")
    plt.ylabel("Cosine Similarity")
    plt.title(name)
plt.show()

### 3. Pearson's $r$ and Spearman's $\rho$

In [None]:
def _correlation_row(embedding_name, gold_name, predicted, gold):
    """Build one results-table row for a (embeddings, gold standard) pairing.

    Returns:
        tuple: (embedding name, gold-standard name, then the first two
        elements of the ``spearmanr`` result, then the first two elements of
        the ``pearsonr`` result). The table headers used when printing label
        these as coefficient and p-value.
    """
    spearman = spearmanr(predicted, gold)
    pearson = pearsonr(predicted, gold)
    return (embedding_name, gold_name,
            spearman[0], spearman[1], pearson[0], pearson[1])


# `results` holds the rows over all pairs; `results_pos` holds the SimLex rows
# split per POS tag.
results = []
results_pos = []
for data, name in [(bow2, 'bow2'), (bow5, 'bow5'), (deps, 'deps')]:
    # MEN
    embed_results, gold = score_men(men, data)
    results.append(_correlation_row(name, "MEN", embed_results, gold))

    # SIMLEX
    embed_results, gold = score_simlex(simlex, "SimLex999", data)
    results.append(_correlation_row(name, "SimLex", embed_results, gold))

    # SIMLEX per POS tag
    embed_results, gold = score_simlex_pos(simlex, "SimLex999", data)
    for POS in embed_results:
        results_pos.append(_correlation_row(
            name, "SimLex + {}".format(POS), embed_results[POS], gold[POS]))

In [None]:
# Both tables share the same column layout, so the header row is defined once.
headers = ["Embeddings", "Gold standard", "Spearman's r",
           "Spearman p-value", "Pearson's r", "Pearson p-value"]
tables = (
    ("Correlation Coefficients for all pairs in the data", results),
    ("Correlation Coefficients per POS tag", results_pos),
)
for caption, rows in tables:
    print(caption)
    print(tabulate(rows, headers=headers, tablefmt="fancy_grid"))

### 4. Analogy Task

In [None]:
# Parse the analogy file into {topic: [tuple of words per line, ...]}.
# Lines starting with ":" open a new topic; every other non-blank line is
# stored under the most recently seen topic.
analogies = defaultdict(list)
with open(analogy_filename, 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line would otherwise be stored as an empty tuple and
            # break the four-way unpacking done when scoring the analogies.
            continue
        if line[0] == ":":
            topic = fields[-1]
        else:
            analogies[topic].append(tuple(fields))

In [None]:
def _resolve_word(word, embeddings):
    """Return `word` as found in `embeddings`, trying lowercase second, else None."""
    if word in embeddings:
        return word
    lowered = word.lower()
    return lowered if lowered in embeddings else None


def _accuracy(correct, total):
    """Return correct / total, or NaN when nothing was evaluated."""
    return correct / total if total else float("nan")


def analogy_task(analogies, embeddings):
    """Calculate per-topic and overall accuracy on an analogy task.

    For every analogy (a, a*, b, b*) whose four words are all present in the
    embeddings (after an optional lowercase fallback), the vector
    ``b + (a* - a)`` is normalised and its nearest word is compared to b*.

    Args:
        analogies: dictionary with topics as keys and a list of 4-tuples of
            words (a, a*, b, b*) as values.
        embeddings: object supporting ``word in embeddings``,
            ``embeddings[word]`` and ``similar_by_vector(vector, topn=...)``.

    Returns:
        list of (topic, accuracy) tuples in the iteration order of
        ``analogies``, followed by a final ``("All topics", accuracy)`` entry.
        Accuracy is NaN when none of the analogies concerned could be
        evaluated.
    """
    results = []
    all_correct = 0
    all_analogies_in_vectors = 0
    for topic in analogies:
        correct = 0
        analogies_in_vectors = 0
        for (a, a_star, b, b_star) in analogies[topic]:
            # If words with capital letters are not in the vectors, try to
            # lowercase; skip the analogy if any word is still unknown.
            words = [_resolve_word(word, embeddings)
                     for word in (a, a_star, b, b_star)]
            if None in words:
                continue
            a, a_star, b, b_star = words

            # Get the vector closest to the calculated analogy vector
            analogies_in_vectors += 1
            b_star_embed = embeddings[b] + (embeddings[a_star] - embeddings[a])
            b_star_embed = b_star_embed / np.linalg.norm(b_star_embed)
            closest = embeddings.similar_by_vector(b_star_embed, topn=2)
            b_star_approx = closest[0][0]
            # NOTE(review): only `b` is excluded from the candidates; `a` and
            # `a_star` can still be returned as the answer. Confirm that this
            # is the intended evaluation protocol.
            if b_star_approx == b:
                b_star_approx = closest[1][0]
            if b_star_approx == b_star:
                correct += 1
        all_correct += correct
        all_analogies_in_vectors += analogies_in_vectors
        results.append((topic, _accuracy(correct, analogies_in_vectors)))
    results.append(("All topics",
                    _accuracy(all_correct, all_analogies_in_vectors)))
    return results

        
# Disabled alternative: load a separate binary embedding file and evaluate it
# instead of bow2.
# sghs = KeyedVectors.load_word2vec_format("../honours/SGHS_d1000_i15_w25_annotated.bin",
#                                          binary=True)
# sghs.init_sims(replace=True)

# Evaluate the bow2 embeddings on the analogy task and print one row per topic.
analogy_headers = ['Topic', 'Accuracy']
results = analogy_task(analogies, bow2)
print(tabulate(results, headers=analogy_headers, tablefmt="fancy_grid"))

### 5. Clustering

In [None]:
# Read the common words (first token of each line) and plot a 2-D t-SNE
# projection of their bow5 embeddings.
common_words = []
with open(common_words_filename, 'r') as f:
    for line in f:
        entry = line.strip()
        # 'fig.' is a word, changing it to 'fig' here. Stripping the line
        # first also covers a final line that has no trailing newline.
        if entry.endswith('.'):
            entry = entry[:-1]
        fields = entry.split()
        if not fields:
            # Skip blank lines instead of failing on an index lookup.
            continue
        common_words.append(fields[0])
data = [bow5[key] for key in tqdm(common_words)]
# NOTE(review): TSNE is not imported anywhere in the visible source;
# presumably `from sklearn.manifold import TSNE` is needed. Confirm and add it
# to the requirements cell.
embedding = TSNE()
result = embedding.fit_transform(data)
plt.scatter(result[:, 0], result[:, 1])
plt.title('t-SNE plot of the embeddings of the 1999 common words using bow5')
plt.show()

In [None]:
# Elbow plot: run k-means on the whitened embeddings for k = 2..39 and plot
# the second value returned by kmeans (the distortion, per the scipy docs)
# against k.
data = np.array(data)
whitened_data = whiten(data)
x = []
y = []
for k in tqdm(range(2, 40)):
    centroids, error = kmeans(whitened_data, k)
    x.append(k)
    y.append(error)
plt.plot(x, y)
plt.xlabel("Number of clusters k")
plt.ylabel("Distortion")
plt.title("k-means on whitened bow5 embeddings")
plt.show()

In [None]:
# Elbow plot on the raw (non-whitened) embeddings, for comparison with the
# whitened run: k-means for k = 2..39, plotting the second value returned by
# kmeans (the distortion, per the scipy docs) against k.
# The whitened copy is not recomputed here because this cell never uses it.
data = np.array(data)
x = []
y = []
for k in tqdm(range(2, 40)):
    centroids, error = kmeans(data, k)
    x.append(k)
    y.append(error)
plt.plot(x, y)
plt.xlabel("Number of clusters k")
plt.ylabel("Distortion")
plt.title("k-means on raw bow5 embeddings")
plt.show()