# Unsupervised Language Learning, Lab1

## Adriaan de Vries (10795227), Verna Dankers (10761225)

Before running this code, make sure the libraries imported below are installed and set the paths below to the locations of the datasets on your machine. After that, the code should run without issues.

## 5. Clustering

In [None]:
%matplotlib notebook
# Requirements
from tqdm import tqdm
from pprint import pprint
from collections import defaultdict, Counter
from scipy.stats import spearmanr, pearsonr
from tabulate import tabulate
from gensim.models import KeyedVectors
from gensim.corpora.dictionary import Dictionary
from gensim.scripts.glove2word2vec import glove2word2vec
from sklearn.manifold import TSNE
from scipy.cluster.vq import whiten, kmeans
from sklearn.mixture import GaussianMixture
from os.path import isfile
import numpy as np
import os
import matplotlib.pyplot as plt

# Paths to datasets
# Word-embedding files. Each one is converted to a word2vec-format ".txt"
# copy stored next to it before being loaded.
bow2_filename = "data/bow2.words"
bow5_filename = "data/bow5.words"
deps_filename = "data/deps.words"
# Evaluation datasets. They are not referenced by the cells shown here
# (presumably used by other parts of the lab -- TODO confirm).
simlex_filename = "data/SimLex-999.txt"
men_filename = "data/men/MEN_dataset_natural_form_full"
analogy_filename = "data/questions-words.txt"
# Text file whose first token on every line is a word to cluster.
common_words_filename = "data/common_words.words"

In [None]:
def _load_embeddings(words_filename):
    """Load one embedding file as gensim KeyedVectors.

    The raw ".words" file is first converted with glove2word2vec into a
    word2vec-format ".txt" file stored next to it; the conversion is skipped
    when that file already exists.

    Parameters:
        words_filename: path to the raw embedding file.

    Returns:
        The loaded KeyedVectors after ``init_sims(replace=True)`` has been
        applied to them.
    """
    # os.path.splitext only strips the final extension. Splitting the whole
    # path on "." would truncate at the first dot anywhere in the path
    # (e.g. "./data/bow2.words" would become "").
    w2v_filename = os.path.splitext(words_filename)[0] + ".txt"
    if not isfile(w2v_filename):
        glove2word2vec(words_filename, w2v_filename)
    vectors = KeyedVectors.load_word2vec_format(w2v_filename, binary=False)
    vectors.init_sims(replace=True)
    return vectors


bow2 = _load_embeddings(bow2_filename)
print("bow2 done.")
bow5 = _load_embeddings(bow5_filename)
print("bow5 done.")
deps = _load_embeddings(deps_filename)
print("deps done.")

In [None]:
# Read the words to cluster: the first whitespace-separated token of every
# non-empty line in the common-words file.
common_words = []
with open(common_words_filename, 'r') as f:
    for line in f:
        text = line.rstrip("\n")
        # 'fig.' is a word, changing it to 'fig' here.
        # Using endswith() instead of indexing line[-2] keeps this safe for
        # blank lines and for a final line without a trailing newline.
        if text.endswith('.'):
            text = text[:-1]
        tokens = text.split()
        # Skip lines that contain no word at all instead of crashing.
        if tokens:
            common_words.append(tokens[0])

# Look up the bow5 vector of every common word, in file order.
data = [bow5[key] for key in tqdm(common_words)]


In [None]:
# Model selection for the number of clusters: fit a Gaussian mixture for
# every k in 2..39 on the whitened bow5 vectors and record AIC and BIC.
data = np.array(data)
# whiten() already returns an ndarray, so it is passed to the model as-is.
whitened_data = whiten(data)
x = []
aic = []
bic = []
for k in tqdm(range(2, 40)):
    # Only the information criteria are needed here, so no cluster
    # assignment is predicted for the intermediate models.
    gmm = GaussianMixture(n_components=k, n_init=4).fit(whitened_data)
    aic.append(gmm.aic(whitened_data))
    bic.append(gmm.bic(whitened_data))
    x.append(k)

# Plot both criteria against the number of mixture components.
plt.plot(x, aic, label = 'AIC')
plt.plot(x, bic, label = 'BIC')
plt.xlabel("Number of clusters")
plt.legend()
plt.show()

In [None]:
def cluster_and_TSNE(embeddings, words, n_clusters=27):
    """Cluster the given words and project their vectors with t-SNE.

    Parameters:
        embeddings: mapping from a word to its vector (indexed as
            ``embeddings[word]``).
        words: iterable of words to look up in ``embeddings``.
        n_clusters: number of Gaussian mixture components to fit.

    Returns:
        A tuple ``(labels, TSNE_fit)``: the mixture component predicted for
        every word, and the output of ``TSNE().fit_transform`` on the same
        whitened vectors, both in the order of ``words``.
    """
    vectors = [embeddings[word] for word in words]
    whitened = whiten(vectors)

    # Mixture model with 10 restarts; fit() returns the fitted estimator.
    mixture = GaussianMixture(n_components=n_clusters, n_init=10)
    mixture = mixture.fit(np.array(whitened))
    cluster_ids = mixture.predict(whitened)

    # The projection is computed from the whitened vectors as well.
    projection = TSNE().fit_transform(whitened)
    return cluster_ids, projection


# Cluster the common words and compute a t-SNE projection once per
# embedding type (bow2, bow5, deps), keeping the results in that order.
label_lists, TSNE_fits = [], []
for embedding in tqdm([bow2, bow5, deps]):
    labels, TSNE_fit = cluster_and_TSNE(embedding, common_words)
    label_lists += [labels]
    TSNE_fits += [TSNE_fit]

# One list per embedding of (<cluster label>, <x>, <y>, <word>) tuples,
# sorted so that points of the same cluster are adjacent.
finaldata = [
    sorted(zip(cluster_ids, projection[:, 0], projection[:, 1], common_words))
    for cluster_ids, projection in zip(label_lists, TSNE_fits)
]


In [None]:
# Compare, across the embeddings, the cluster that contains the word
# 'father'. Every entry of finaldata is a list of
# (<cluster label>, <x>, <y>, <word>) tuples.
father_cluster_labels = []
father_clusters = []
for points in finaldata:
    # Look the label up per embedding, so a missing or repeated 'father'
    # entry cannot shift the labels of the other embeddings.
    matching_labels = [point[0] for point in points if point[3] == 'father']
    if not matching_labels:
        raise ValueError("'father' is not among the clustered words")
    father_label = matching_labels[0]
    father_cluster_labels.append(father_label)
    # All words that share a cluster with 'father' in this embedding.
    father_clusters.append(
        {point[3] for point in points if point[0] == father_label}
    )

# Words that are in the 'father' cluster of every embedding.
intersection = father_clusters[0]
for cluster in father_clusters[1:]:
    intersection = intersection & cluster

# Per embedding: the words of its 'father' cluster that are not shared by all.
remainders = [cluster - intersection for cluster in father_clusters]

pprint([sorted(words) for words in remainders])

pprint(intersection)

In [None]:
def plot_tsne_clusters(data, labels, max_groups_to_plot = 10**4):
    """
    Plot clustered 2-D points as scatter plots, one column per dataset.

    The top row shows the plain scatter plot of every dataset; the bottom
    row shows the same points with each one annotated with its word.

    Parameters:
        data: list of datasets; each dataset is a list of tuples
            (<cluster label>, <x>, <y>, <word>). Points with the same
            cluster label must be adjacent (e.g. the list is sorted),
            because consecutive runs of equal labels form one group.
        labels: list of equal length to data, giving the titles of the
            plots in the top row.
        max_groups_to_plot: only the first this many groups of every
            dataset are drawn.

    Returns:
        None. The figure is displayed with plt.show().
    """
    # Local import keeps this function self-contained.
    from itertools import groupby

    figure = plt.figure()
    figure.set_size_inches(15, 10)
    n_columns = len(data)

    for column, (points, title) in enumerate(zip(data, labels)):
        # Consecutive points sharing a cluster label form one group. An
        # empty dataset simply yields no groups instead of raising.
        groups = [
            list(members)
            for _, members in groupby(points, key=lambda point: point[0])
        ]
        groups = groups[:max_groups_to_plot]

        # Top row: unannotated clusters. The title is set once per subplot,
        # so it is shown even when no group is drawn.
        plt.subplot(2, n_columns, column + 1)
        plt.title(title)
        for group in groups:
            # Read the coordinates directly from the tuples rather than via
            # a mixed-type (string) array.
            xs = [float(point[1]) for point in group]
            ys = [float(point[2]) for point in group]
            plt.scatter(xs, ys, alpha = 0.5)

        # Bottom row: same scatter plot, with every point labelled.
        plt.subplot(2, n_columns, column + 1 + n_columns)
        for group in groups:
            xs = [float(point[1]) for point in group]
            ys = [float(point[2]) for point in group]
            plt.scatter(xs, ys, alpha = 0.5)
            for point, x_coord, y_coord in zip(group, xs, ys):
                plt.annotate(point[3], xy=(x_coord, y_coord), xytext=(0, 0), textcoords='offset points')
    plt.show()
    

# Plot the three embeddings in adjacent columns; the third argument
# (max_groups_to_plot) limits every plot to its first 5 cluster groups.
plot_tsne_clusters(finaldata, ['bow2', 'bow5', 'deps'], 5)