### Assignment 4

**Submission deadlines**:

* get at least 4 points by Tuesday, 12.05.2022
* remaining points: last lab session before or on Tuesday, 19.05.2022

**Points:** Aim to get 12 out of 15+ possible points

All needed data files are on Drive: <https://drive.google.com/drive/folders/1HaMbhzaBxxNa_z_QJXSDCbv5VddmhVVZ?usp=sharing> (or will be soon :) )

## Task 1 (5 points)

Implement simplified word2vec with negative sampling from scratch (using pure numpy). Assume that in the training data objects and contexts are given explicitly, one pair per line, and objects are on the left. The result of the training should be object vectors. Please write them to a file using the *natural* text format, i.e.:

<pre>
word1 x1_1 x1_2 ... x1_N 
word2 x2_1 x2_2 ... x2_N
...
wordK xK_1 xK_2 ... xK_N
</pre>

Use the loss from Slide 3 in Lecture NLP.2, compute the gradient manually. You can use some gradient clipping, or regularisation. 

**Remark**: the data is specially prepared to make the learning process easier. 
Present vectors using the code below. In this task we define success as 'obtaining a result which looks definitely not random'


In [None]:
import contextlib

@contextlib.contextmanager
def ignored(*exceptions):
    """Context manager that silently discards the listed exception types.

    Any exception raised inside the ``with`` body that is an instance of one
    of ``exceptions`` ends the body quietly; every other exception propagates
    unchanged.
    """
    try:
        yield
    except exceptions:
        # Swallow the matching exception and leave the ``with`` block normally.
        return

In [None]:
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from numpy.random import choice

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of a scalar or array.

    Computed as exp(-log(1 + exp(-x))) via ``np.logaddexp`` so that large
    negative inputs do not overflow ``exp`` (the naive form emits an overflow
    warning, or raises under a strict ``np.errstate``).
    """
    return np.exp(-np.logaddexp(0, -x))

def logsig(x):
    """Return log(sigmoid(x)) of a scalar or array in a numerically stable way.

    log(sigmoid(x)) equals -log(1 + exp(-x)); evaluating it with
    ``np.logaddexp`` avoids the ``log(0) = -inf`` that the two-step
    ``np.log(sigmoid(x))`` produces for very negative ``x``.
    """
    return -np.logaddexp(0, -x)


class Word2Vec:
    """Simplified word2vec with negative sampling over explicit (object, context) pairs.

    ``corpus`` is an iterable of token sequences; only items that unpack into
    exactly two values are used, the first being the object and the second
    its context. Objects and contexts share one vocabulary.

    Attributes set by the constructor:
        w2i / i2w: token -> id and id -> token maps.
        objects:   object id -> sampling probability (counts smoothed with
                   the 0.75 power and normalised).
        W1:        array of shape (vlen, vocabulary); column ``i`` is the
                   trained vector of token ``i``.
        W2:        array of shape (vocabulary, vlen); row ``i`` is the
                   context-side vector of token ``i``.
    """

    def __init__(self, corpus, vlen = 5):
        """Build the vocabulary, the sampling distribution and random weights.

        Args:
            corpus: re-iterable collection of (object, context) token pairs.
            vlen: dimensionality of the embedding vectors.
        """
        self.corpus = corpus

        self.w2i = {}
        self.i2w = {}
        self.words = set(w for pair in self.iter(corpus) for w in pair)
        for i, word in enumerate(self.words):
            self.w2i[word] = i
            self.i2w[i] = word

        # Count how often each token occurs on the object (left) side.
        counts = defaultdict(int)
        for w, _ in self.iter(corpus):
            counts[self.w2i[w]] += 1

        self.objects = self.__counts_to_distribution(counts)
        self.W1 = np.random.uniform(-1, 1, (vlen, len(self.words)))
        self.W2 = np.random.uniform(-1, 1, (len(self.words), vlen))

        # Keep ids and probabilities as parallel sequences: the keys of
        # ``objects`` are vocabulary ids, not positions 0..len(objects)-1,
        # so sampling has to map back to the real ids.
        self.object_ids = np.array(list(self.objects), dtype=np.intp)
        self.dist = list(self.objects.values())


    def __counts_to_distribution(self, words):
        """Turn raw counts into probabilities proportional to frequency ** 0.75."""
        word_count = sum(words.values())
        words = {k : (v / word_count) ** 0.75 for k, v in words.items()}
        word_count = sum(words.values())
        words = {k : v / word_count for k, v in words.items()}
        return words


    def iter(self, corpus):
        """Yield ``(object, context)`` for every item of ``corpus`` that unpacks into two values.

        Items with a different number of elements are skipped.
        """
        for sentence in corpus:
            # Only the unpacking is guarded, so a ValueError raised by the
            # consumer of this generator is never swallowed here.
            try:
                w, v = sentence
            except ValueError:
                continue
            yield w, v


    def sample(self, size = 1):
        """Draw ``size`` vocabulary ids to use as negative samples.

        Ids are drawn from the smoothed object distribution and can be used
        directly as row indices of ``W2``.
        """
        # NOTE(review): negatives are drawn from object-side counts; if objects
        # and contexts use disjoint tokens, sampling from context counts may be
        # what is intended - confirm against the training data.
        return choice(self.object_ids, size=size, p=self.dist)


    def word2onehot(self, word):
        """Return the one-hot vector of ``word`` over the vocabulary."""
        return self.idx2onehot(self.w2i[word])


    def idx2onehot(self, widx):
        """Return a vocabulary-sized float vector with a single 1 at ``widx``."""
        onehot = np.zeros(len(self.words))
        onehot[widx] = 1
        return onehot


    def train(self, α = 0.01):
        """Run one SGD pass over the corpus and return the summed loss.

        For every pair, 5 negative ids are sampled and the true context is
        appended as the positive example; ``W1`` and ``W2`` are updated in
        place.

        Args:
            α: learning rate.

        Returns:
            Sum of the negative-sampling loss terms, each evaluated before
            its own update step.
        """
        loss = 0
        for word, c_pos in tqdm(self.iter(self.corpus)):
            word = self.W1.T[self.w2i[word]]  # view: in-place ops write into W1
            c_pos = self.w2i[c_pos]

            # The object vector is frozen for the whole pair so that every
            # sample sees the same input.
            h = word.copy()
            for c in list(self.sample(5)) + [c_pos]:
                cvec = self.W2[c]  # view: in-place ops write into W2
                # A negative that collides with the true context is treated
                # as a positive example.
                label = 1.0 if c == c_pos else 0.0
                score = cvec @ h
                loss -= logsig((2 * label - 1) * score)
                step = α * (sigmoid(score) - label)
                # Update the word with the context vector as it was *before*
                # this step; only then move the context vector itself.
                word -= step * cvec
                cvec -= step * h
        return loss

In [None]:
# Initialize the model.
# The corpus is materialised as a list because it is iterated several times
# (twice while building the vocabulary, once more per training pass).
from gensim.models.word2vec import LineSentence
corpus = list(LineSentence('task1_objects_contexts_polish.txt'))
# Debugging aid: uncomment to keep only the first 10000 lines.
# corpus = [x for i, x in enumerate(corpus) if i < 10000]
# 300-dimensional vectors, randomly initialised in the constructor.
w2v = Word2Vec(corpus, vlen = 300)

In [None]:
# Train the model (a single pass over the corpus) and print the summed loss.
# Interrupting the kernel stops training without a traceback: the weights
# updated so far are kept, but the loss is then not printed.
with ignored(KeyboardInterrupt):
    print(w2v.train())

5525116it [6:04:37, 252.55it/s]

27195789.277297378





In [None]:
# Save the model in the plain "word x_1 ... x_N" text format, one token per line.
# The encoding is fixed to UTF-8 so that the Polish tokens are written
# identically on every platform instead of depending on the locale default.
weights = w2v.W1.T  # one row per vocabulary id
with open("task1model", "w", encoding="utf-8") as f:
    for i, v in enumerate(weights):
        strv = " ".join(str(x) for x in v)
        f.write(f"{w2v.i2w[i]} {strv}\n")

In [None]:
# Save the model in word2vec text format: a "<vocabulary size> <dimension>"
# header line followed by one "word x_1 ... x_N" line per token.
# The encoding is fixed to UTF-8, which is what gensim's loader expects by
# default, so the Polish tokens survive the round trip on every platform.
weights = w2v.W1.T  # one row per vocabulary id
with open("task1model", "w", encoding="utf-8") as f:
    f.write(f"{weights.shape[0]} {weights.shape[1]}\n")
    for i, v in enumerate(weights):
        strv = " ".join(str(x) for x in v)
        f.write(f"{w2v.i2w[i]} {strv}\n")

In [None]:
def print_similarities(word_vectors, example_words):
    """Print the nearest neighbours of each example word.

    ``word_vectors`` must provide ``most_similar(word)`` returning an iterable
    of ``(neighbour, similarity)`` pairs. For every word a ``WORD:`` header is
    printed, then one indented line per neighbour, then a blank line.
    """
    for query in example_words:
        print ('WORD:', query)
        neighbours = word_vectors.most_similar(query)
        for neighbour, similarity in neighbours:
            print ('   ', neighbour, similarity)
        print ()

In [None]:
from gensim.models import KeyedVectors
# Reload the saved text-format vectors as gensim KeyedVectors; this loader
# expects the "<count> <dimension>" header line at the top of the file.
task1_wv = KeyedVectors.load_word2vec_format('task1model', binary=False)

In [None]:
# Query words used to eyeball the quality of the Task 1 vectors.
example_polish_words = ['rower', 'niebo', 'czarny', 'ręcznik', 'muzyka', 'sklep']
print_similarities(task1_wv, example_polish_words)

WORD: rower
    klub 0.5208747386932373
    część 0.515491783618927
    góra 0.5153363943099976
    ziemia 0.512688398361206
    forma 0.5123075842857361
    osoba 0.5111129283905029
    noga 0.5106934309005737
    informacja 0.5105332136154175
    koło 0.5104271173477173
    konstrukcja 0.5099648833274841

WORD: niebo
    charakter 0.6272304654121399
    temat 0.6225338578224182
    droga 0.6140062808990479
    element 0.6091148257255554
    świat 0.6056784391403198
    historia 0.6027140617370605
    strona 0.6017667055130005
    program 0.6017217636108398
    próba 0.6000298261642456
    obraz 0.5997411012649536

WORD: czarny
    G1_kostiumik 0.24910378456115723
    G2_szansonista 0.23791734874248505
    G1_zlew 0.23663567006587982
    G1_przemęczenie 0.23451882600784302
    AND_demel 0.23287728428840637
    AND_demonizm 0.231751948595047
    G2_zachwianie 0.22970683872699738
    pejzażysta 0.22816139459609985
    AND_rykoszetowanie 0.22522741556167603
    transportujący 0.223974153

## Task 2 (4 points)

Your task is to train the embeddings for Simple Wikipedia titles, using gensim library. As the example below shows, training is really simple:

```python
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
model = Word2Vec(sentences=common_texts, vector_size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
```
*sentences* can be a list of lists of tokens; you can also use *gensim.models.word2vec.LineSentence(source)* to create a restartable iterator from a file. At first, use [this file], which contains pairs of titles such that one article links to the other.

We say that two titles are *related* if they both contain a word (or a word bigram) which is not very popular (it occurs only in several titles). Make this definition more precise, and create a corpus which contains pairs of related titles. Make a mixture of the original corpus and the new one, then train title vectors again.

Compare these two approaches using similar code to the code from Task 1.

In [None]:
from gensim.models.word2vec import LineSentence
# Restartable iterator over the link corpus: one whitespace-split line per item.
sentences = LineSentence('simple.wiki.links.txt')

In [None]:
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
# Baseline title embeddings trained on the link pairs only (window=1,
# min_count=1 so every title keeps a vector).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - adjust if the environment is upgraded.
linked_model = Word2Vec(sentences=sentences, size=100, window=1, min_count=1, workers=8)

In [None]:
from collections import defaultdict
from re import split

def tokens(text):
    """Yield every token of every line in ``text`` (flattens one level)."""
    for line in text:
        yield from line

def related(text, rarity_ceiling = 4):
    """Yield sets of titles that share a rare word.

    Every title in ``text`` (an iterable of token lists) is split into words
    on ``_``, ``/``, ``:``, ``(`` and ``)``. A word is considered rare when it
    occurs in more than one but at most ``rarity_ceiling`` distinct titles;
    for each rare word the set of titles containing it is yielded.

    Args:
        text: iterable of lines, each an iterable of title strings.
        rarity_ceiling: largest number of distinct titles a word may occur in
            while still counting as rare.

    Yields:
        set of title strings sharing one rare word.
    """
    groups = defaultdict(set)
    for t in tokens(text):
        # Raw-string character class: same separators as before, without the
        # invalid "\(" / "\)" escapes of a plain string literal.
        for word in split(r'[_/:()]', t):
            # Adjacent or trailing separators produce empty fragments; the
            # empty string is not a word and must not relate titles.
            if word:
                groups[word].add(t)

    for g in groups.values():
        if 1 < len(g) <= rarity_ceiling:
            yield g

In [None]:
# Write every ordered pair of distinct titles from each related group, one
# pair per line. Titles contain non-ASCII characters and the file is read
# back by gensim as UTF-8, so the encoding is set explicitly instead of
# relying on the platform default.
with open("related.links.txt", "w", encoding="utf-8") as f:
    for g in related(sentences):
        f.writelines(f"{w1} {w2}\n" for w1 in g for w2 in g if w1 != w2)

In [None]:
from random import random

# Mix the two corpora position by position: for each pair of aligned lines a
# coin flip keeps either the link pair or the related pair.
# zip() stops at the shorter file, so the result has min(len) sentences and
# drops the other sentence at every position.
# NOTE(review): concatenating (and shuffling) both corpora would keep all
# pairs - confirm that this sub-sampling is the intended "mixture".
mixed_corpus = [
    linked if random() < 0.5 else related
    for linked, related in zip(LineSentence('simple.wiki.links.txt'), LineSentence('related.links.txt'))
]

# Same hyper-parameters as the link-only model, so the two are comparable.
mixed_model = Word2Vec(sentences=mixed_corpus, size=100, window=1, min_count=1, workers=8)

In [None]:
# Pick three titles (the second token of three mixed-corpus lines) and print
# their neighbours under both models, separated by a line of asterisks.
examples = [mixed_corpus[0][1], mixed_corpus[100][1], mixed_corpus[300][1]]

print_similarities(mixed_model.wv, examples)
print("*" * 50)
print_similarities(linked_model.wv, examples)

WORD: wikipedia:requests_for_deletion/requests/2017/abigail_kubeka
    enrico_intra 0.45743367075920105
    wimille 0.435638427734375
    fredenbeck 0.43357521295547485
    alpha_crateris 0.42946475744247437
    chókwè 0.41917476058006287
    sidi_m'hamed_district 0.4177285432815552
    ifk_eskilstuna 0.4080437421798706
    1147 0.40449756383895874
    kalvarija,_lithuania 0.40180152654647827
    xalapa 0.3985223174095154

WORD: category:jewish_businesspeople
    giuditta_pasta 0.45154088735580444
    list_of_members_of_the_house_of_representatives_of_the_netherlands,_1981–82 0.4196167290210724
    judge_joe_brown 0.41875016689300537
    ponte_della_paglia 0.4161037504673004
    ariadne_merione 0.4156762361526489
    category:presidents_of_tajikistan 0.41340094804763794
    file:emblem_of_the_egyptian_armed_forces.png 0.41262415051460266
    judith_estrin 0.40861380100250244
    andrey_arshavin 0.4014664888381958
    yusuke_kawabuchi 0.4002338647842407

WORD: damion_stewart
    de_haan

# Task 3 (4 points)

Suppose that we have two languages: Upper and Lower. This is an example Upper sentence:

<pre>
THE QUICK BROWN FOX JUMPS OVER THE LAZY DOG.
</pre>

And this is its translation into Lower:

<pre>
the quick brown fox jumps over the lazy dog
</pre>

You have two corpora for these languages (with different sentences). Your task is to train word embeddings for both languages together, so as to make the embeddings of words which are translations of each other as close as possible. Unfortunately, your budget allows you to prepare translations for only 1000 words (we call this set D; you have to decide which words you want to be in D).

Prepare a corpus which contains three kinds of sentences:
* Upper corpus sentences
* Lower corpus sentences
* sentences derived from Upper/Lower corpus, modified using D

There are many possible ways of doing this, for instance this one (ROT13.COM: hfr rirel fragrapr sebz obgu pbecben gjvpr: jvgubhg nal zbqvsvpngvbaf, naq jvgu rirel jbeqf sebz Q ercynprq ol vgf genafyngvba)

We define the score for an Upper WORD as  $\frac{1}{p}$, where $p$ is a position of its translation in the list of **Lower** words most similar to WORD. For instance, when most similar words to DOG are:

<pre>
WOLF, CAT, WOLVES, LION, gopher, dog
</pre>

then the score for the word DOG is 0.5. Compute the average score separately for words from D, and for words out of D (hint: if the computation takes too much time, do it for a random sample).


# Task 4 (4 points)

In this task you are asked to do two things:
1. compare the embeddings computed on small corpus (like Brown Corpus , see: <https://en.wikipedia.org/wiki/Brown_Corpus>) with the ones coming from Google News Corpus
2. Try to use other resources like WordNet to enrich the corpus, and obtain better embeddings

You can use the following code snippets:

```python
# printing tokenized Brown Corpora
from nltk.corpus import brown
for s in brown.sents():
    print(*s)

# iterating over all synsets in WordNet
from nltk.corpus import wordnet as wn

for synset_type in 'avrns': # n == noun, v == verb, ...
    for synset in list(wn.all_synsets(synset_type))[:10]:
        print (synset.definition())
        print (synset.examples())
        print ([lem.name() for lem in synset.lemmas()])
        print (synset.hypernyms()) # nodes 1 level up in ontology

# loading model and compute cosine similarity between words
from gensim.models import Word2Vec

model = Word2Vec.load('models/w2v.wordnet5.model')
print (model.wv.similarity('dog', 'cat'))
```

Embeddings will be tested using WordSim-353 dataset, the code showing the quality is in the cell below. Prepare the following corpora:
1. Tokenized Brown Corpora
2. Definitions and examples from Princeton WordNet
3. (1) and (2) together
4. (3) enriched with pseudosentences containing (a subset) of WordNet knowledge (such as 'tiger is a carnivore')

Train 4 Word2Vec models, and report the Spearman correlation between similarities based on your vectors, and similarities based on human judgements.



In [None]:
from pprint import pprint
from gensim.models.word2vec import LineSentence
from gensim.test.utils import common_texts
from gensim.models import Word2Vec
from scipy.stats import spearmanr
from nltk.corpus import brown
from nltk.corpus import wordnet
import nltk

# Fetch the two NLTK corpora used below (no re-download when already up to date).
nltk.download('brown')
nltk.download('wordnet')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Corpora

In [None]:
# Corpus 2: one "sentence" per synset, made of its usage examples followed by
# its definition, tokenised by plain whitespace splitting (punctuation stays
# attached to the neighbouring word).
# At most 11400 synsets are taken per part-of-speech tag.
# NOTE(review): 'a' and 's' are both adjective tags - verify that the two
# slices do not contain the same synsets.
WORDNET = []
for synset_type in 'avrns': # n == noun, v == verb, ...
    for synset in list(wordnet.all_synsets(synset_type))[:11400]:
        WORDNET.append( (" ".join(synset.examples() + [synset.definition()])).split() )

In [None]:
# Corpus 1: the tokenised sentences of the Brown corpus, as provided by NLTK.
BROWN = brown.sents()

In [None]:
# Corpus 3: Brown sentences followed by the WordNet definition/example sentences.
BROWNET = BROWN + WORDNET

In [None]:
def text(synset):
    """Return the name of the first lemma of ``synset``."""
    lemmas = synset.lemmas()
    return lemmas[0].name()

# Corpus 4: WordNet sentences plus one two-token pseudo-sentence
# "<noun> <its first hypernym>" per noun synset that has a hypernym.
# Each pseudo-sentence must be a list of tokens: a plain string would be
# iterated character by character by Word2Vec, so no word-level signal from
# the hypernym pairs would reach the model.
# NOTE(review): the task describes corpus 4 as corpus (3) enriched, i.e. based
# on BROWNET rather than WORDNET - confirm which base is intended.
RICHNET = WORDNET + [
    [text(synset), text(synset.hypernyms()[0])]
    for synset in wordnet.all_synsets(wordnet.NOUN)
    if synset.hypernyms()
]

### Models

In [None]:
# Model trained on the WordNet definitions/examples only: 300-dimensional
# vectors, window of 10, every token kept (min_count=1).
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 calls it `vector_size`.
wordnet2vec = Word2Vec(sentences=WORDNET, size=300, window=10, min_count=1, workers=8)
print("Corpus size:", len(WORDNET))
pprint(wordnet2vec.wv.most_similar('fish'))

Corpus size: 49221
[('tentacles', 0.997155487537384),
 ('spiny', 0.9969766139984131),
 ('thin', 0.9969537258148193),
 ('America', 0.9967796206474304),
 ('western', 0.9962552785873413),
 ('tan', 0.9962497353553772),
 ('ray', 0.995707631111145),
 ('purple', 0.9956783652305603),
 ('horny', 0.9955117702484131),
 ('spine', 0.9954314827919006)]


In [None]:
# Model trained on the Brown corpus only, with the same hyper-parameters as
# the other Task 4 models.
brown2vec = Word2Vec(sentences=BROWN, size=300, window=10, min_count=1, workers=8)
print("Corpus size:", len(BROWN))
pprint(brown2vec.wv.most_similar('fish'))

Corpus size: 57340
[('medicine', 0.9726433753967285),
 ('presumably', 0.9709492921829224),
 ('edges', 0.9704681634902954),
 ('neatly', 0.9684012532234192),
 ('Chromspun', 0.9677953124046326),
 ('beveled', 0.9674861431121826),
 ('Rio', 0.9673547744750977),
 ('Democrats', 0.9670670628547668),
 ('troops', 0.9669222235679626),
 ('Eastman', 0.9665273427963257)]


In [None]:
# Model trained on Brown + WordNet sentences, with the same hyper-parameters
# as the other Task 4 models.
brownet2vec = Word2Vec(sentences=BROWNET, size=300, window=10, min_count=1, workers=8)
print("Corpus size:", len(BROWNET))
pprint(brownet2vec.wv.most_similar('fish'))

Corpus size: 106561
[('horny', 0.9684784412384033),
 ('wild', 0.9677807092666626),
 ('spiked', 0.9676572680473328),
 ('narrow', 0.9660624265670776),
 ('scallop', 0.9634495377540588),
 ('flowers', 0.9616557359695435),
 ('dried', 0.961398720741272),
 ('damp', 0.9613034725189209),
 ('banded', 0.9612137079238892),
 ('wood', 0.9609308242797852)]


In [None]:
# Model trained on the corpus enriched with hypernym pseudo-sentences, with
# the same hyper-parameters as the other Task 4 models.
richnet2vec = Word2Vec(sentences=RICHNET, size=300, window=10, min_count=1, workers=8)
print("Corpus size:", len(RICHNET))
pprint(richnet2vec.wv.most_similar('fish'))

Corpus size: 123610
[('markings;', 0.9641100764274597),
 ('dinosaur', 0.9605161547660828),
 ('tail;', 0.9597213268280029),
 ('wings', 0.9528306722640991),
 ('yellow', 0.9527287483215332),
 ('toes', 0.9514091610908508),
 ('thick', 0.9513073563575745),
 ('tinged', 0.9508973360061646),
 ('feline', 0.9507126212120056),
 ('tailless', 0.9502142667770386)]


In [None]:
import gensim.downloader as api
# Pre-trained Google News vectors, used as the large-corpus reference.
gn = api.load("word2vec-google-news-300")
# The downloader returns the vectors object itself, so it is queried directly;
# going through `.wv` on it is deprecated (it triggers the warning captured
# below this cell) and no longer exists in gensim 4.
gn.most_similar("fish")

  This is separate from the ipykernel package so we can avoid doing imports until


[('trout', 0.7780654430389404),
 ('catfish', 0.7453915476799011),
 ('striped_bass', 0.7439615726470947),
 ('fishes', 0.7301375865936279),
 ('salmon', 0.7214174270629883),
 ('sturgeon', 0.7204243540763855),
 ('rainbow_trout', 0.7189892530441284),
 ('freshwater_fish', 0.7125004529953003),
 ('brown_trout', 0.7081860303878784),
 ('mussels', 0.704771101474762)]

### Correlation


In [None]:
import numpy as np
from gensim import matutils  # utility fnc for pickling, common scipy operations etc

def cosine_similarity(vec1, vec2):
    """Return the dot product of the two vectors after ``matutils.unitvec`` scaling."""
    unit1 = matutils.unitvec(vec1)
    unit2 = matutils.unitvec(vec2)
    return np.dot(unit1, unit2)

In [None]:
# Code for computing correlation between W2V similarity, and human judgements
def correlation(model):
    """Print the Spearman correlation between model and human similarity scores.

    For each of the 'relatedness' and 'similarity' gold-standard files
    (``task4_wordsim_<type>_goldstandard.txt``, one ``word1 word2 score`` line
    per pair) the cosine similarity of the two word vectors is compared with
    the human score. Malformed lines and pairs with a word missing from the
    vocabulary are skipped.

    Args:
        model: a trained model exposing its vectors as ``model.wv``, or a
            vectors object that can be indexed by word directly.
    """
    # Accept both a full model and bare keyed vectors (which have no `.wv`
    # in recent gensim versions).
    vectors = getattr(model, 'wv', model)
    for similarity_type in ['relatedness', 'similarity']:
        human = []; w2v = []
        with open(f'task4_wordsim_{similarity_type}_goldstandard.txt') as gold:
            for x in gold:
                try:
                    a, b, val = x.split()
                    # Scores must be numeric: ranking the raw strings would
                    # order them lexicographically ("10.00" before "2.50").
                    score = float(val)
                    similarity = cosine_similarity(vectors[a], vectors[b])
                except (ValueError, KeyError):
                    # ValueError: malformed line; KeyError: out-of-vocabulary word.
                    continue
                human.append(score)
                w2v.append(similarity)

        #spearmanr returns 2 values: correlation and pval. pval should be close to zero
        print (similarity_type + ':', spearmanr(human, w2v))

In [None]:
# Evaluate the WordNet-only model against the gold-standard files.
correlation(wordnet2vec)

relatedness: SpearmanrResult(correlation=0.21755192158457107, pvalue=0.0009694002008041033)
similarity: SpearmanrResult(correlation=0.17678730965248646, pvalue=0.01696828821614318)


In [None]:
# Evaluate the Brown-only model against the gold-standard files.
correlation(brown2vec)

relatedness: SpearmanrResult(correlation=0.017019637736190054, pvalue=0.7965196950116096)
similarity: SpearmanrResult(correlation=0.02398155053061496, pvalue=0.7479482475836634)


In [None]:
# Evaluate the Brown + WordNet model against the gold-standard files.
correlation(brownet2vec)

relatedness: SpearmanrResult(correlation=0.11468398597861194, pvalue=0.07198730771145768)
similarity: SpearmanrResult(correlation=0.17598941712463317, pvalue=0.0136105429362918)


In [None]:
# Evaluate the hypernym-enriched model against the gold-standard files.
correlation(richnet2vec)

relatedness: SpearmanrResult(correlation=0.1693712023099948, pvalue=0.010580954111206659)
similarity: SpearmanrResult(correlation=0.1698040437050389, pvalue=0.021924901655091197)


In [None]:
# Evaluate the pre-trained Google News vectors as the reference point.
correlation(gn)

relatedness: SpearmanrResult(correlation=0.6355305235732446, pvalue=6.484385280296368e-30)
similarity: SpearmanrResult(correlation=0.747667761718117, pvalue=1.379744650280001e-37)


  
