# Exercise Sheet 3 - Vector Space Models and Word Embeddings


## Learning Objectives
- Introduction to word vectors
- Overview of distributional semantics
----

In [None]:
# setting the stage ;)
# Download all NLTK data packages (corpora, tokenizer and tagger models) up front
# so that the later cells can use them. This is a large download.
import nltk
nltk.download('all')

# Preprocessing: Tokenization & POS tagging

In [None]:
# Word tokenization with nltk: split one sentence into its word and punctuation tokens
import nltk

sentence = "The quick brown fox jumps over the lazy dog."
tokens = nltk.word_tokenize(sentence)

# `tokens` is reused by the POS-tagging cell that follows
print(tokens)

In [None]:
# Part-of-Speech (POS) Tagging
# Tags the `tokens` produced by the tokenization cell; the result pairs each token with its tag.

tagged = nltk.pos_tag(tokens)
print(tagged)

### Exercise 1: How can you remove the punctuation from the text?

In [None]:
text = "Remember, remember, the fifth of November, Gunpowder, treason and plot! If you can't give us one, we'll take two; The better for us and the worse for you!"
# Split the text into sentences, then tokenize every sentence separately:
# `tokens` is a list containing one token list per sentence.
sentences = nltk.sent_tokenize(text)
tokens = [nltk.word_tokenize(sent) for sent in sentences]

# Your code goes here

# p.s.: print your_refined_tokens_without_punct


Hint: a list of punctuation characters can be used (see `string.punctuation` at https://docs.python.org/3/library/string.html)

----
# Distributional Semantics & Word Vectors


## Context Words

### Exercise 2: Look up every occurrence of the word "*affection*" and print its context in the text of Sense and Sensibility by Jane Austen

In [None]:
# your code goes here

Hint: We have already done that in Exercise Sheet 1.1 using NLTK.

## n-grams

In [None]:
from nltk.util import ngrams

# Example sentence for the n-gram demo; `tokenize` holds its tokens and is
# consumed by the bigram cell that follows.
text = "You shall know a word by the company it keeps meaning - Firth (1957)"
tokenize = nltk.word_tokenize(text)

In [None]:
# show every bigram (n-gram with n = 2) of the tokenized sentence, one per line
bigrams = ngrams(tokenize, 2)
for pair in bigrams:
    print(pair)

### Frequency of occurrence for each bigram

In [None]:
#### showing n-grams with raw frequency

import nltk
from nltk.collocations import BigramCollocationFinder

# The collocation finder works on a sequence of tokens, so tokenize the raw text first.
text = "The cat lies on the mat and a dog lies on the floor"
tokens = nltk.wordpunct_tokenize(text)

# `ngram_fd` is the finder's frequency distribution: it maps each bigram to its raw count.
bigrams = BigramCollocationFinder.from_words(tokens)
for bigram, freq in bigrams.ngram_fd.items():
    print(bigram, freq)

In [None]:
# Loads NLTK's example texts into the namespace (text1 ... text9, sent1 ... sent9);
# `text1` is the corpus used by the collocation cells below.
from nltk.book import *

In [None]:
# show n-grams measured using Pointwise Mutual Information

from nltk.collocations import (
    BigramAssocMeasures,
    BigramCollocationFinder,
    TrigramAssocMeasures,
)

# Association-measure helpers; `bigram_measures` is reused by the threshold cell below.
bigram_measures = BigramAssocMeasures()
trigram_measures = TrigramAssocMeasures()  # not used in this cell; available for trigram experiments

# Alternative corpus to try instead of text1:
#finder = BigramCollocationFinder.from_words(nltk.corpus.genesis.words('english-web.txt'))
# `text1` comes from the `from nltk.book import *` cell.
finder = BigramCollocationFinder.from_words(text1)

# The 10 bigrams with the highest PMI score (no frequency filtering applied)
finder.nbest(bigram_measures.pmi, 10)

... making PMI more interpretable: print out only the n-grams which appear at least a certain number of times (a frequency threshold)

In [None]:
# Discard bigrams that occur fewer than `freq_threshold` times, then rank the rest by PMI.
# Relies on `text1` and `bigram_measures` from the earlier cells.
freq_threshold = 30
finder = BigramCollocationFinder.from_words(text1)
finder.apply_freq_filter(freq_threshold)
finder.nbest(bigram_measures.pmi, 10)

### Exercise 3: How do the top PMI bigrams change with the frequency threshold? Interpret the variation.

In [None]:
# Your code goes here

Hint: Change frequency threshold to 100 and 10

*Please write your interpretation here*

## Pointwise Mutual Information (PMI)

In [None]:
import nltk
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
from nltk.tokenize import word_tokenize

# Tiny toy corpus so the PMI scores can be checked by hand
text = "eat sleep code repeat sleep dream code repeat"

Bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(text))

# score_ngrams yields (bigram, score) pairs, ordered from highest to lowest score
for scored_bigram in finder.score_ngrams(Bigram_measures.pmi):
    print(scored_bigram)

### Exercise 4: What role may PMI play in language modelling?

*Please write your answer here*


# Word Vectors

In [None]:
# example of sparse count vectors: one row per document, one column per vocabulary term

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    'All my cats in a row.',
    'When my cat sits down, she looks like a Furby toy!',
    'The cat from outer space',
    'Sunshine loves to sit like this for some reason.',
]

vectorizer = CountVectorizer()
doc_term_counts = vectorizer.fit_transform(corpus)

# Dense view of the count matrix, followed by the term -> column-index mapping
print(doc_term_counts.todense())
print(vectorizer.vocabulary_)

### Exercise 5: Write your interpretation of the above vector representation

*Please write your answer here*

## Word Co-occurrence Matrix in sparse CRS (Compressed Sparse Row) format



In [None]:
from sklearn.feature_extraction.text import CountVectorizer

docs = ['the cat lies on the map',
        'the cat lies on the floor',
        'an cat sits near the floor']

# Alternative corpus to try:
# docs = [
# 'All my cats in a row.',
# 'When my cat sits down, she looks like a Furby toy!',
# 'The cat from outer space',
# 'Sunshine loves to sit like this for some reason.'
# ]

count_model = CountVectorizer(ngram_range=(1,1)) # default unigram model
X = count_model.fit_transform(docs)  # sparse document-term count matrix

# Term-term co-occurrence matrix: entry (i, j) is the sum over all documents of
# count(term i) * count(term j). `@` is always the matrix product, whereas `*`
# means element-wise multiplication for scipy's newer sparse *array* types.
Xc = X.T @ X
# A term's co-occurrence with itself is not of interest, so zero the diagonal.
Xc.setdiag(0)
print(count_model.vocabulary_)

In [None]:
# Rows and columns follow the column indices stored in count_model.vocabulary_
print(Xc.toarray()) # print out matrix in dense format

### Exercise 6: Explain the matrix

*Write your answer here*

# Word2Vec

Mikolov, Tomas; et al. "Efficient Estimation of Word Representations in Vector Space". arXiv:1301.3781

Word2vec is a group of related models that are used to produce word embeddings. These models are shallow, two-layer neural networks that are trained to reconstruct linguistic contexts of words. Word2vec takes as its input a large corpus of text and produces a vector space, typically of several hundred dimensions, with each unique word in the corpus being assigned a corresponding vector in the space. Word vectors are positioned in the vector space such that words that share common contexts in the corpus are located in close proximity to one another in the space.[Wikipedia]

In [None]:
# gensim - open-source vector space modeling and topic modeling toolkit

import gensim
from nltk.corpus import brown

# Train a Word2Vec model (library default hyper-parameters) on the tokenized
# sentences of the Brown corpus, then persist it to disk for the next cell.
brown_sentences = brown.sents()
model = gensim.models.Word2Vec(brown_sentences)
model.save('brown.embedding')

In [None]:
# Reload the Brown-corpus model that the previous cell saved to 'brown.embedding'
new_model = gensim.models.Word2Vec.load('brown.embedding')

In [None]:
# word vector dimensionality
# Look the vector up through `.wv` (the model's KeyedVectors): indexing the
# Word2Vec model object directly is deprecated and was removed in gensim 4.

len(new_model.wv['university'])

In [None]:
# show the cosine similarity between the vectors of two words
# (called on `.wv`: Word2Vec.similarity is deprecated and was removed in gensim 4)

new_model.wv.similarity('university','school')

### Exercise 7: Calculate and interpret the similarity of the word pairs (bank, river) and (bank, deposit). How would you handle word sense disambiguation?

In [None]:
# Your code goes here

*Your answer goes here*

In [None]:
# Download the pretrained GoogleNews word2vec model (gzipped binary file).
# NOTE(review): wget saves into the current working directory, while the loading cell
# further down reads the file from Google Drive — make sure the file ends up at that path.
! wget "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

In [None]:
# Mount your Google Drive into the Colab runtime at /content/drive
# (this import only works when the notebook runs on Google Colab)
from google.colab import drive
drive.mount("/content/drive") #authorization is required here

In [None]:
from nltk.data import find
import gensim
from gensim.models.word2vec import Word2Vec

# Path to the downloaded GoogleNews-vectors-negative300.bin.gz file
# (here: the "Colab Notebooks" folder of the mounted Google Drive; adjust it if your copy lives elsewhere)
filename = '/content/drive/My Drive/Colab Notebooks/GoogleNews-vectors-negative300.bin.gz'
# Load the pretrained vectors from the binary word2vec format.
# Note: this rebinds `model`, which previously referred to the Brown-corpus Word2Vec model.
model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)



In [None]:
# number of entries (rows) in the Word2Vec matrix
# `model.vocab` was removed in gensim 4; the embedding matrix `model.vectors`
# has one row per vocabulary entry and is available in gensim 3.3+ and 4.x.

len(model.vectors)

In [None]:
# dimensionality of the dense word vectors (length of a single word's vector)

model['university'].shape[0]

In [None]:
# most similar words based on Word2Vec
# Returns the `topn` nearest vocabulary entries to 'soccer' together with their similarity scores.

model.most_similar(positive=['soccer'], topn = 10)

In [None]:
# odd one out: the entry that fits least with the other provided words

candidate_words = ['wrestling', 'cooking', 'dinner', 'potato']
model.doesnt_match(candidate_words)

## Vector operations

In [None]:
# Analogy "man is to king as woman is to ?": nearest word to king - man + woman
model.most_similar(positive=['woman','king'], negative=['man'], topn = 1)

In [None]:
# Analogy "Moscow is to Russia as Delhi is to ?": nearest word to Russia - Moscow + Delhi
model.most_similar(positive=['Delhi','Russia'], negative=['Moscow'], topn = 1)

### Exercise 8: Can you think of similar examples?

Actor - male + female?

batman - bat + spider?

summer - sun + cold?

*Guess the result here before executing the code*

In [None]:
# Your code goes here

# Visualizing Vector Space

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

max_count = 50  # number of vocabulary entries to plot

# `model.vocab` was removed in gensim 4. The ordered vocabulary list is called
# `index_to_key` in gensim >= 4 and `index2word` in older releases.
vocab_terms = getattr(model, "index_to_key", None) or model.index2word
labels = list(vocab_terms[:max_count])

# One row per plotted term, one column per embedding dimension.
X = np.array([model[term] for term in labels], dtype=np.float64)

# It is recommended to use PCA first to reduce to ~50 dimensions.
# PCA cannot return more components than there are samples or features,
# so cap the target at the matrix shape (this still gives 50 for max_count = 50).
pca = PCA(n_components=min(50, *X.shape))
X_50 = pca.fit_transform(X)

# Using TSNE to further reduce to 2 dimensions (fixed random_state for a reproducible layout)
model_tsne = TSNE(n_components=2, random_state=0)
Y = model_tsne.fit_transform(X_50)

# Show the scatter plot
fig, ax = plt.subplots()
ax.scatter(Y[:, 0], Y[:, 1], s=20)

# Add labels: write each term next to its point
for label, x, y in zip(labels, Y[:, 0], Y[:, 1]):
    ax.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points', size=10)

ax.set_title("t-SNE projection of the first %d word vectors" % len(labels))
ax.set_xlabel("t-SNE dimension 1")
ax.set_ylabel("t-SNE dimension 2")

plt.show()
