In [None]:
from IPython.core.display import HTML

# Read the course stylesheet inside a context manager so the file handle is
# closed right away instead of being left open until garbage collection.
with open("style.css") as css_file:
    css_rules = css_file.read()

# The HTML object has to be the last expression of the cell so that the
# notebook renders it and thereby injects the <style> element.
HTML("<style>" + css_rules + "</style>")

<div class="headline">
Language Technology / Sprachtechnologie
<br><br>
Wintersemester 2019/2020
</div>
<br>
<div class="description">
    Übung zum Thema <i id="topic">"Word Sense Induction"</i>
    <br><br>
    Deadline Abgabe: <i id="submission">Thursday, 21.11.2019 (23:55 Uhr)</i>
</div>

# Präsenzübung

In [None]:
import nltk
import string
from nltk.corpus import wordnet as wn  
from nltk.book import text2, text3
from nltk.text import Text
from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.cluster import KMeans  

## Warm Up

<div class="task_description">
    <i class="task">Task 5.1:</i> <br>
</div>

Stopword: Which of the following statements are true?

1. A stopword is a word used to stop a parsing process.
2. A stopword is a high-frequency word.
3. Punctuation marks like ``.``, ``,``, and ``;`` are stopwords.
4. "the, to, and" are stopwords.
5. Stopwords have a maximum length of 3.
6. NLTK offers a list of English stopwords through: nltk.corpus.stopwords.words('english')


<strong style="color: blue">Lösung</strong>

1. False. A stopword is a frequent word with little lexical content.
2. True. Additionally, stopwords ('a', 'the', 'to', 'also') must also carry little lexical content.
3. False. First, punctuation marks are not really ''words'', and are thus usually not considered stopwords. In a sense, they are similar to stopwords as they carry no lexical content, i.e. they have no real meaning.
4. True
5. False. There are stopwords like 'while' or 'will' with length more than 3 characters
6. True. The exact number of entries depends on the installed NLTK data version (127 in older releases, more in recent ones).


<div class="task_description">
    <i class="task">Task 5.2:</i> <br>
</div>

Synset: Which of the following statements are true?


1. A synset is a set of words that are interchangeable in some context without changing the meaning of a sentence in which they are embedded.
2. A synset is a set of all bigrams within WordNet.
3. Within WordNet (corpus / lexical resource) each word corresponds to one or more synsets.
4. You may access the synsets of "dog" in NLTK by using nltk.corpus.wordnet.synsets('dog')


<strong style="color: blue">Lösung</strong>

1. True. Synset is defined as a set of words grouped by some specific meaning. Since we need only one meaning in the context, the statement is correct.
2. False. Synset is a set of words and not bigrams.
3. True. A single word may be assigned to different synsets. For example, the word ‘car’ is assigned to such synsets as [‘car’, ‘cable_car’] and [‘car’, ‘elevator_car’].
4. True. The output is [Synset(‘dog.n.01’), Synset(‘frump.n.01’), Synset(‘dog.n.03’), Synset(‘cad.n.01’), Synset(‘frank.n.02’), Synset(‘pawl.n.01’), Synset(‘andiron.n.01’), Synset(‘chase.v.01’)]. ‘frump.n.01’ means that ‘dog’ belongs to the synset with the first sense of the noun (‘n’) ‘frump’.

<div class="task_description">
    <i class="task">Task 5.3:</i> <br>
</div>

There are two possible definitions of 'word':

* Word is the same as token, for example ‘see’ and ‘saw’ are different words;
* Word is the same as lemma, hence ‘things’ and ‘thing’ become the same word. 
<br>

Which of the following statements are true?


1. The lemma of “appeared” is “appear”.
2. The lemma of a verb is its infinitive.
3. Two words having the same lemma are called homonyms.
4. Each token has one and only one lemma.
5. Each lemma belongs to one and only one word.

<strong style="color: blue">Lösung</strong>

1. True, because ‘appear’ is the normalized form for ‘appeared’.
2. True, because the infinitive is the normalized form of a verb.
3. False. Homonyms are defined as distinct words that share the same spelling and pronunciation. However, the lemmas of these words may be different. Conversely, ‘goes’ and ‘went’ both have the lemma ‘go’, but are not homonyms.
4. False, as homographs share the same surface form, but might have different lemmas.
5. False, because e.g. the lemma ‘be’ corresponds to two words ‘is’ and ‘are’.

## Lexical Resources

<div class="task_description">
    <i class="task">Task 5.4:</i> <br>
</div>

A stopword list contains high-frequency words like “the”, “to”, “and”, or “also” that we sometimes want to filter out of a document before further processing. Stopwords usually have little lexical content, and their presence in a text fails to distinguish it from other texts. NLTK provides a predefined list of stopwords: nltk.corpus.stopwords

<div class="task_description">
   <i class="subtask">5.4.1</i> <i class="l2">L2</i> <br>
</div>
Find all non-stopwords in carroll-alice.txt of the corpus Gutenberg.

<strong style="color: blue">Lösung</strong>

In [None]:
# Keep the stopwords in a set: the membership test below runs once per corpus
# token, and a set lookup avoids scanning the whole stopword list each time.
stopwords = set(nltk.corpus.stopwords.words('english'))
# The stopword list is lowercase, so every token is lowercased before the lookup.
print([word for word in nltk.corpus.gutenberg.words('carroll-alice.txt') if word.lower() not in stopwords])

The code iterates over each word in carroll-alice.txt and outputs the word if its lowercase form is not in the NLTK stopword list.

<div class="task_description">
   <i class="subtask">5.4.2</i> <i class="l2">L2</i> <br>
</div>
Implement a function is_the_same_vocab(text1, text2) that compares two texts. It should return true, if the texts have the same vocabulary after removing the stopwords.

<strong style="color: blue">Lösung</strong>

In [None]:
def get_vocab_without_stopwords(text):
    """Return the set of whitespace-separated tokens of ``text`` that are not English stopwords.

    Tokens are compared with the NLTK stopword list exactly as written,
    so the filter is case-sensitive.
    """
    english_stopwords = nltk.corpus.stopwords.words('english')
    return {token for token in text.split() if token not in english_stopwords}

def is_the_same_vocab(text1, text2):
    """Tell whether two texts share the same stopword-free vocabulary (case-sensitive).

    Both vocabularies are printed before the comparison result is returned.
    """
    vocab_first, vocab_second = [get_vocab_without_stopwords(raw) for raw in (text1, text2)]
    print("Text 1 Vocab:", vocab_first, "\nText 2 Vocab:", vocab_second)
    return vocab_first == vocab_second

# The sample sentences get their own names: calling them text2/text3 would
# overwrite the corpora imported from nltk.book at the top of the notebook.
sentence_a = "People are hungry before lunch and not hungry after lunch"
sentence_b = "People are not hungry before lunch and hungry after lunch"
sentence_c = "Humans are hungry before dinner and not hungry after dinner"

print("Have same Vocab?", is_the_same_vocab(sentence_a, sentence_b)) # True
print("Have same Vocab?", is_the_same_vocab(sentence_b, sentence_c)) # False

<div class="task_description">
    <i class="task">Task 5.5:</i> <i class="l1">L1</i> <br>
</div>

Take a look at the code below for some examples of how to work with WordNet. You can also execute it on your computer to see what the output looks like for a better understanding.

Explain the code line by line: what data type does each of the methods used return, and what does the result mean?

In [None]:
# Exercise code: every line runs one WordNet query through the `wn` corpus
# reader and prints the result followed by an empty line.
print(wn.synsets('motorcar'), "\n")
print(wn.synset('car.n.01').lemma_names(), "\n")
print(wn.synset('car.n.01').definition(), "\n")
print(wn.synset('car.n.01').examples(), "\n")
print(wn.synset('car.n.01').lemmas(), "\n")
print(wn.lemma('car.n.01.automobile'), "\n")
print(wn.lemma('car.n.01.automobile').synset(), "\n")
print(wn.lemma('car.n.01.automobile').name(), "\n")
print(wn.synsets('car'), "\n")
print(wn.lemmas('car'), "\n")

<strong style="color: blue">Lösung</strong>

In [None]:
# wn.synsets(word) -> list of Synset objects: all synsets that contain the word.
# Here: [Synset('car.n.01')], the only synset of 'motorcar'.
print(wn.synsets('motorcar'))  

# Synset.lemma_names() -> list of str: all words in the synset 'car.n.01',
# i.e. ['car', 'auto', 'automobile', 'machine', 'motorcar'].
print(wn.synset('car.n.01').lemma_names())  

# Synset.definition() -> str: the gloss of the synset 'car.n.01',
# 'a motor vehicle with four wheels; usually propelled by an internal combustion engine'.
print(wn.synset('car.n.01').definition())  

# Synset.examples() -> list of str: usage examples for the synset 'car.n.01',
# i.e. ['he needs a car to get to work'].
print(wn.synset('car.n.01').examples())  

# Synset.lemmas() -> list of Lemma objects: [Lemma('car.n.01.car'), Lemma('car.n.01.auto'),
# Lemma('car.n.01.automobile'), Lemma('car.n.01.machine'), Lemma('car.n.01.motorcar')].
# A Lemma is the pairing of one word with one synset.
print(wn.synset('car.n.01').lemmas())  

# wn.lemma(name) -> Lemma: looks up the single lemma 'car.n.01.automobile'.
print(wn.lemma('car.n.01.automobile'))  

# Lemma.synset() -> Synset: the synset 'car.n.01' that the lemma belongs to.
print(wn.lemma('car.n.01.automobile').synset())  

# Lemma.name() -> str: the word form of the lemma, here 'automobile'.
print(wn.lemma('car.n.01.automobile').name())  

# wn.synsets(word) -> list of Synset objects: [Synset('car.n.01'), Synset('car.n.02'), Synset('car.n.03'),
# Synset('car.n.04'), Synset('cable_car.n.01')]. The word 'car' has more meanings than 'motorcar',
# hence more synsets are retrieved from WordNet.
print(wn.synsets('car'))  

# wn.lemmas(word) -> list of Lemma objects: [Lemma('car.n.01.car'), Lemma('car.n.02.car'), Lemma('car.n.03.car'),
# Lemma('car.n.04.car'), Lemma('cable_car.n.01.car')]. Each entry is the word 'car' in a different synset.
print(wn.lemmas('car')) 


<div class="task_description">
    <i class="task">Task 5.6:</i> 
</div>

<div class="task_description">
   <i class="subtask">5.6.1.</i> <i class="l2">L2</i> <br>
</div>
List all the senses of the word "like" that you can think of. Now we want to see how many different meanings the word ”like” has with the help of WordNet, using the same operations we used above. 

<strong style="color: blue">Lösung</strong>

In [None]:
# Print every sense of "like" with its words, Lemma objects, gloss,
# part of speech and usage examples.
for synset in wn.synsets('like'):
    print("Lemma name:\n", synset.lemma_names())
    print("Lemma:\n", synset.lemmas())
    print("Definition:\n", synset.definition())
    # The label used to read "POS:\n:", which printed a stray colon before the tag.
    print("POS:\n", synset.pos())
    print("Example:\n", synset.examples(),"\n____________\n")

<div class="task_description">
   <i class="subtask">5.6.2.</i> <i class="l2">L2</i> <br>
</div>
As you can see above, the words can have different POS tags. Suppose you are only interested in the adjective "warm". Write a function that will output definitions and examples of the word "warm" under this condition.

<strong style="color: blue">Lösung</strong>

In [None]:
# Restrict the lookup to adjective senses and show the gloss and usage examples of each one.
adjective_senses = wn.synsets('warm', wn.ADJ)
for sense in adjective_senses:
    gloss = sense.definition()
    usages = sense.examples()
    print("Definition:\n", gloss)
    print("Example:\n", usages, "\n______________\n")

<div class="task_description">
   <i class="subtask">5.6.3.</i> <i class="l3">L3</i> <br>
</div>
Write functions that output the hypernyms and hyponyms of the word 'bank'.

<strong style="color: blue">Lösung</strong>

In [None]:
# For every noun sense of "bank", report each direct hypernym (the more general concept)
# together with the gloss of the sense itself.
for sense in wn.synsets("bank", wn.NOUN):
    sense_name = sense.name()
    for parent in sense.hypernyms():
        print(sense_name, "is a", parent.name(), "which is", sense.definition(), "\n_______\n")

In [None]:
# For every noun sense of "bank", list each direct hyponym (the more specific concept).
for sense in wn.synsets("bank", wn.NOUN):
    sense_name = sense.name()
    for child in sense.hyponyms():
        print(sense_name, "has subconcept", child.name())

<div class="task_description">
   <i class="subtask">5.6.4</i> <i class="l3">L3</i> <br>
</div>

Another aspect we can examine is if a word has an antonym (word with opposite meaning). Write a function that searches an antonym for the word "like".

<strong style="color: blue">Lösung</strong>

In [None]:
# Antonymy is defined between lemmas, not synsets, so walk over every lemma of every sense.
for sense in wn.synsets("like"):
    for lemma in sense.lemmas():
        print(sense.name(), "(", lemma.name(), ")")
        opposite_names = [opposite.name() for opposite in lemma.antonyms()]
        if opposite_names:
            print("Has the antonym", opposite_names, "\n")
        else:
            print("Doesn't have an antonym\n")

<div class="task_description">
    <i class="task">Task 5.7:</i> <br>
</div>

The polysemy of a word is the number of senses it has. We now want to compute the average polysemy of nouns, verbs, adjectives and adverbs according to WordNet and discuss the results.

<div class="task_description">
   <i class="subtask">5.7.1</i> <i class="l1">L1</i> <br>
</div>

Try to think of an algorithm that calculates the average polysemy of a POS.

<strong style="color: blue">Lösung</strong>

To calculate the average polysemy, we need to count every lemma of the respective POS and the number of meanings the lemma has (the number of synsets it belongs to). Then we divide the total number of meanings for all lemmas of the given POS by the total number of lemmas of the given POS.

<div class="task_description">
   <i class="subtask">5.7.2</i> <i class="l2">L2</i> <br>
</div>

Take a look at the code below. Implement it and enhance the main function so that the average polysemy is calculated not only for nouns but also for verbs, adjectives and adverbs (do not implement your algorithm yet.)

In [None]:
def avgPolysemy():
    # TODO: add a part-of-speech parameter and return the average polysemy for it
    
avgN = avgPolysemy()
    # TODO: call avgPolysemy for verbs, adjectives and adverbs as well

print("Average polysemy of nouns: ", avgN)
    # TODO: print the averages of the other parts of speech

<strong style="color: blue">Lösung</strong>

In [None]:
def avgPolysemy(pos):
    """Skeleton for the average polysemy of the part of speech ``pos``.

    Only the counters are set up here; the algorithm is added in Task 5.7.3.
    Without a return statement this version returns None.
    """
    nrOfSynsets = 0
    nrOfLemmas = 0
    lemmas = []
    # TODO: collect the lemmas of `pos`, count their senses and return the average

# Evaluate the four parts of speech in a fixed order: nouns, verbs, adjectives, adverbs.
avgN, avgV, avgADJ, avgADV = [avgPolysemy(tag) for tag in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)]

for caption, average in (
    ("Average polysemy of nouns:", avgN),
    ("Average polysemy of verbs:", avgV),
    ("Average polysemy of adjectives:", avgADJ),
    ("Average polysemy of adverbs:", avgADV),
):
    print(caption, average)

<div class="task_description">
   <i class="subtask">5.7.3</i> <i class="l3">L3</i> <br>
</div>

Now implement your algorithm and calculate the average polysemy for nouns, verbs, adjectives and adverbs.

In [None]:
def avgPolysemy(pos):
    """Return the average number of senses per lemma for the part of speech ``pos``.

    All distinct lemma names of the synsets with this part of speech are
    collected first. Each name is then looked up again, and the number of
    synsets found is its sense count. The result is the sum of all sense
    counts divided by the number of lemma names with at least one sense.
    """
    lemma_names = {
        lemma.name()
        for synset in wn.all_synsets(pos)
        for lemma in synset.lemmas()
    }

    sense_counts = [len(wn.synsets(name, pos)) for name in lemma_names]
    # Names without any sense do not take part in the average.
    sense_counts = [count for count in sense_counts if count > 0]

    return sum(sense_counts) / len(sense_counts)

# Evaluate the four parts of speech in a fixed order: nouns, verbs, adjectives, adverbs.
avgN, avgV, avgADJ, avgADV = [avgPolysemy(tag) for tag in (wn.NOUN, wn.VERB, wn.ADJ, wn.ADV)]

for caption, average in (
    ("average polysemy of nouns: ", avgN),
    ("average polysemy of verbs: ", avgV),
    ("average polysemy of adjectives: ", avgADJ),
    ("average polysemy of adverbs: ", avgADV),
):
    print(caption, average)

## Concordance

<div class="task_description">
    <i class="task">Task 5.8:</i> <br>
</div>

A concordance view shows us every occurrence of a given word, together with some context.

<div class="task_description">
   <i class="subtask">5.8.1.</i> <i class="l2">L2</i> <br>
</div>
Extract context samples of word ‘affection’ in “Sense and Sensibility” (text2) corpus of nltk and word ‘lived’ in “Book of Genesis” (text3) corpus.

<strong style="color: blue">Lösung</strong>

In [None]:
# text2 is "Sense and Sensibility", text3 is the "Book of Genesis".
sense_and_sensibility = nltk.book.text2
genesis = nltk.book.text3

# Show the occurrences of each query word in context, separated by an empty line.
sense_and_sensibility.concordance("affection")
print()
genesis.concordance("lived")

<div class="task_description">
   <i class="subtask">5.8.2.</i> <i class="l2">L2</i> <br>
</div>

Words that are similar to a given term can be found by looking for words that occur in the same contexts as the term.

For example: monstrous occurred in contexts such as the ___ pictures and a ___ size . What other words appear in a similar range of contexts? We can find out by appending the term similar to the name of the text in question, then inserting the relevant word in parentheses:

Find out similar words for affection in “Sense and Sensibility” (text2) corpus of nltk.


<strong style="color: blue">Lösung</strong>

In [None]:
# Print the words of "Sense and Sensibility" (text2) that occur in contexts similar to those of "affection".
nltk.book.text2.similar("affection")  

## Clustering - Tf-Idf Vectorizer 

<div class="task_description">
    <i class="task">Task 5.9:</i> <br>
</div>

<div class="task_description">
   <i class="subtask">5.9.1.</i> <i class="l2">L2</i> <br>
</div>

Extract all the context of term affection in the “Sense and Sensibility” (text2) corpus of nltk with a window size of 21 (each context should have 10 words before and after the term). Provide the total number of contexts.

In [None]:
c = nltk.ConcordanceIndex(nltk.book.text2.tokens, key=lambda s: s.lower())
window_size = 10
affection_contexts = []
for index in c.offsets('affection'):  
    if index > 10:
           #TODO
print(len(affection_contexts))

<strong style="color: blue">Lösung</strong>

In [None]:
# Index of all token positions in text2, keyed by the lowercased token.
c = nltk.ConcordanceIndex(nltk.book.text2.tokens, key=lambda s: s.lower())
window_size = 10  # number of tokens kept on each side of the term
tokens = nltk.book.text2.tokens
affection_contexts = []
for index in c.offsets('affection'):
    # Keep only occurrences with a complete left and right context, so that
    # every stored window has the same length.
    if index >= window_size and index + window_size < len(tokens):
        # The slice end is exclusive, hence "+ 1":
        # 10 tokens before + the term + 10 tokens after = 21 tokens.
        affection_contexts.append(tokens[index - window_size:index + window_size + 1])
print(len(affection_contexts))  # the reference solution reports 79

<div class="task_description">
   <i class="subtask">5.9.2.</i> <i class="l2">L2</i> <br>
</div>

Preprocess the contexts by removing punctuation and stopwords. 

In [None]:
# Set of English stopwords from NLTK.
stopwords = set(nltk.corpus.stopwords.words('english'))  
# Will hold one preprocessed entry per context.
affection_contexts_no_stopwords = []  
for context in affection_contexts:  
    # remove punctuation
    # TODO: drop the punctuation tokens from `context`
    
    # remove stopwords  
    # TODO: drop the stopwords and append the result to affection_contexts_no_stopwords

<strong style="color: blue">Lösung</strong>

In [None]:
stopwords = set(nltk.corpus.stopwords.words('english'))
punctuation_chars = set(string.punctuation)
affection_contexts_no_stopwords = []  # one space-joined string per context
for context in affection_contexts:
    kept_tokens = [
        token for token in context
        # Drop tokens that consist solely of punctuation characters. A plain
        # `token in string.punctuation` is a substring test and lets
        # multi-character tokens such as "--" through.
        if not all(char in punctuation_chars for char in token)
        # The stopword list is lowercase, so compare the lowercased token;
        # otherwise capitalised stopwords such as "The" would be kept.
        and token.lower() not in stopwords
    ]
    affection_contexts_no_stopwords.append(' '.join(kept_tokens))

<div class="task_description">
   <i class="subtask">5.9.3.</i> <i class="l2">L2</i> <br>
</div>
Generate tf-idf features for the preprocessed text using the scikit-learn feature extractor.

In [None]:
vectorizer = TfidfVectorizer()  
# TODO: fit the vectorizer on affection_contexts_no_stopwords and store the resulting tf-idf matrix
X_tf_idf = #TODO

<strong style="color: blue">Lösung</strong>

In [None]:
vectorizer = TfidfVectorizer()  
# Learn the vocabulary and idf weights from the preprocessed contexts and
# return their tf-idf matrix in a single step.
X_tf_idf = vectorizer.fit_transform(affection_contexts_no_stopwords)

<div class="task_description">
   <i class="subtask">5.9.4.</i> <i class="l3">L3</i> <br>
</div>

Apply the k-means algorithm with k = 3 (number of clusters).

In [None]:
true_k = 3 # number of clusters  

# Apply algorithm
# TODO: create a KMeans instance with true_k clusters
model_tf_idf = #TODO

# Training the algorithm
# TODO: fit the model on the tf-idf matrix X_tf_idf

<strong style="color: blue">Lösung</strong>

In [None]:
true_k = 3 # number of clusters
# k-means starts from random centroids. Fixing the seed and the number of
# restarts makes "Restart & Run All" reproduce the same clusters, whatever
# defaults the installed scikit-learn version uses.
model_tf_idf = KMeans(n_clusters=true_k, n_init=10, random_state=42) # apply algorithm
model_tf_idf.fit(X_tf_idf) # training the algorithm

<div class="task_description">
   <i class="subtask">5.9.5.</i> <i class="l3">L3</i> <br>
</div>

Print the top-ten terms that represent each cluster.

In [None]:
# For every cluster: feature indices sorted by descending centroid weight.
order_centroids = model_tf_idf.cluster_centers_.argsort()[:, ::-1]  # getting indexes of centroids
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
terms = vectorizer.get_feature_names_out() # getting feature terms
for i in range(true_k):
    print("Cluster %d:" % i)

    # get top 10 terms

    #TODO: print the terms that belong to the first 10 indices of order_centroids[i]

<strong style="color: blue">Lösung</strong>

In [None]:
# For every cluster: feature indices sorted by descending centroid weight.
order_centroids = model_tf_idf.cluster_centers_.argsort()[:, ::-1]  # getting indexes of centroids
# get_feature_names() was removed from scikit-learn; get_feature_names_out() is its replacement.
terms = vectorizer.get_feature_names_out() # getting feature terms
for i in range(true_k):
    print("Cluster %d:" % i)
    # The first ten indices are the terms with the largest weight in this centroid.
    for ind in order_centroids[i, :10]:
        print(" %s" % terms[ind])

<div class="task_description">
   <i class="subtask">5.9.6.</i> <i class="l3">L3</i> <br>
</div>
Assign context to the corresponding cluster predicted by the k-means algorithm.

In [None]:
clusters = {}
for context in affection_contexts:
    # Each context is a list of tokens, but the vectorizer expects raw strings,
    # so the tokens are joined first.
    X = vectorizer.transform([' '.join(context)])  # get tf-idf vector for the context
    # predict() returns one label per input row; take the single label so that
    # it can be used as a dictionary key.
    predicted = model_tf_idf.predict(X)[0]  # predict the cluster

    # generate clusters

    # TODO: append `context` to the list stored under clusters[predicted]
print(clusters)

<strong style="color: blue">Lösung</strong>

In [None]:
clusters = {}
# Predict on the same preprocessed text the model was fitted on, but store the
# original token window so the printed clusters stay readable.
for context, preprocessed in zip(affection_contexts, affection_contexts_no_stopwords):
    X = vectorizer.transform([preprocessed])     # get tf-idf vector for the context
    # predict() returns one label per input row; convert the single label to a
    # plain int so the dictionary keys print as 0, 1, 2.
    predicted = int(model_tf_idf.predict(X)[0])  # predict the cluster
    clusters.setdefault(predicted, []).append(context)
print(clusters)

# Working with Embeddings

<div class="task_description">
    <i class="task">Task 5.10:</i> <br>
</div>

Word embedding is capable of capturing the meaning of a word in a document, semantic and syntactic similarity, relation with other words.

## Installation:

In [None]:
import sys
!{sys.executable} -m pip install gensim

<div class="task_description">
   <i class="subtask">5.10.1.</i> <i class="l2">L2</i> <br>
</div>
Import the common_texts corpus from the gensim library and display the first 10 sentences.

In [None]:
from gensim.test.utils import common_texts
# TODO: display the first 10 entries of common_texts

<strong style="color: blue">Lösung</strong>

In [None]:
from gensim.test.utils import common_texts
# The slice is the last expression of the cell, so the notebook displays it;
# it holds at most the first ten entries.
common_texts[:10]

<div class="task_description">
   <i class="subtask">5.10.2.</i> <i class="l2">L2</i> <br>
</div>
Train the Word2vec model based on gensim with the following parameters:
    - Size of the dimension = 100
    - Context window size = 5
    - Minimum frequency count = 1

In [None]:
from gensim.models import Word2Vec
# TODO: train Word2Vec on common_texts with dimension 100, window 5 and minimum count 1
model = #TODO

<strong style="color: blue">Lösung</strong>

In [None]:
from gensim.models import Word2Vec
# vector_size: embedding dimension; window: context window size;
# min_count=1: the minimum frequency from the task, so no word is dropped for being rare.
model = Word2Vec(common_texts, vector_size=100, window=5, min_count=1)

<div class="task_description">
   <i class="subtask">5.10.3.</i> <i class="l2">L2</i> <br>
</div>

Display the vector for the word "computer".

<strong style="color: blue">Lösung</strong>

In [None]:
# Look up the trained embedding of 'computer'; as the last expression it is shown as the cell output.
model.wv['computer']

<div class="task_description">
   <i class="subtask">5.10.4.</i> <i class="l2">L2</i> <br>
</div>

Compute the similarity score between the words 'graph' and 'trees'.

<strong style="color: blue">Lösung</strong>

In [None]:
# Similarity score between the embeddings of 'graph' and 'trees'.
model.wv.similarity('graph', 'trees')

<div class="task_description">
   <i class="subtask">5.10.5.</i> <i class="l2">L2</i> <br>
</div>

Calculate the top 5 most similar words to the word 'graph'.

<strong style="color: blue">Lösung</strong>

In [None]:
# Ask for the five nearest neighbours of 'graph' directly via topn.
model.wv.most_similar('graph', topn=5)

## Working with pretrained google word2vec embeddings

<div class="task_description">
    <i class="task">Task 5.11:</i> <br>
</div>

The Google word2vec embeddings were trained on Google News data (about 100 billion words); they contain 3 million words and phrases and were fit using 300-dimensional word vectors.

Download the GoogleNews-vectors-negative300.bin.gz embeddings into your current working directory and unzip the file.

It is a 1.53 Gigabytes file. You can download it from here:

https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing

<div class="task_description">
   <i class="subtask">5.11.1.</i> <i class="l2">L2</i> <br>
</div>

Generate gensim word2vec model using google pretrained embeddings.

In [None]:
from gensim.models import KeyedVectors
# Unzipped embeddings file, expected in the current working directory.
filename = 'GoogleNews-vectors-negative300.bin'
# TODO: load the pretrained vectors from `filename` with KeyedVectors
google_word2vec_model = #TODO # load model

<strong style="color: blue">Lösung</strong>

In [None]:
from gensim.models import KeyedVectors
# Unzipped embeddings file, expected in the current working directory.
filename = 'GoogleNews-vectors-negative300.bin'
# binary=True because the downloaded .bin file is stored in word2vec's binary format.
google_word2vec_model = KeyedVectors.load_word2vec_format(filename, binary=True)

<div class="task_description">
   <i class="subtask">5.11.2.</i> <i class="l2">L2</i> <br>
</div>

Operation (king – man) + woman = ? can be performed as

In [None]:
# (king - man) + woman: 'woman' and 'king' are added, 'man' is subtracted; topn=1 returns only the best match.
google_word2vec_model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)

Calculate the (Germany – Berlin) + Moscow = ? operation.

<strong style="color: blue">Lösung</strong>

In [None]:
# (Germany - Berlin) + Moscow: 'Moscow' and 'Germany' are added, 'Berlin' is subtracted; topn=1 returns only the best match.
google_word2vec_model.most_similar(positive=['Moscow', 'Germany'], negative=['Berlin'], topn=1)

# Homework

<div class="task_description">
    <i class="task">Task 5.1:</i> <br>
</div>


<div class="task_description">
   <i class="subtask">5.1.1.</i> :::5 Homework points:::
</div>

Try K-means clustering with gensim word2vec embedding features instead of tf-idf features.

In [None]:
from gensim.models import Word2Vec #get gensim word2vec feature extraction utility  
affection_contexts_no_stopwords = [context.split(' ') for context in affection_contexts_no_stopwords]  

# Apply word2vec algorithm

w2v = #TODO

X_w2v = w2v[w2v.wv.vocab]  


model_w2v = KMeans(n_clusters=true_k) 

# Train kmeans algorithm
#TODO 


<div class="task_description">
   <i class="subtask">5.1.2.</i> :::5 Homework points:::
</div>

Try K-means clustering with gensim word2vec embedding features instead of tf-idf features. Evaluate the performance of the k-means algorithm using the silhouette metric for both the embedding and the tf-idf features.

In [None]:
from sklearn.metrics import silhouette_score

# Fixed seed and number of restarts: the silhouette scores are only comparable
# between runs if the clustering itself is reproducible.
model_w2v = KMeans(n_clusters=true_k, n_init=10, random_state=42)
model_w2v.fit(X_w2v)

# print Silhouette scores

# TODO: compute silhouette_score for the word2vec clustering and for the tf-idf clustering