<a href="https://colab.research.google.com/github/virtualspark/YCNG232-NLP_Fundamentals/blob/main/Reflexion_Week_8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import string
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
# Reference: https://github.com/MihailSalnikov/tf-idf_and_k-means/blob/master/main.ipynb
# https://medium.com/@MSalnikov/text-clustering-with-k-means-and-tf-idf-f099bcf95183
# https://medium.com/mlearning-ai/text-clustering-with-tf-idf-in-python-c94cd26a31e7

In [3]:
all_text = """
Google and Facebook are strangling the free press to death. Democracy is the loser
Your 60-second guide to security stuff Google touted today at Next '18
A Guide to Using Android Without Selling Your Soul to Google
Review: Lenovo’s Google Smart Display is pretty and intelligent
Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is
Android is better than IOS
In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency
is a numerical statistic that is intended to reflect
how important a word is to a document in a collection or corpus.
It is often used as a weighting factor in searches of information retrieval
text mining, and user modeling. The tf-idf value increases proportionally
to the number of times a word appears in the document
and is offset by the frequency of the word in the corpus
""".split("\n")[1:-1]

In [4]:
all_text

['Google and Facebook are strangling the free press to death. Democracy is the loser',
 "Your 60-second guide to security stuff Google touted today at Next '18",
 'A Guide to Using Android Without Selling Your Soul to Google',
 'Review: Lenovo’s Google Smart Display is pretty and intelligent',
 'Google Maps user spots mysterious object submerged off the coast of Greece - and no-one knows what it is',
 'Android is better than IOS',
 'In information retrieval, tf–idf or TFIDF, short for term frequency–inverse document frequency',
 'is a numerical statistic that is intended to reflect',
 'how important a word is to a document in a collection or corpus.',
 'It is often used as a weighting factor in searches of information retrieval',
 'text mining, and user modeling. The tf-idf value increases proportionally',
 'to the number of times a word appears in the document',
 'and is offset by the frequency of the word in the corpus']

In [5]:
# Preprocessing and tokenizing
def preprocessing(line):
    line = line.lower()
    line = re.sub(r"[{}]".format(string.punctuation), " ", line)
    return line

TD-IDF

In [6]:
tfidf_vectorizer = TfidfVectorizer(preprocessor=preprocessing)

In [7]:
tfidf = tfidf_vectorizer.fit_transform(all_text)

In [8]:
kmeans = KMeans(n_clusters=2).fit(tfidf)

In [9]:
lines_for_predicting = ["Google and Facebook are strangling the free press to death. Democracy is the loser", "Your 60-second guide to security stuff Google touted today at Next '18"]
predicted = kmeans.predict(tfidf_vectorizer.transform(lines_for_predicting))

In [10]:
from sklearn.metrics import rand_score

In [11]:
rand_score(lines_for_predicting, predicted)

0.0

PMI

In [12]:
import nltk
from nltk.collocations import *
from nltk.tokenize import word_tokenize

In [13]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [14]:
text = "Google and Facebook are strangling the free press to death. Democracy is the loser Your 60-second guide to security stuff Google touted today at Next '18 A Guide to Using Android Without Selling Your Soul to Google"

In [15]:
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(word_tokenize(text))

In [16]:
for i in finder.score_ngrams(bigram_measures.pmi):
  print (i)

(("'18", 'A'), 5.247927513443585)
(('.', 'Democracy'), 5.247927513443585)
(('60-second', 'guide'), 5.247927513443585)
(('A', 'Guide'), 5.247927513443585)
(('Android', 'Without'), 5.247927513443585)
(('Democracy', 'is'), 5.247927513443585)
(('Facebook', 'are'), 5.247927513443585)
(('Next', "'18"), 5.247927513443585)
(('Using', 'Android'), 5.247927513443585)
(('Without', 'Selling'), 5.247927513443585)
(('and', 'Facebook'), 5.247927513443585)
(('are', 'strangling'), 5.247927513443585)
(('at', 'Next'), 5.247927513443585)
(('death', '.'), 5.247927513443585)
(('free', 'press'), 5.247927513443585)
(('security', 'stuff'), 5.247927513443585)
(('today', 'at'), 5.247927513443585)
(('touted', 'today'), 5.247927513443585)
(('Selling', 'Your'), 4.247927513443585)
(('Your', '60-second'), 4.247927513443585)
(('Your', 'Soul'), 4.247927513443585)
(('is', 'the'), 4.247927513443585)
(('loser', 'Your'), 4.247927513443585)
(('strangling', 'the'), 4.247927513443585)
(('the', 'free'), 4.247927513443585)
(('th

In [17]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(word_tokenize(text))

In [18]:
for i in finder.score_ngrams(trigram_measures.pmi):
  print (i)

(("'18", 'A', 'Guide'), 10.49585502688717)
(('.', 'Democracy', 'is'), 10.49585502688717)
(('Android', 'Without', 'Selling'), 10.49585502688717)
(('Facebook', 'are', 'strangling'), 10.49585502688717)
(('Next', "'18", 'A'), 10.49585502688717)
(('Using', 'Android', 'Without'), 10.49585502688717)
(('and', 'Facebook', 'are'), 10.49585502688717)
(('at', 'Next', "'18"), 10.49585502688717)
(('death', '.', 'Democracy'), 10.49585502688717)
(('today', 'at', 'Next'), 10.49585502688717)
(('touted', 'today', 'at'), 10.49585502688717)
(('Democracy', 'is', 'the'), 9.49585502688717)
(('Selling', 'Your', 'Soul'), 9.49585502688717)
(('Without', 'Selling', 'Your'), 9.49585502688717)
(('Your', '60-second', 'guide'), 9.49585502688717)
(('are', 'strangling', 'the'), 9.49585502688717)
(('is', 'the', 'loser'), 9.49585502688717)
(('loser', 'Your', '60-second'), 9.49585502688717)
(('strangling', 'the', 'free'), 9.49585502688717)
(('the', 'free', 'press'), 9.49585502688717)
(('Google', 'and', 'Facebook'), 8.91089

Skipgram embedding

In [19]:
# Reference: https://github.com/nickvdw/word2vec-from-scratch/blob/master/word2vec.ipynb

In [20]:
import numpy as np
import keras.backend as K
import tensorflow as tf
import operator
from tensorflow import keras
from keras.utils import np_utils

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd

In [21]:
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(all_text)

In [22]:
corpus = tokenizer.texts_to_sequences(all_text)
n_samples = sum(len(s) for s in corpus) # Total number of words in the corpus
V = len(tokenizer.word_index) + 1 # Total number of unique words in the corpus

In [23]:
n_samples, V

(152, 96)

In [24]:
# Example of how word to integer mapping looks like in the tokenizer
print(list((tokenizer.word_index.items()))[:5])

[('the', 1), ('is', 2), ('to', 3), ('a', 4), ('google', 5)]


In [25]:
# Parameters
window_size = 2 
window_size_corpus = 4

# Set numpy seed for reproducible results
np.random.seed(42)

In [26]:
# Prepare data for the skipgram model
def generate_data_skipgram(corpus, window_size, V):
    maxlen = window_size * 2
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            p = index - window_size
            n = index + window_size + 1

            in_words = []
            labels = []
            for i in range(p, n):
                if i != index and 0 <= i < L:
                    # Add the input word
                    all_in.append(word)
                    # Add one-hot of the context words
                    all_out.append(to_categorical(words[i], V))

    return (np.array(all_in), np.array(all_out))

In [27]:
# Create training data
X_skip, y_skip = generate_data_skipgram(corpus, window_size, V)
X_skip.shape, y_skip.shape

((530,), (530, 96))

In [28]:
# Create skipgram architecture
dims = [50, 150, 300]
skipgram_models = []

for dim in dims:
    # Initialize a Keras Sequential model
    skipgram = Sequential()

    # Add an Embedding layer
    skipgram.add(Embedding(input_dim=V, 
                           output_dim=dim, 
                           input_length=1, 
                           embeddings_initializer='glorot_uniform'))

    # Add a Reshape layer, which reshapes the output of the embedding layer (1,dim) to (dim,)
    skipgram.add(Reshape((dim, )))

    # Add a final Dense layer with the same size as in [1]
    skipgram.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

    # Compile the model with a suitable loss function and select an optimizer.
    # Optimizer Adagrad was used in paper
    skipgram.compile(optimizer=keras.optimizers.Adam(),
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    
    skipgram.summary()
    print("")
    skipgram_models.append(skipgram)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 50)             4800      
                                                                 
 reshape (Reshape)           (None, 50)                0         
                                                                 
 dense (Dense)               (None, 96)                4896      
                                                                 
Total params: 9,696
Trainable params: 9,696
Non-trainable params: 0
_________________________________________________________________

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1, 150)            14400     
                                                                 
 reshape_1 (Reshape)         (N

In [29]:
# Training the skipgram models
for skipgram in skipgram_models:
    skipgram.fit(X_skip, y_skip, batch_size=64, epochs=13, verbose=1)
    print("")

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13

Epoch 1/13
Epoch 2/13
Epoch 3/13
Epoch 4/13
Epoch 5/13
Epoch 6/13
Epoch 7/13
Epoch 8/13
Epoch 9/13
Epoch 10/13
Epoch 11/13
Epoch 12/13
Epoch 13/13



In [30]:
for skipgram in skipgram_models:
    # Save embeddings for vectors of length 50, 150 and 300 using skipgram model
    weights = skipgram.get_weights()

    # Get the embedding matrix
    embedding = weights[0]

    # Get word embeddings for each word in the vocabulary, write to file
    f = open(f"vectors_skipgram_{len(embedding[0])}.txt", "w")

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]

    # Start writing to the file, start with the column names
    f.write(" ".join(columns))

    # Start a new line
    f.write("\n")

    for word, i in tokenizer.word_index.items():
        f.write(word)
        f.write(" ")
        f.write(" ".join(map(str, list(embedding[i,:]))))
        f.write("\n")
    f.close()

In [31]:
weights

[array([[ 0.00062926, -0.02311563, -0.06231514, ..., -0.04720856,
          0.00294507, -0.05793893],
        [ 0.00037591,  0.06759916,  0.07356013, ..., -0.05931689,
         -0.05297806,  0.03453358],
        [-0.15343884,  0.15313508,  0.01244714, ..., -0.12911746,
          0.04510623, -0.01036217],
        ...,
        [-0.05680786, -0.04049027,  0.06472202, ...,  0.14333192,
         -0.01946976,  0.066402  ],
        [-0.03266605, -0.05107694, -0.08680277, ..., -0.04681404,
          0.01358245,  0.07512807],
        [-0.11235233, -0.12496331, -0.11610719, ..., -0.05547908,
          0.15627381,  0.14527903]], dtype=float32),
 array([[ 0.16008094, -0.10403235, -0.14997895, ..., -0.10233754,
         -0.02826031, -0.0930145 ],
        [-0.03626607,  0.01960105, -0.14457835, ..., -0.11606526,
         -0.12537567,  0.00186577],
        [-0.11205559, -0.04200958, -0.08245798, ..., -0.0937067 ,
         -0.01436139,  0.06061462],
        ...,
        [ 0.1519396 ,  0.03956344,  0.0