In [1]:
# import all necessary packages for CBOW
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import random
import os
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import collections
import itertools
import re
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# # Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# # print device name: get_device_name()
# print(torch.cuda.get_device_name(0))

In [3]:
# Load data from file and store list of sentences where sentences are list of words
class MakeSentences():
    def __init__(self, file_name):
        self.file_name = file_name
        self.sentences = self.read_file()

    def read_file(self):
        sentences = []
        with open(self.file_name, 'r') as f:
            i=0
            for line in f:
                sentences += ([x for x in line.strip().split('.') if x!=''])
                i+=1
                if i==25000:
                    break
        return sentences

In [None]:
sentences = MakeSentences('./wikitext-2-raw-v1/wikitext-2-raw/wiki.train.raw').sentences
print(len(sentences))
# for sentence in sentences:
#     print(type(sentence))

61719


### Preprocess

In [None]:
class Preprocess():
    def __init__(self, sentences):
        self.sentences = sentences

    def tokenize(self):
        self.sentences = [word_tokenize(sentence) for sentence in self.sentences]

    def lowercase(self):
        self.sentences = [[word.lower() for word in sentence] for sentence in self.sentences]

    def remove_stop_words(self):
        stop_words = set(stopwords.words('english'))
        self.sentences = [[word for word in sentence if word not in stop_words] for sentence in self.sentences]

    def stemmer(self):
        stemmer = nltk.stem.PorterStemmer()
        self.sentences = [[stemmer.stem(word) for word in sentence] for sentence in self.sentences]

    def remove_punctuation(self):
        self.sentences = [[word for word in sentence if word.isalpha()] for sentence in self.sentences]

    def remove_numbers(self):
        self.sentences = [[word for word in sentence if not word.isdigit()] for sentence in self.sentences]

    def remove_single_letter(self):
        self.sentences = [[word for word in sentence if len(word) > 1] for sentence in self.sentences]

    def remove_extra_spaces(self):
        self.sentences = [[word for word in sentence if word != ' '] for sentence in self.sentences]

    def remove_less_than_3(self):
        self.sentences = [[word for word in sentence if len(word) > 2] for sentence in self.sentences]



In [None]:
# preprocess
preprocess = Preprocess(sentences)
preprocess.tokenize()
# print(preprocess.sentences)
preprocess.lowercase()
preprocess.remove_stop_words()
# preprocess.stemmer()
preprocess.remove_punctuation()
preprocess.remove_numbers()
preprocess.remove_single_letter()
preprocess.remove_extra_spaces()
preprocess.remove_less_than_3()

print("Preprocessing done")
# print(preprocess.sentences)
sentences = preprocess.sentences
print(len(sentences))
# print(sentences)

Preprocessing done
61719


### Create word index mappings

In [None]:
# Flatten list of sentences into list of words
word_list = list(itertools.chain.from_iterable(sentences))
# print(word_list)

# Create a vocabulary of words
word_freq = Counter(word_list)

# Remove words that occur less than 5 times
vocab = set(word if word_freq[word] > 0 else '<unk>' for word in word_list)
# print(vocab)

# Add padding and unknown token to vocab
vocab.add('<pad>')
vocab.add('<unk>')
# Add start and end token to vocab
vocab.add('<start>')
vocab.add('<end>')

# Print length of vocab
print("Size of vocab: ", len(vocab))

# Create word to index and index to word mapping
word_to_idx = {word:idx for idx, word in enumerate(vocab)}
idx_to_word = {idx:word for idx, word in enumerate(vocab)}

# Print most common words
print("Most common words: ", word_freq.most_common(10))

Size of vocab:  49155
Most common words:  [('first', 2981), ('one', 2577), ('also', 2566), ('two', 2500), ('new', 1863), ('time', 1770), ('would', 1514), ('game', 1421), ('three', 1356), ('later', 1235)]


In [None]:
# create a co-oocurence matrix

co_occurence_matrix = np.zeros((len(vocab), len(vocab)))
window_size = 2

for sentence in sentences:
    # add start and end token to sentence
    sentence = ['<start>'] + sentence + ['<end>']
    for idx, word in enumerate(sentence):
        if word in vocab:
            # print(max(idx - window_size, 0), min(idx + window_size, len(sentence)) + 1)
            for neighbor in sentence[max(idx - window_size, 0) : min(idx + window_size, len(sentence)) + 1]:
                if neighbor != word and neighbor in vocab:
                    # print(word, neighbor)
                    co_occurence_matrix[word_to_idx[word]][word_to_idx[neighbor]] +=  1
                    # print(co_occurence_matrix[word_to_idx[word]][word_to_idx[neighbor]])
                    # co_occurence_matrix[word_to_idx[neighbor]][word_to_idx[word]] += 1


In [None]:
# Check co-occurence matrix
print(co_occurence_matrix[word_to_idx['valkyria']][word_to_idx['iii']])
print(co_occurence_matrix.shape)

15.0
(49155, 49155)


In [None]:
# Calculate embedding_dimension using SVD
U, S, V = np.linalg.svd(co_occurence_matrix)

# Check shape of U, S, V
# print(U.shape)
# print(S.shape)
# print(V.shape)

# Get variation to be 99%
var = 0.90
total_var = np.sum(S)
print(total_var)
var_sum = 0
for i in range(len(S)):
    var_sum += S[i]
    if var_sum/total_var >= var:
        break

# Get reduced dimension
dim = i + 1
print("Reduced dimension: ", dim)

# Get reduced U, S, V
U = U[:, :dim]
print(U.shape)


nan
Reduced dimension:  49155
(49155, 49155)


init_gesdd failed init


In [None]:
# Fit SVD
embedding_dim = 300

svd = TruncatedSVD(n_components=embedding_dim, n_iter=25, random_state=12, tol = 0.0, algorithm='arpack')
final_matrix = svd.fit_transform(co_occurence_matrix)

In [None]:
# Save final_matrix
pickle.dump(final_matrix, open('./partA_pth/final_matrix.pkl', 'wb'))

In [None]:
# Load final_matrix
final_matrix = pickle.load(open('./partA_pth/final_matrix.pkl', 'rb'))

In [None]:
# Write a function to get top k closest words to a given word
def get_closest_k_words(word, k, final_matrix, word_to_idx, idx_to_word):
    word_idx = word_to_idx[word]
    word_vector = final_matrix[word_idx]
    similarity_scores = cosine_similarity([word_vector], final_matrix)
    sorted_idxs = np.argsort(similarity_scores[0])[::-1]
    closest_idxs = sorted_idxs[:k]
    closest_words = [idx_to_word[idx] for idx in closest_idxs]
    return closest_words

In [None]:
# Check top 10 closest words to a given word
print(get_closest_k_words('woman', 10, final_matrix, word_to_idx, idx_to_word))

['woman', 'child', 'wife', 'marriage', 'baby', 'person', 'something', 'reala', 'gift', 'good']


### TSNE visualization

In [None]:
# Get the required indices
word_list = ['woman', 'wife', 'film', 'sex', 'politics']

all_words = set(word_list)

In [None]:
# Function to print top k closest words to words in word_list
def print_closest_k_words(word_list, k, final_matrix, word_to_idx, idx_to_word):
    for word in word_list:
        closest_words = get_closest_k_words(word, k, final_matrix, word_to_idx, idx_to_word)
        print("Closest words to ", word, ": ", closest_words)

In [None]:
# Call print_closest_k_words
print_closest_k_words(word_list, 10, final_matrix, word_to_idx, idx_to_word)

Closest words to  woman :  ['woman', 'child', 'wife', 'marriage', 'baby', 'person', 'something', 'reala', 'gift', 'good']
Closest words to  wife :  ['wife', 'marriage', 'child', 'mother', 'floor', 'baby', 'murder', 'nabucco', 'children', 'time']
Closest words to  film :  ['film', 'novel', 'david', 'spisevognselskap', 'instead', 'writer', 'today', 'sanger', 'larry', 'media']
Closest words to  sex :  ['sex', 'heart', 'religion', 'respectively', 'homosexuality', 'spp', 'kissing', 'loyalty', 'seamus', 'internet']
Closest words to  politics :  ['politics', 'background', 'legacy', 'sbcl', 'mizuta', 'ariga', 'deceiving', 'lamaceratops', 'disproportionates', 'pyrolysis']
