In [103]:
import os
import re
import numpy as np
import nltk
import scipy
import json

from nltk import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from rich.progress import track
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import normalize

# Parameters
min_word_freq = 5
win_size = 3
embed_dim = 300

data_path = "/home/data-master/evancw/flatten_gigaword/tmp/" # data file in Orion
# data_path = "/home/jim/coding/Research/WordEmbed/dataset/"     # data file in WSL

In [2]:
from utils import *

In [3]:
from arnoldi import arnoldi_iteration

In [83]:
# Read file paths

data_files = os.listdir(data_path)
nyt_file_paths = [
    os.path.join(data_path, p) for p in data_files if p[:3] == "nyt"
]
print("# Data files:\t", len(nyt_file_paths))

# Data files:	 197


In [84]:
# Preprocess the .flat data
# Generate the corpus

def preprocess(file_path):
    with open(file_path, 'r') as text_file:
        data = text_file.read().replace('\n', '')
        sentences = []
        for sent in sent_tokenize(data):
            sentences.append(preprocess_sent(sent))
    return sentences


def preprocess_sent(sentence):
    sentence = str(sentence)
    sentence = sentence.lower()
    rem_num = re.sub('[0-9]+', '', sentence)
    tokenizer = RegexpTokenizer(r'(?u)\b\w\w+\b')
    tokens = tokenizer.tokenize(rem_num)
#     return " ".join(tokens)
    return tokens

if not os.path.exists("./tmp/"):
    os.makedirs("./tmp/", exist_ok=True)
    corpus = []
    for nyt_file_path in track(nyt_file_paths):
        corpus += preprocess(nyt_file_path)
    print("# sentences:\t", len(corpus))

In [85]:
# Construct vocabulary & frequency statics

def get_vocab(corpus: list):
    vocab_freq = defaultdict(lambda: 0)
    for sentence in track(corpus, description="Collecting words..."):
        for word in sentence:
            vocab_freq[word] += 1
    return vocab_freq


def get_freq(w: str):
    return vocab_freq[w]

if not os.path.exists('./tmp/nyt_vocab.json'):
    vocab_freq = get_vocab(corpus)
    vocab = [word for word in vocab_freq.keys() if
             get_freq(word) >= min_word_freq]
    vocab = {word:i for i, word in enumerate(vocab)}
    with open('./tmp/nyt_vocab.json', 'w') as f:
        json.dump(vocab, f, indent=6)
    with open('./tmp/nyt_vocab_freq.json', 'w') as f:
        json.dump(vocab_freq, f, indent=6)
else:
    with open('./tmp/nyt_vocab.json', 'r') as f:
        vocab = json.load(f)
    with open('./tmp/nyt_vocab_freq.json', 'r') as f:
        vocab_freq = json.load(f)

vocab_len = len(vocab)

print("# vocab:\t", vocab_len)

# vocab:	 450781


In [86]:
# constrain the length of vocab to be 300,000.
max_vocab = 300000
sorted_vocab = dict(sorted(vocab_freq.items(), key=lambda x: x[1], reverse=True))
freq_vocab_set = set(list(sorted_vocab)[:max_vocab])
freq_vocab = {x: vocab[x] for x in freq_vocab_set}
freq_vocab = dict(sorted(freq_vocab.items(), key=lambda x:x[1]))

# rearrange the new vocab (freq_vocab)
vocab_ = {k: i for i, k in enumerate(freq_vocab.keys())}
len(vocab_)

300000

In [88]:
# Construct co-occurence matrix
# Note: The comatrix contains all the vocab in the corpus. i.e. It does not neglect any Out of Vocabulary word. To constrain the vocabulary,
#       use the subset of the matrix M
from collections import defaultdict


def get_comatrix(corpus, win_size, word_dict):
    vocab_len = len(word_dict)
    coo = defaultdict(lambda: 0)
    for sent in track(corpus, description="Extracting Co-Occurrence Matrix...\t"):
        words = sent
        sent_len = len(words)
        for i in range(sent_len):
            word = words[i]
            try:
                word_idx = word_dict[word]
            except KeyError:
                continue
            win_left = max(0, i - win_size)
            win_right = min(sent_len, i + win_size)
            contexts = words[win_left:i] + words[i + 1:win_right]
            for context in contexts:
                try:
                    context_idx = word_dict[context]
                except KeyError:
                    continue
                coo[(word_idx, context_idx)] += 1
    coordinate = np.array(list(coo.keys())).T
    data = np.array(list(coo.values()))
    W = scipy.sparse.csr_matrix((data, coordinate), shape=(
        vocab_len, vocab_len), dtype=np.float32)
    return W

# construct&save/load Co-occurence Matrix
import scipy
if not os.path.exists("./tmp/nyt_M.npz"):
    M = get_comatrix(corpus, win_size, vocab)
    scipy.sparse.save_npz('./tmp/nyt_M.npz', M)
else:
    M = scipy.sparse.load_npz('./tmp/nyt_M.npz')

In [89]:
# Capture the comatrix w.r.t. the most frequent <max_vocab> words.
indices = np.array(list(freq_vocab.values()))
M_ = (M[indices].T)[indices].T
M_.shape

(300000, 300000)

In [104]:
# Construct word embeddings by Arnoldi iteration
if os.path.exists('./tmp/nyt_Q.npy'):
    Q = np.load('./tmp/nyt_Q.npy')
else:
    b = np.random.random(size=max_vocab)  # initial vector
    Q, h = arnoldi_iteration(M_, b, embed_dim)
    np.save('./tmp/nyt_Q.npy', Q)  # save Word embeddings


Output()

In [105]:
# Word similarity
ws_words = [
    "market",
    "company",
    "ltd",
    "president",
    "nomura",
    "jump",
    "rupee",
    "frog"
]

In [106]:
# Word similarity
inv_vocab = list(vocab_.keys())
for dim in [10, 20, 30, 50, 60, 70, 80, 90, 100, 200, 300]: 
    Q_ = Q[:,:dim]
    WE_ = normalize(Q_, axis=1, norm="l2")
    print("----",dim,"----")
    for word in ws_words:
        rank = word_similarity(WE_, vocab_, inv_vocab, w=word, n=5)
        print(word, get_freq(word))
        for k, v in rank.items():
            print(f"\t{k:<15}{v:.3f}")
    print("\n" * 3)

---- 10 ----
market 706500
	facts          0.993
	cowboys        0.992
	olympics       0.991
	subcontinent   0.990
	convention     0.987
company 1291283
	committee      0.984
	fed            0.968
	agency         0.966
	project        0.963
	headline       0.960
ltd 78276
	inc            0.997
	sa             0.995
	tsp            0.994
	chopped        0.994
	cholesterol    0.993
president 1222903
	dinosaur       0.961
	prophecy       0.956
	editor         0.955
	theatre        0.948
	interior       0.948
nomura 10083
	daiwa          0.998
	dillon         0.995
	olivia         0.994
	wellcare       0.993
	nikko          0.993
jump 59035
	crack          0.992
	spin           0.979
	shake          0.976
	kick           0.976
	rein           0.975
rupee 811
	renminbi       1.000
	schrocks       1.000
	icc            1.000
	nacoes         1.000
	buyback        1.000
frog 4792
	turtle         0.999
	hatchback      0.998
	orion          0.998
	playwright     0.998
	lao            0.998




-

---- 200 ----
market 706500
	buybacks       0.672
	repurchases    0.561
	buyback        0.551
	exchange       0.495
	pickers        0.485
company 1291283
	manufacturer   0.410
	maker          0.395
	retailer       0.388
	firm           0.364
	industry       0.359
ltd 78276
	sakura         0.608
	deutsche       0.578
	japan          0.574
	hypo           0.573
	hokkaido       0.573
president 1222903
	versa          0.607
	presdient      0.478
	cheney         0.441
	provost        0.431
	presidents     0.382
nomura 10083
	nikko          0.975
	yamaichi       0.975
	daiwa          0.958
	sanyo          0.953
	prudential     0.929
jump 59035
	ups            0.547
	rehab          0.485
	inauspicious   0.484
	wineline       0.474
	peer           0.468
rupee 811
	harrassment    0.915
	minetta        0.912
	asafa          0.912
	drachma        0.912
	bayamon        0.909
frog 4792
	raccoon        0.958
	jar            0.942
	mozart         0.939
	scented        0.939
	blades         0.938






In [107]:
!ls tmp/

nyt_M.npz  nyt_Q.npy  nyt_vocab_freq.json  nyt_vocab.json


In [108]:
Q.shape

(300000, 301)

In [101]:
! rm ./tmp/nyt_Q.npy