In [1]:
# Import Packages
import os
import numpy as np
import tensorflow as tf
from nltk import RegexpTokenizer
from scipy.sparse import lil_matrix, coo_matrix, save_npz, load_npz
from scipy.sparse.linalg import svds
import time

In [2]:
# Generate co-occurence matrix.
def gen_coocurrence_matrix(data_file, voc2id_file, de2id_file):

    max_voc_id = 0
    with open(voc2id_file, 'r') as file:
        for line in file:
            voc_id = line.strip("\n").split("\t")[1]
            max_voc_id += 1
    max_de_id = 0
    with open(de2id_file, 'r') as file:
        for line in file:
            de_id = line.strip("\n").split("\t")[1]
            max_de_id += 1

    mat = np.zeros((max_de_id, max_voc_id))
    cnt = 0
    with open(data_file, 'r') as file:
        for line in file:
            cnt += 1
            if cnt % 1000000 == 0:
                print(cnt)
            info = line.strip('\n').split()
            split_ind = int(info[0]) + 2
            tokens = info[2:split_ind]
            deps = info[split_ind:]
            for d in deps:
                ind = d.split('|')
                mat[int(ind[2]), int(tokens[int(ind[1])])] += 1

    return mat

In [3]:
# Calculate PMI score for co-occurence matrix.
def calc_pmi_score(wc_matrix):
    
    '''
    Inputs:
    wc_matrix (scipy.lil_matrix): Word-context count matrix.
    
    Outputs:
    pmi (scipy.lil_matrix): Matrix with pointwise mutual information score for each word-context pair.
    '''
    
    pmi = np.zeros(wc_matrix.shape)
    sum_w = wc_matrix.sum(axis=1)
    sum_c = wc_matrix.sum(axis=0)
    total = wc_matrix.sum()
    np.seterr(divide='ignore', invalid='ignore')
    for i in range(pmi.shape[0]):
        print(f"\rCalculating PMI for row {i}", end = '')
        score = np.log((wc_matrix[i, :] * total) / ((sum_w[i] * sum_c) + 1e-5))
        pmi[i, :] = np.maximum(pmi[i, :], score)
    np.seterr(divide='warn', invalid='warn')
    print(f"\rCalculated PMI.", end = '')
    return pmi

In [4]:
# Generate word embeddings.
def gen_dep_embeddings(mat, k=300):
    if k > np.min(mat.shape):
        dim = np.min(mat.shape)
        U, S, _ = svds(mat, k=dim-1)
        w_temp = U.dot(np.diag(np.sqrt(S)))
        W = np.pad(w_temp, ((0, 0), (0, k - dim + 1)))
        return W
    else:
        U, S, _ = svds(mat, k=k)
        # W = U.dot(np.diag(np.sqrt(S)))
        return U

In [5]:
def save_word_embeddings(filepath, W, token_index):
    
    '''
    Inputs:
    filepath (str): Output filepath with extension.
    token_index (dict): Dictionary of {word: index} pairs.
    
    Outputs:
    (None)
    '''

    with open(filepath, "w") as file:
        for t in token_index:
            embedding = W[token_index[t]]
            # Write word.
            file.write(f"{t}")
            # Write embedding.
            [file.write(f" {e}") for e in embedding]
            file.write("\n")

In [6]:
def gen_random_embeddings(n, k):
    x = tf.ones(n,)
    lamb = 1e3
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Embedding(n, k, embeddings_initializer='uniform', input_shape=()))
    loss = lambda: -tf.reduce_mean(tf.linalg.matmul(tf.transpose(model(x)), model(x))) + \
                    lamb * tf.reduce_mean(tf.norm(model.trainable_weights[0], ord='euclidean', axis=1) - 1) ** 2
    var_list = lambda: model.trainable_weights 
    opt = tf.keras.optimizers.Adam()
    for i in range(1000):
        opt.minimize(loss, var_list)
    return model.trainable_weights[0].numpy()

In [8]:
t = time.time()

data_path = f"{os.getcwd()}/data"
data_file = f"{data_path}/data.txt"
voc2id_file = f"{data_path}/voc2id.txt"
de2id_file = f"{data_path}/de2id.txt"
dims = 300

voc2id = dict()
with open(voc2id_file, 'r') as file:
    for line in file:
        info = line.strip('\n').split("\t")
        voc2id[info[0]] = int(info[1])

print("1")
wc_mat= gen_coocurrence_matrix(data_file, voc2id_file, de2id_file)
print("2")

# PMI embedding.
pmi_mat = calc_pmi_score(wc_mat)
w = gen_dep_embeddings(pmi_mat)
emb = np.dot(wc_mat.T, w) / np.expand_dims(np.sum(wc_mat, axis=0) + 1e-05, axis=-1)
out_filepath = os.getcwd() + f"/embeddings/init_norm_emb.txt"
save_word_embeddings(out_filepath, emb, voc2id)

# Average embedding.
w = gen_random_embeddings(len(wc_mat), dims)
emb = np.dot(wc_mat.T, w) / np.expand_dims(np.sum(wc_mat, axis=0) + 1e-05, axis=-1)
out_filepath = os.getcwd() + f"/embeddings/init_avg_emb.txt"
save_word_embeddings(out_filepath, emb, voc2id)

# Random embedding.
emb = np.random.uniform(size=(len(voc2id), 300)) - 0.5
out_filepath = os.getcwd() + f"/embeddings/init_rand_emb.txt"
save_word_embeddings(out_filepath, emb, voc2id)

t = time.time() - t

Calculating PMI for row 0Calculating PMI for row 1Calculating PMI for row 2Calculating PMI for row 3Calculating PMI for row 4Calculating PMI for row 5Calculating PMI for row 6Calculating PMI for row 7Calculating PMI for row 8Calculating PMI for row 9Calculating PMI for row 10Calculating PMI for row 11Calculating PMI for row 12Calculating PMI for row 13Calculating PMI for row 14Calculating PMI for row 15Calculating PMI for row 16Calculating PMI for row 17Calculating PMI for row 18Calculating PMI for row 19Calculating PMI for row 20Calculating PMI for row 21Calculating PMI for row 22Calculating PMI for row 23Calculating PMI for row 24Calculating PMI for row 25Calculating PMI for row 26Calculating PMI for row 27Calculating PMI for row 28Calculating PMI for row 29Calculating PMI for row 30Calculating PMI for row 31Calculating PMI for row 32Calculating PMI for row 33Calculating PMI for row 34Calculating PMI for row 35Calculating PMI for row 36Calculatin

In [None]:
print(t)