In [1]:
from __future__ import division

import codecs
from numbers import Number

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from sklearn.neighbors import NearestNeighbors
from scipy.spatial.distance import mahalanobis
import timeit
import scipy as sp
import matplotlib
matplotlib.use('Agg')
import seaborn as sns
import matplotlib.pyplot as plt
import os
import scipy.spatial.distance as sci_dist
import timeit
import warnings
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from pyjarowinkler import distance
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import CountVectorizer
from time import time

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer

# Root directory under which every output directory of this notebook is created.
MAIN_PATH='.'
# Sub-directory of MAIN_PATH that groups the outputs produced with this embedding.
EMBEDDING_RESULTS = 'google'
# Directory receiving the per-dataset result files (result_topic_*.txt, matrix_w.txt).
PATH_TO_SAVE_RESULTS = '{}/{}/results'.format(MAIN_PATH, EMBEDDING_RESULTS)
# Directory receiving the embedding file filtered down to the dataset vocabulary.
PATH_TO_SAVE_MODEL = '{}/{}/datasets/gn_w2v_models'.format(MAIN_PATH, EMBEDDING_RESULTS)
# Pre-trained embedding file loaded with KeyedVectors.load_word2vec_format.
EMBEDDINGS_FILE_PATH = '{}/GoogleNews-vectors-negative300.bin'.format(MAIN_PATH)
# Passed as `binary=` when loading EMBEDDINGS_FILE_PATH (True = binary word2vec format).
EMBEDDINGS_BIN_TYPE = True
# Dataset name; used as the file stem of the filtered embedding and of the result directory.
DATASET = 'all_titles'
# Text file with one document per line.
DATASET_PATH = '../data/all_titles.txt'
# Forwarded as `n_jobs` to the nearest-neighbour search.
N_THREADS = 15
# Number of NMF components (topics).
N_TOPICS = 20

## Create directories

In [2]:
# Create the output tree. The previous chain of os.mkdir calls inside a single
# try block stopped at the first directory that already existed, so the deeper
# directories were silently never created on a partially-initialised tree.
# os.makedirs creates all missing parents and exist_ok=True makes the cell
# safe to re-run.
os.makedirs('{}/{}/results'.format(MAIN_PATH, EMBEDDING_RESULTS), exist_ok=True)
os.makedirs('{}/{}/datasets/gn_w2v_models'.format(MAIN_PATH, EMBEDDING_RESULTS), exist_ok=True)

## Create the word2vec model for the dataset

In [3]:
#read dataset
def read_raw_dataset(dataset):
    """Read a dataset file and return its lines as a list of documents.

    Parameters
    ----------
    dataset : str
        Path to a text file containing one document per line.

    Returns
    -------
    list of str
        One entry per line, with trailing whitespace (including the newline)
        stripped.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(dataset, 'r') as arq:
        return [line.rstrip() for line in arq]

In [4]:
#read embedding
def read_embedding(embedding_file_path, binary):
    """Load word vectors stored in word2vec format and report the load time.

    Parameters
    ----------
    embedding_file_path : str
        Path to the embedding file.
    binary : bool
        Forwarded to ``KeyedVectors.load_word2vec_format`` as ``binary``.

    Returns
    -------
    The object returned by ``KeyedVectors.load_word2vec_format``.
    """
    started_at = time()
    keyed_vectors = KeyedVectors.load_word2vec_format(embedding_file_path, binary=binary)
    elapsed = time() - started_at
    print('Embedding model read in %0.3fs.' % elapsed)
    return keyed_vectors

In [5]:
def create_embedding_models(dataset_path = DATASET_PATH, dataset=DATASET,
                            embedding_file_path=EMBEDDINGS_FILE_PATH,
                            embedding_type=EMBEDDINGS_BIN_TYPE,
                            path_to_save_model=PATH_TO_SAVE_MODEL):
    """Filter a pre-trained embedding down to the vocabulary of a dataset.

    The dataset vocabulary is extracted with ``CountVectorizer``; every word
    that also exists in the embedding is written, with its vector, to
    ``<path_to_save_model>/<dataset>.txt`` in word2vec text format (a header
    line ``<n_words> <dimension>`` followed by one ``word v1 v2 ...`` line per
    word).

    Parameters
    ----------
    dataset_path : str
        Text file with one document per line.
    dataset : str
        Dataset name, used as the stem of the output file.
    embedding_file_path : str
        Pre-trained embedding in word2vec format.
    embedding_type : bool
        Whether the pre-trained embedding file is binary.
    path_to_save_model : str
        Directory in which the filtered embedding is written (created if
        missing).

    Returns
    -------
    int
        Number of dataset words found in the embedding.
    """
    documents = read_raw_dataset(dataset_path)

    # Vocabulary of the dataset.
    dataset_cv = CountVectorizer().fit(documents)
    # get_feature_names() was replaced by get_feature_names_out() in recent
    # scikit-learn releases; support both so the notebook runs on either.
    if hasattr(dataset_cv, 'get_feature_names_out'):
        dataset_words = dataset_cv.get_feature_names_out()
    else:
        dataset_words = dataset_cv.get_feature_names()

    # Keep only the dataset words that exist in the pre-trained embedding.
    model = read_embedding(embedding_file_path, embedding_type)
    words_values = []
    for word in dataset_words:
        try:
            vector = model[word]
        except KeyError:
            # Word unknown to the embedding: it cannot become a cluword.
            continue
        words_values.append(word + ' ' + ''.join(str(k) + ' ' for k in vector))

    n_words = len(words_values)  # Number of words selected

    print('{}:{}'.format(dataset, n_words))

    # Use the real dimensionality of the embedding for the header instead of
    # assuming 300; fall back to 300 if the model does not expose it.
    vector_size = getattr(model, 'vector_size', 300)

    # Save the filtered model as text. makedirs replaces a shell `mkdir -p`
    # (portable, no shell interpolation of the path), and the context manager
    # guarantees the file is flushed and closed before it is read back.
    # utf-8 matches the encoding used when this file is read back.
    os.makedirs(path_to_save_model, exist_ok=True)
    output_path = """{}/{}.txt""".format(path_to_save_model, dataset)
    with open(output_path, 'w', encoding='utf-8') as file:
        file.write('{0} {1}\n'.format(n_words, vector_size))
        for word_vec in words_values:
            file.write("%s\n" % word_vec)

    return n_words


In [None]:
# Restrict the pre-trained embedding to the dataset vocabulary; the number of
# retained words is reused by the following cells as `n_words`.
print('Filter embedding space to {} dataset...'.format(DATASET_PATH))
n_words = create_embedding_models(
    dataset=DATASET,
    embedding_file_path=EMBEDDINGS_FILE_PATH,
    embedding_type=EMBEDDINGS_BIN_TYPE,
    path_to_save_model=PATH_TO_SAVE_MODEL,
)

Filter embedding space to ../data/all_titles.txt dataset...


## Alfa Knn

In [None]:
def save_cluwords(labels_array, n_words, k_neighbors, distances, indices, threshold,
                  output_file='cluwords.npz'):
    """Build the word-by-word similarity matrix and save it as a compressed ``.npz``.

    For each word ``p`` and each of its neighbours ``k = indices[p][i]`` the
    similarity ``1 - distances[p][i]`` (rounded to two decimals) is stored at
    ``[p][k]``. When ``threshold`` is truthy, similarities below it are stored
    as ``0.0``.

    Parameters
    ----------
    labels_array : sequence of str
        Word labels; saved under both the ``index`` and ``cluwords`` keys.
    n_words : int
        Number of words; the matrix is ``n_words x n_words`` (float16).
    k_neighbors : int
        Unused here; kept for interface compatibility.
    distances, indices : 2-D array-like
        Neighbour distances and neighbour indices, one row per word.
    threshold : float or None
        Minimum similarity to keep; a falsy value disables the filter.
    output_file : str, optional
        Destination file. Defaults to ``'cluwords.npz'`` in the current
        working directory, which is the file ``create_cluwords`` loads.
    """
    list_cluwords = np.zeros((n_words, n_words), dtype=np.float16)

    for p in range(n_words):
        for i, k in enumerate(indices[p]):
            similarity = 1 - distances[p][i]
            # Written as `not >=` (rather than `<`) so that a NaN similarity
            # is zeroed when filtering, as before.
            if threshold and not similarity >= threshold:
                list_cluwords[p][k] = 0.0
            else:
                list_cluwords[p][k] = round(similarity, 2)

    np.savez_compressed(output_file,
                        data=list_cluwords,
                        index=np.asarray(labels_array),
                        cluwords=np.asarray(labels_array))

In [None]:
def build_word_vector_matrix(vector_file, n_words):
    """Read word vectors from a word2vec-style text file.

    The first line of the file (the header) is skipped. Every following
    non-blank line is expected to be ``word v1 v2 ...``.

    Parameters
    ----------
    vector_file : str
        Path to the utf-8 encoded vector file.
    n_words : int or None
        Maximum number of words to read; ``None`` reads the whole file.

    Returns
    -------
    (numpy.ndarray, list of str)
        The vectors (one row per word) and the matching word labels.
    """
    numpy_arrays = []
    labels_array = []

    with codecs.open(vector_file, 'r', 'utf-8') as f:
        next(f, None)  # Skip the header line (tolerates an empty file)

        for row in f:
            # Stop once exactly n_words rows were read. The previous check
            # (`c == n_words` on a 0-based counter, after appending) returned
            # n_words + 1 rows when the file held more than n_words entries.
            if n_words is not None and len(labels_array) >= n_words:
                break

            fields = row.split()
            if not fields:
                continue  # blank line: nothing to parse

            labels_array.append(fields[0])
            numpy_arrays.append(np.array([float(value) for value in fields[1:]]))

    return np.array(numpy_arrays), labels_array

In [None]:
def create_cosine_cluwords(input_vector_file, n_words, k_neighbors, threshold, n_jobs):
    """Compute cosine nearest neighbours of every word and save the cluwords matrix.

    Parameters
    ----------
    input_vector_file : str
        Word2vec-style text file read by ``build_word_vector_matrix``.
    n_words : int
        Maximum number of words to read from the file.
    k_neighbors : int
        Number of neighbours per word; capped at the number of words actually
        loaded, because a larger value makes the neighbour query fail.
    threshold : float or None
        Similarity filter forwarded to ``save_cluwords``.
    n_jobs : int
        Parallelism forwarded to ``NearestNeighbors``.
    """
    df, labels_array = build_word_vector_matrix(input_vector_file, n_words)

    # Size everything from the rows that were really loaded so the similarity
    # matrix, the labels and the neighbour indices always agree.
    n_loaded = len(labels_array)
    k_neighbors = min(k_neighbors, n_loaded)

    print('NearestNeighbors K={}'.format(k_neighbors))
    start = timeit.default_timer()
    nbrs = NearestNeighbors(n_neighbors=k_neighbors, algorithm='auto',
                            metric='cosine', n_jobs=n_jobs).fit(df)
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    print('NN Distaces')
    start = timeit.default_timer()
    distances, indices = nbrs.kneighbors(df)
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    print('Saving cluwords')
    save_cluwords(labels_array, n_loaded, k_neighbors, distances, indices, threshold)

## Build Cluwords

In [None]:
def read_input(dataset_file_path):
    """Read a dataset file and return its documents together with their count.

    Parameters
    ----------
    dataset_file_path : str
        Path to a text file containing one document per line.

    Returns
    -------
    (list of str, int)
        The lines with trailing whitespace stripped, and the number of lines.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(dataset_file_path, 'r') as arq:
        documents = [line.rstrip() for line in arq]

    return documents, len(documents)

In [None]:
def cluwords_dataframe(embedding_file_path, n_words, k_neighbors, threshold=.85, n_jobs=1, verbose=0):
    """Run the cosine kNN step that produces the cluwords similarity matrix.

    Thin wrapper around ``create_cosine_cluwords``; ``verbose`` is accepted
    but not used.
    """
    print('kNN...')
    knn_arguments = {
        'input_vector_file': embedding_file_path,
        'n_words': n_words,
        'k_neighbors': k_neighbors,
        'threshold': threshold,
        'n_jobs': n_jobs,
    }
    create_cosine_cluwords(**knn_arguments)

In [None]:
def create_cluwords(dataset_file_path, n_words, path_to_save_cluwords,
        cossine_filter=1.0):
    """Load the saved cluwords matrix and the dataset documents.

    Parameters
    ----------
    dataset_file_path : str
        Text file with one document per line.
    n_words : int
        Unused here; kept for interface compatibility.
    path_to_save_cluwords : str
        Unused here; kept for interface compatibility.
    cossine_filter : float
        Only reported in the printed output.

    Returns
    -------
    tuple
        ``(documents, n_documents, vocab, vocab_cluwords, cluwords_data)``
        where the last three come from the ``index``, ``cluwords`` and
        ``data`` entries of ``cluwords.npz``.
    """
    # The context manager closes the archive deterministically; the arrays are
    # materialised on access, so they stay valid after the archive is closed.
    with np.load('cluwords.npz') as loaded:
        vocab = loaded['index']
        vocab_cluwords = loaded['cluwords']
        cluwords_data = loaded['data']

    print('Matrix{}'.format(cluwords_data.shape))
    print('\nCosine Filter: {}'.format(cossine_filter))

    documents, n_documents = read_input(dataset_file_path)
    return documents, n_documents, vocab, vocab_cluwords, cluwords_data

In [None]:
def raw_tf(documents, n_words, vocab, binary=False, dtype=np.float32):
    """Return the document-term count matrix of ``documents`` over ``vocab``.

    ``dtype`` is accepted but not forwarded to the vectorizer, so the result
    keeps ``CountVectorizer``'s own dtype.
    """
    vectorizer_options = {
        'max_features': n_words,
        'binary': binary,
        'vocabulary': vocab,
    }
    return CountVectorizer(**vectorizer_options).fit_transform(documents)

In [None]:
def cluwords_tf(documents, n_words, vocab, vocab_cluwords, cluwords_data, binary):
    """Compute the cluwords term-frequency matrix.

    The raw document-term matrix is multiplied by the transposed cluwords
    similarity matrix, so each column holds the similarity-weighted frequency
    of one cluword.

    Parameters
    ----------
    documents : list of str
    n_words : int
    vocab : array-like of str
        Vocabulary used to build the raw term-frequency matrix.
    vocab_cluwords : array-like of str
        Cluword labels; only its length is used (number of rows taken from
        ``cluwords_data``).
    cluwords_data : array-like
        Word-by-word similarity matrix.
    binary : bool
        Forwarded to ``raw_tf``.

    Returns
    -------
    Sparse matrix, one row per document and one column per cluword.
    """
    start = timeit.default_timer()
    tf = raw_tf(documents, n_words, vocab, binary)

    print('tf shape {}'.format(tf.shape))

    # Take the first len(vocab_cluwords) rows in one step instead of copying
    # row by row; values go through float16 before float32, as before.
    n_cluwords = len(vocab_cluwords)
    hyp_aux = np.asarray(cluwords_data[:n_cluwords], dtype=np.float16).astype(np.float32)
    hyp_aux = csr_matrix(hyp_aux, shape=hyp_aux.shape, dtype=np.float32)

    # Single sparse product. The earlier np.dot(...) call computed the same
    # product and was immediately overwritten, doubling the cost.
    cluwords_tf_idf = tf.dot(hyp_aux.transpose())

    end = timeit.default_timer()
    print("Cluwords TF done in %0.3fs." % (end - start))
    return cluwords_tf_idf

In [None]:
def cluwords_idf(documents, n_documents, n_words, vocab, vocab_cluwords, cluwords_data):
    """Compute the IDF weight of every cluword.

    For each document and cluword, the mean similarity of the cluword's
    members present in the document is computed (``mu_hyp``); these means are
    summed over documents and the IDF is ``log10(n_documents / sum)``.

    The previous body could not run: it called ``raw_tf`` with an undefined
    ``binary`` name and read ``hyp_aux`` before assigning it. Both inputs are
    now built from the function's own arguments.

    Parameters
    ----------
    documents : list of str
    n_documents : int
    n_words : int
    vocab : array-like of str
    vocab_cluwords : array-like of str
        Only its length is used (number of rows taken from ``cluwords_data``).
    cluwords_data : array-like
        Word-by-word similarity matrix.

    Returns
    -------
    numpy.ndarray
        One IDF value per cluword. A cluword that never occurs yields ``inf``
        (division by zero), as in the original formula.
    """
    start = timeit.default_timer()
    print('Read data')
    # Binary presence/absence of each vocabulary word in each document.
    tf = raw_tf(documents, n_words, vocab, binary=True, dtype=np.float32)
    hyp_aux = np.asarray(cluwords_data[:len(vocab_cluwords)], dtype=np.float32)

    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('Dot tf and hyp_aux')
    # n_documents x n_cluwords: summed similarity of the members present.
    _dot = np.asarray(tf.dot(hyp_aux.T))
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('Divide hyp_aux by itself')
    # 1 where a similarity is non-zero, 0 elsewhere (same result as x / x with
    # NaNs mapped to 0, without the division warnings).
    bin_hyp_aux = (hyp_aux != 0).astype(np.float32)
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('Dot tf and bin hyp_aux')
    # n_documents x n_cluwords: number of members present.
    _dot_bin = np.asarray(tf.dot(bin_hyp_aux.T))

    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('Divide _dot and _dot_bin')
    # 0 / 0 (cluword absent from the document) is expected and mapped to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        mu_hyp = np.nan_to_num(np.divide(_dot, _dot_bin))
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('Sum')
    idf = np.sum(mu_hyp, axis=0)
    end = timeit.default_timer()
    print('Time {}'.format(end - start))

    start = timeit.default_timer()
    print('log')
    idf = np.log10(np.divide(n_documents, idf))
    end = timeit.default_timer()
    print('Time {}'.format(end - start))
    return idf

In [None]:
def cluwords_fit_transform(documents, n_documents, n_words, vocab, vocab_cluwords, cluwords_data, binary):
    """Compute the cluwords representation of the documents.

    Only the TF part is computed (via ``cluwords_tf``); the IDF step is
    currently disabled.

    Returns
    -------
    tuple
        ``(cluwords_tf_idf, n_cluwords)`` where ``n_cluwords`` is
        ``len(vocab_cluwords)``.
    """
    # The number of cluwords is the size of the cluwords vocabulary.
    n_cluwords = len(vocab_cluwords)
    print('Number of cluwords {}'.format(n_cluwords))
    print('Matrix{}'.format(cluwords_data.shape))

    print('\nComputing TF...')
    cluwords_tf_idf = cluwords_tf(
        documents, n_words, vocab, vocab_cluwords, cluwords_data, binary)
    # IDF weighting is intentionally left out here:
    #   cluwords_idf(documents, n_documents, n_words, vocab, vocab_cluwords, cluwords_data)

    print(cluwords_tf_idf.shape)
    return cluwords_tf_idf, n_cluwords

## Build Topics

In [None]:
def top_words(model, feature_names, n_top_words):
    """Return the highest-weighted words of every topic of a fitted model.

    Parameters
    ----------
    model : object
        Must expose ``components_``, an iterable of 1-D arrays with one weight
        per feature.
    feature_names : sequence of str
        Feature labels, indexed like the columns of ``components_``.
    n_top_words : int
        Number of words to keep per topic.

    Returns
    -------
    list of str
        One space-joined string per topic, words ordered by decreasing weight.
    """
    topics = []
    for topic in model.components_:
        # argsort is ascending; the reversed slice takes the n_top_words
        # largest weights in decreasing order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        topics.append(' '.join(feature_names[i] for i in top_indices))

    return topics

In [None]:
def parse_topics(topics):
    """Split every space-joined topic string into its list of words."""
    return [topic_string.split(' ') for topic_string in topics]

In [None]:
def nearest_neighbors(X_topic, X_raw, vocab, n_topics, dataset):
    """Write, for every document, its normalised similarity to each topic.

    Documents are vectorised as binary term vectors over ``vocab`` and matched
    against the topic vectors with a cosine nearest-neighbour search. One line
    per document is written to ``document_distribution_<dataset>``:
    ``<doc>  <topic>:<share> ...`` where the shares are the similarities
    ``1 - distance`` divided by their sum (0.0 when the sum is not positive).

    Parameters
    ----------
    X_topic : array-like
        One vector per topic, over ``vocab``.
    X_raw : list of str
        Raw documents.
    vocab : sequence of str
    n_topics : int
        Number of neighbours (topics) retrieved per document.
    dataset : str
        Suffix of the output file name.
    """
    # Previously called an undefined `_raw_tf`; `raw_tf` is the vectoriser
    # defined in this notebook.
    X = raw_tf(X_raw, len(vocab), vocab, binary=True)
    neigh = NearestNeighbors(n_neighbors=n_topics, algorithm='auto', metric='cosine')
    neigh.fit(X_topic)
    dist, ind = neigh.kneighbors(X)

    # Neighbour indices refer to rows of X_topic, so the per-document vector
    # is sized by the number of topic vectors.
    n_topic_vectors = np.shape(X_topic)[0]

    with open('document_distribution_{}'.format(dataset), 'w') as output:
        for doc in range(dist.shape[0]):
            topic_dist = np.zeros(n_topic_vectors)
            # ind[doc][j] is the topic reached at distance dist[doc][j]. The
            # old code indexed `ind` by neighbour rank only and `dist` by topic
            # id, mixing up documents and topics.
            topic_dist[ind[doc]] = 1. - dist[doc]

            total_dist = np.sum(topic_dist)
            output.write('{} '.format(doc))
            for index in range(n_topic_vectors):
                output.write(' {}:{}'.format(index, round(topic_dist[index]/total_dist if total_dist > 0 else .0, 4)))

            output.write('\n')

In [None]:
def remove_redundant_words(topics):
    """Drop near-duplicate words from every topic.

    Words are scanned in order; a word is kept unless an earlier kept word has
    a Jaro-Winkler similarity above 0.75 with it.

    Parameters
    ----------
    topics : list of list of str

    Returns
    -------
    list of list of str
        The filtered topics, in the original word order.
    """
    topics_t = []
    for topic in topics:
        filtered_topic = []
        keep = np.ones(len(topic), dtype=bool)
        # Iterate over every word. The previous bound `len(topic) - 1` never
        # visited the last word, so it was dropped even when it was unique.
        for w_i in range(len(topic)):
            if not keep[w_i]:
                continue

            filtered_topic.append(topic[w_i])
            for w_j in range(w_i + 1, len(topic)):
                if distance.get_jaro_distance(topic[w_i], topic[w_j], winkler=True, scaling=0.1) > 0.75:
                    keep[w_j] = False

        topics_t.append(filtered_topic)

    return topics_t

### Metrics

In [None]:
def coherence(topic, word_frequency, term_docs):
    """Compute the coherence score of every topic.

    For each ordered pair of words ``(w_j, w_i)`` with ``j < i`` the score
    accumulates ``log((|docs(w_j) & docs(w_i)| + 1) / freq(w_j))``.

    Parameters
    ----------
    topic : list
        Topics, each given either as a space-joined string or as a sequence of
        words (the form ``print_results`` passes).
    word_frequency : dict
        Maps a word to its frequency.
    term_docs : dict
        Maps a word to the set of documents containing it.

    Returns
    -------
    list of float
        One score per topic.
    """
    scores = []

    for topico in topic:
        # Accept both representations used in this notebook.
        top_w = topico.split(" ") if isinstance(topico, str) else list(topico)

        coherence_t = 0.0
        for i in range(1, len(top_w)):
            for j in range(0, i):
                cont_wi = word_frequency[top_w[j]]
                cont_wi_wj = float(
                    len(term_docs[top_w[j]].intersection(term_docs[top_w[i]])))
                coherence_t += np.log((cont_wi_wj + 1.0) / cont_wi)

        scores.append(coherence_t)

    return scores

In [None]:
def pmi(topics, word_frequency, term_docs, n_docs, n_top_words):
    """Compute the PMI and NPMI score of every topic.

    Parameters
    ----------
    topics : list of list of str
        Topics as lists of words.
    word_frequency : dict
        Maps a word to its frequency.
    term_docs : dict
        Maps a word to the set of documents containing it.
    n_docs : number
        Normaliser used to turn counts into probabilities.
    n_top_words : number
        Nominal number of words per topic; the PMI weight is
        ``1 / (n_top_words * (n_top_words - 1))``.

    Returns
    -------
    (list of float, list of float)
        PMI and NPMI, one value per topic. A topic with fewer than two words
        has no word pair and scores ``0.0`` for both (this used to raise
        ``ZeroDivisionError``).
    """
    pmi_scores = []
    npmi_scores = []

    n_top_words = float(n_top_words)
    n_docs = float(n_docs)

    for top_w in topics:
        pmi_t = 0.0
        npmi_t = 0.0

        for j in range(1, len(top_w)):
            for i in range(0, j):
                ti = top_w[i]
                tj = top_w[j]

                c_i = word_frequency[ti]
                c_j = word_frequency[tj]
                c_i_and_j = len(term_docs[ti].intersection(term_docs[tj]))

                # log( P(i, j) / (P(i) * P(j)) ), with add-one smoothing on
                # the joint count.
                pmi_t += np.log(((c_i_and_j + 1.0) / n_docs) /
                                ((c_i * c_j) / n_docs ** 2))

                # -log P(i, j): the NPMI normaliser.
                npmi_t += -1.0 * np.log((c_i_and_j + 0.01) / n_docs)

        peso = 1.0 / (n_top_words * (n_top_words - 1.0))

        pmi_scores.append(peso * pmi_t)
        # No word pair means a zero normaliser; report 0.0 instead of dividing.
        npmi_scores.append(pmi_t / npmi_t if npmi_t != 0 else 0.0)

    return pmi_scores, npmi_scores

In [None]:
def w2v_metric(topics, t, path_to_save_model, distance_type, dataset, embedding_type=False):
        """Compute an embedding-based distance score for every topic.

        Loads the word vectors stored in ``<path_to_save_model>/<dataset>.txt``
        and calls ``Evaluation._calc_dist_2`` once per topic.

        Parameters
        ----------
        topics : iterable of str
            Topics as space-joined strings (each one is split on ``' '``).
        t : forwarded unchanged to ``Evaluation._calc_dist_2``.
        path_to_save_model : str
            Directory holding the word2vec-format file.
        distance_type : forwarded unchanged to ``Evaluation._calc_dist_2``.
        dataset : str
            File stem of the word2vec-format file.
        embedding_type : bool
            Passed as ``binary`` when loading the vectors.

        Returns
        -------
        list
            One value per topic, as returned by ``Evaluation._calc_dist_2``.
        """
        word_vectors = KeyedVectors.load_word2vec_format(fname='{}/{}.txt'.format(path_to_save_model, dataset), binary=embedding_type)
        # NOTE(review): `.wv` on a KeyedVectors instance appears to be a legacy
        # gensim accessor -- confirm it exists in the installed gensim version.
        model = word_vectors.wv
        values = []

        for topic in topics:
            words = topic.split(' ')
            # NOTE(review): `Evaluation` is not defined anywhere in the visible
            # notebook, so this call raises NameError unless it is provided
            # elsewhere -- TODO confirm where `_calc_dist_2` lives.
            value = Evaluation._calc_dist_2(words, model, distance_type, t)
            values.append(value)

        return values

In [None]:
def count_tf_idf_repr(topics, cw_words, tf_idf_t):
    """Collect per-word statistics used by the topic quality metrics.

    Parameters
    ----------
    topics : iterable of str
        Topics as space-joined strings.
    cw_words : numpy.ndarray of str
        Label of every row of ``tf_idf_t``.
    tf_idf_t : scipy sparse matrix
        Matrix with one row per cluword (the callers pass the transposed
        document-cluword matrix).

    Returns
    -------
    (dict, dict, float)
        ``cw_frequency`` maps a word to the number of stored entries in its
        row, ``cw_docs`` maps a word to the set of column indices with a
        non-zero value in its row, and ``n_docs`` is the total number of
        stored entries in ``tf_idf_t``.

    Raises
    ------
    ValueError
        If a topic word does not match exactly one entry of ``cw_words``.
    """
    cw_frequency = {}
    cw_docs = {}
    for iter_topic in topics:
        for word in iter_topic.split(' '):
            word_index = np.where(cw_words == word)[0]
            if word_index.size != 1:
                # Previously this surfaced as an opaque scalar-conversion error.
                raise ValueError(
                    'word {!r} matches {} rows of the cluwords vocabulary, expected 1'.format(
                        word, word_index.size))

            word_row = tf_idf_t[word_index]
            # getnnz() returns a plain int, avoiding float() on a 1-element
            # array (deprecated in recent NumPy).
            cw_frequency[word] = float(word_row.getnnz())
            cw_docs[word] = set(word_row.nonzero()[1])

    # Total number of stored entries; equal to summing the per-row counts but
    # without slicing the sparse matrix once per row.
    n_docs = float(tf_idf_t.getnnz())

    return cw_frequency, cw_docs, n_docs

In [None]:
def print_results(cluwords_freq, cluwords_docs, path_to_save_results, topics, n_docs,
                  path_to_save_model=None, dataset=None):
    """Write topic quality reports for the top 5, 10 and 20 words of each topic.

    For every cut-off ``t`` a file ``result_topic_<t>.txt`` is written in
    ``path_to_save_results`` with the topics, their coherence, PMI, NPMI and
    W2V-L1 scores.

    Parameters
    ----------
    cluwords_freq : dict
        Word frequencies (see ``count_tf_idf_repr``).
    cluwords_docs : dict
        Word to document-set mapping (see ``count_tf_idf_repr``).
    path_to_save_results : str
        Existing directory receiving the report files.
    topics : list of list of str
        Topics as lists of words.
    n_docs : number
        Normaliser forwarded to ``pmi``.
    path_to_save_model : str, optional
        Directory of the filtered embedding; defaults to ``PATH_TO_SAVE_MODEL``.
    dataset : str, optional
        Dataset name; defaults to ``DATASET``.
    """
    # Explicit parameters replace the implicit reads of notebook globals.
    if path_to_save_model is None:
        path_to_save_model = PATH_TO_SAVE_MODEL
    if dataset is None:
        dataset = DATASET

    print(path_to_save_results)
    for t in [5, 10, 20]:
        with open('{}/result_topic_{}.txt'.format(path_to_save_results, t), 'w') as f_res:
            f_res.write('Topics {}\n'.format(t))
            f_res.write('Topics:\n')
            topics_t = []
            for topic in topics:
                topics_t.append(topic[:t])
                for word in topic[:t]:
                    f_res.write('{} '.format(word))

                f_res.write('\n')

            # Results are bound to new names: assigning to `coherence` / `pmi`
            # made those names local and raised UnboundLocalError on the call.
            # `coherence` splits each topic on spaces, so it gets joined strings.
            coherence_scores = coherence([' '.join(words) for words in topics_t],
                                         cluwords_freq, cluwords_docs)
            f_res.write('Coherence: {} ({})\n'.format(np.round(np.mean(coherence_scores), 4),
                                                       np.round(np.std(coherence_scores), 4)))
            f_res.write('{}\n'.format(coherence_scores))

            pmi_scores, npmi_scores = pmi(topics=topics_t,
                                          word_frequency=cluwords_freq,
                                          term_docs=cluwords_docs,
                                          n_docs=n_docs,
                                          n_top_words=t)
            f_res.write('PMI: {} ({})\n'.format(np.round(np.mean(pmi_scores), 4), np.round(np.std(pmi_scores), 4)))
            f_res.write('{}\n'.format(pmi_scores))
            f_res.write('NPMI:\n')
            for score in npmi_scores:
                f_res.write('{}\n'.format(score))

            f_res.write('avg NPMI: {} ({})\n'.format(np.round(np.mean(npmi_scores), 4), np.round(np.std(npmi_scores), 4)))

            # Calls the notebook's own w2v_metric (there is no `Evaluation`
            # class in this notebook); it expects space-joined topic strings.
            # NOTE(review): w2v_metric itself relies on a helper that is not
            # defined in the visible notebook -- confirm before relying on it.
            w2v_l1 = w2v_metric([' '.join(words) for words in topics], t,
                                path_to_save_model, 'l1_dist', dataset)
            f_res.write('W2V-L1: {} ({})\n'.format(np.round(np.mean(w2v_l1), 4), np.round(np.std(w2v_l1), 4)))
            f_res.write('{}\n'.format(w2v_l1))

In [None]:
# Run configuration. These are kept as notebook-level variables (rather than
# parameters of a generate_topics(...) function) because later cells reuse them.
dataset = DATASET
dataset_file_path = DATASET_PATH
word_count = n_words
path_to_save_model = PATH_TO_SAVE_MODEL
path_to_save_results = PATH_TO_SAVE_RESULTS
n_threads = N_THREADS
# NOTE(review): this flag is later passed as the `binary` argument of
# cluwords_fit_transform, i.e. it also selects binary term frequencies --
# confirm that reusing the embedding-format flag for this is intended.
algorithm_type = EMBEDDINGS_BIN_TYPE
# Neighbours retrieved per word (use n_words to consider the whole vocabulary).
k = 500
# Minimum cosine similarity for a neighbour to be kept in a cluword.
threshold = 0.4
# NOTE(review): defined but not passed to create_cluwords below, which
# therefore reports its default value.
cossine_filter = 0.9
n_components = N_TOPICS
embedding_file_path = """{}/{}.txt""".format(path_to_save_model, dataset)
path_to_save_results = '{}/{}'.format(path_to_save_results, dataset)

# makedirs also creates missing parent directories (os.mkdir failed with
# FileNotFoundError when the results directory did not exist yet) and
# exist_ok=True keeps the cell re-runnable.
os.makedirs(path_to_save_results, exist_ok=True)

cluwords_dataframe(embedding_file_path, word_count, k, threshold=threshold, n_jobs=n_threads, verbose=0)
documents, n_documents, vocab, vocab_cluwords, cluwords_data = create_cluwords(dataset_file_path=dataset_file_path, path_to_save_cluwords=path_to_save_results, n_words=word_count)


In [None]:
# Build the cluwords representation of the documents.
print('Computing TFIDF...')
cluwords_tfidf, n_cluwords = cluwords_fit_transform(
    documents, n_documents, n_words, vocab, vocab_cluwords, cluwords_data, algorithm_type)

# Factorise the representation into N topics with NMF and time the fit.
start = timeit.default_timer()
print("\nFitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_documents, n_cluwords))
# NOTE(review): `alpha` may not be accepted by recent scikit-learn releases --
# confirm against the installed version.
nmf = NMF(n_components=n_components, random_state=1, alpha=.1, l1_ratio=.5)
nmf = nmf.fit(cluwords_tfidf)
end = timeit.default_timer()

print("NMF done in {}.".format(end - start))

In [None]:
# Dump the document-topic matrix W, one document per line, values separated
# (and followed) by a single space.
# NOTE: fit_transform fits the model a second time; the fixed random_state
# used when the model was created keeps this deterministic.
with open('{}/matrix_w.txt'.format(path_to_save_results), 'w') as f:
    w = nmf.fit_transform(cluwords_tfidf)  # matrix W = m x k
    h = nmf.components_.transpose()  # matrix H = n x k
    print('W: {} H:{}'.format(w.shape, h.shape))
    for x in range(w.shape[0]):
        row_values = (w[x][y] for y in range(w.shape[1]))
        f.write(''.join('{} '.format(value) for value in row_values))
        f.write('\n')
    # Release the large intermediates; the file is closed by the `with` block.
    del w, h


In [None]:
# Extract the 500 highest-weighted cluwords of every topic.
topics = top_words(nmf, list(vocab_cluwords), 500)

# Load Cluwords representation for metrics
cluwords_freq, cluwords_docs, n_docs =count_tf_idf_repr(topics, vocab_cluwords, cluwords_tfidf.transpose())
topics = parse_topics(topics)
# NOTE(review): `get_one_hot_topics` is not defined anywhere in the visible
# notebook -- it must be provided before this cell can run.
one_hot_topics = get_one_hot_topics(topics, 500, np.array(vocab_cluwords), dataset)
# The helper defined in this notebook is `nearest_neighbors`; the previous
# call used an undefined `_nearest_neighbors`.
nearest_neighbors(one_hot_topics, documents, vocab_cluwords, n_components, dataset)
topics = remove_redundant_words(topics)

In [None]:
# Write the per-cut-off quality reports for the final topics.
print_results(
    topics=topics,
    n_docs=n_docs,
    cluwords_freq=cluwords_freq,
    cluwords_docs=cluwords_docs,
    path_to_save_results=path_to_save_results,
)