In [1]:
import os
import pandas as pd
import numpy as np

from gensim.models import KeyedVectors
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import NMF
from pyjarowinkler import distance
from metrics import *

# --- Notebook configuration -------------------------------------------------
# Input corpus; read below with '|' as the field separator.
DATA_FILE = 'data/all_data_clean.csv'
# Pre-trained embedding in word2vec format (binary or text, see EMBEDDING_BIN).
EMBEDDING_FILE = 'embedding/all_data_clean.bin'
# Directory that receives the result_topic_<t>.txt reports.
RESULTS = 'results'
SELECT_DATA = 2  # key into `switcher`: 0 titles, 1 titles without stopwords, 2 titles without stopwords, lemmatized
# Only rows whose article_year equals this value are kept.
SELECT_YEAR = 2015
# Passed as `binary=` when loading EMBEDDING_FILE.
EMBEDDING_BIN = True
# Number of neighbours requested per word from NearestNeighbors.
K_NEIGHBORS = 500
# Passed as `n_jobs=` to NearestNeighbors.
N_THREADS = 1
# Minimum cosine similarity for a neighbour to be kept in a cluword; a falsy value disables the cut-off.
THRESHOLD = 0.4
# NOTE(review): not referenced by any cell visible in this notebook — confirm whether it is still needed.
COSSINE_FILTER=0.9
# Number of NMF components (topics).
N_COMPONENTS = 20
# Written as the dimension field in the header of the saved word-vector file;
# assumed to match the real dimensionality of EMBEDDING_FILE — TODO confirm.
EMBEDDING_SIZE = 100

# Directory that receives the filtered word-vector file; also passed to get_w2v_metric.
DATASET_RESULT = 'dataset'

In [2]:
# Maps SELECT_DATA to the CSV column holding the corresponding pre-processed title.
# The "pp_tile_rm_sw_lem" spelling is intentional: it matches the column header in the data file.
switcher = dict(enumerate((
    "pp_title",
    "pp_title_rm_sw",
    "pp_tile_rm_sw_lem",
)))

In [3]:
#try:
    #os.mkdir('{}'.format(DATASET_RESULT))
    #os.mkdir('{}/{}/results'.format(MAIN_PATH, EMBEDDING_RESULTS))
    #os.mkdir('{}/{}/datasets'.format(MAIN_PATH, EMBEDDING_RESULTS))
    #os.mkdir('{}/{}/datasets/gn_w2v_models'.format(MAIN_PATH, EMBEDDING_RESULTS))
#except FileExistsError:
#    pass

### Read Data

In [4]:
# Load the corpus: fields are separated by '|' and the first column is used as the row index.
csv_data = pd.read_csv(DATA_FILE, sep='|', index_col=0)
# Preview the first rows to check that the columns parsed as expected.
csv_data.head()

Unnamed: 0,id,name,name_citation,doctorate,article,article_year,pp_title,pp_title_rm_sw,pp_tile_rm_sw_lem
0,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Performance Management of IEEE 802.15.4 Wirele...,2015.0,performance management of ieee wireless sensor...,performance management ieee wireless sensor ne...,performance management ieee wireless sensor ne...
1,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,An Integrated Predictive Mobile-Oriented Bandw...,2014.0,an integrated predictive mobileoriented framew...,integrated predictive mobileoriented framework...,integrated predictive mobileoriented framework...
2,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Cross-layer aware joint design of sensing and ...,2016.0,crosslayer aware joint design of sensing and f...,crosslayer aware joint design sensing frame du...,crosslayer aware joint design sensing frame du...
3,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,An Enhanced Reservation Based Medium Access Co...,2012.0,an enhanced reservation based medium access co...,enhanced reservation based medium access contr...,enhanced reservation based medium access contr...
4,3172487852109469,Abdelhakim Senhaji Hafid,"HAFID, A. S.;A. HAFID;HAFID, ABDELHAKIM;HAFID,...",Computer Science and Operational Research,Path-Based QoS Provisioning for Optical Burst ...,2011.0,pathbased qos provisioning for optical burst s...,pathbased qos provisioning optical burst switc...,pathbased qos provisioning optical burst switc...


In [5]:
# Keep only the titles published in SELECT_YEAR, taken from the pre-processed
# column selected by SELECT_DATA. A boolean mask replaces the former
# iterrows() + .at lookup: it avoids building a Series per row and does not
# depend on the index labels being unique.
title_column = switcher.get(SELECT_DATA)
year_mask = csv_data['article_year'] == SELECT_YEAR
dataset = csv_data.loc[year_mask, title_column].tolist()
n_documents = len(dataset)
# Drop the reference to the full DataFrame; only `dataset` is needed from here on.
csv_data = None
dataset[:10]

['performance management ieee wireless sensor network precision agriculture',
 'mogamap2 multiobjective mapping algorithm parameter control optimize area performance power consumption fpga',
 'interfpga communication bus error detection dynamic clock phase adjustment',
 'bone dentistry digital xray bµaddx software pilot study analysis bone density digital dental xrays',
 'model checking cml tool development industrial application',
 'hefestos intelligent system applied ubiquitous accessibility',
 'logadm approach dynamic log analysis',
 'integrated infrastructure ubiquitous learning',
 'fault domainbased testing imperfect situation heuristic approach case study',
 'generation complete test suite mealy inputoutput transition system']

### Read Embedding

In [6]:
%%time
# Load the pre-trained vectors stored in word2vec format; EMBEDDING_BIN selects
# binary versus text parsing of EMBEDDING_FILE.
model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=EMBEDDING_BIN)

CPU times: user 42.9 ms, sys: 0 ns, total: 42.9 ms
Wall time: 41.8 ms


### Filter embedding space to dataset

In [7]:
# Collect the distinct tokens of the selected titles using CountVectorizer's
# default tokenisation.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the pinned version.
vocabulary_counter = CountVectorizer()
vocabulary_counter.fit(dataset)
dataset_words = vocabulary_counter.get_feature_names()
dataset_words[:10]

['025v',
 '12conjecture',
 '130nm',
 '1a',
 '1h',
 '20nm',
 '24',
 '2d',
 '2d3d',
 '2d4d']

In [8]:
# Restrict the embedding space to the corpus vocabulary: word -> embedding vector.
words_vector = {}
for token in dataset_words:
    try:
        embedding = model[token]
    except KeyError:
        # The token has no vector in the pre-trained model; leave it out.
        continue
    words_vector[token] = embedding
# The raw token list is no longer needed.
dataset_words = None

In [9]:
# Number of corpus words that have an embedding; each of them becomes one cluword.
n_words = len(words_vector)
print('Number of cluwords {}'.format(n_words))

Number of cluwords 2732


### Save the filtered word vectors

In [10]:
# Persist the filtered vectors as text: a "<n_words> <dimension>" header line
# followed by one "<word> <v1> ... <vn>" line per word (components rounded to
# 9 decimals). This file is later handed to get_w2v_metric via DATASET_RESULT.
# os.makedirs replaces the shell call so the cell does not depend on a POSIX
# `mkdir`, and the `with` block guarantees the file is closed even if a write fails.
os.makedirs(DATASET_RESULT, exist_ok=True)
vector_path = "{}/{}_{}.txt".format(DATASET_RESULT, switcher.get(SELECT_DATA), SELECT_YEAR)
with open(vector_path, 'w') as vector_file:
    # NOTE(review): the header uses EMBEDDING_SIZE, which is assumed to equal the
    # real vector length of the loaded model — confirm.
    vector_file.write('{} {}\n'.format(n_words, str(EMBEDDING_SIZE)))
    for word, word_vec in words_vector.items():
        components = " ".join(str(round(x, 9)) for x in word_vec.tolist())
        vector_file.write("{} {}\n".format(word, components))
# The full embedding model is no longer needed; drop the reference.
model = None

### Create Cluwords

In [11]:
# Stack the embeddings into a 2-D array, one row per word in dict iteration
# order, rounding every component to 9 decimals (the same precision used when
# the vectors are written to disk). vocab_cluwords[i] names row i.
space_vector = np.array([
    np.array([round(component, 9) for component in vector.tolist()])
    for vector in words_vector.values()
])
vocab_cluwords = np.asarray(list(words_vector.keys()))

array([[ 0.00521728,  0.02028853, -0.01500222, ..., -0.02085901,
         0.0051961 , -0.06698267],
       [ 0.01002557,  0.02945558, -0.0240077 , ..., -0.01943226,
         0.00024938, -0.04257441],
       [ 0.0146225 ,  0.04320482, -0.02451873, ..., -0.02097077,
         0.00505415, -0.05099378],
       ...,
       [ 0.01427482,  0.03594904, -0.03064483, ..., -0.02900777,
         0.01136413, -0.06841376],
       [ 0.01087763,  0.03139909, -0.02050509, ..., -0.02117275,
         0.00259017, -0.05183641],
       [ 0.01641259,  0.07519466, -0.04875149, ..., -0.04672395,
         0.01962009, -0.12792523]])

#### Get nearest neighbours by cosine distance

In [12]:
%%time
nbrs = NearestNeighbors(n_neighbors=K_NEIGHBORS, algorithm='auto', metric='cosine', n_jobs=N_THREADS).fit(space_vector)
distances, indices = nbrs.kneighbors(space_vector)
space_vector = None

CPU times: user 343 ms, sys: 180 ms, total: 523 ms
Wall time: 226 ms


#### Get Cluwords

In [13]:
# Dense n_words x n_words weight matrix, initialised to zero; row p will hold the
# similarity of word p to each of its retained neighbours. float16 keeps the
# square matrix small (weights are stored rounded to two decimals).
list_cluwords = np.zeros((n_words, n_words), dtype=np.float16)

In [14]:
# Fill list_cluwords[p][k] with the cosine similarity (1 - distance) between
# word p and its neighbour k, rounded to two decimals. One vectorised
# assignment replaces the two duplicated Python double loops.
similarities = 1 - distances
cluword_weights = np.round(similarities, 2)
if THRESHOLD:
    # The cut-off is applied to the un-rounded similarity, exactly as before:
    # anything that is not >= THRESHOLD contributes a weight of 0.
    cluword_weights[~(similarities >= THRESHOLD)] = 0.0
# Column vector of row numbers, broadcast against `indices` so that every
# (p, indices[p][i]) cell receives cluword_weights[p][i].
row_ids = np.arange(indices.shape[0])[:, np.newaxis]
list_cluwords[row_ids, indices] = cluword_weights
# Neighbour tables are no longer needed.
distances, indices = None, None

In [15]:
#np.savez_compressed('cluwords.npz', data=list_cluwords, index=np.asarray(vocab_cluwords), cluwords=np.asarray(vocab_cluwords))

### Computing TF-IDF

#### Computing TF

In [16]:
# Raw term counts per document, with columns fixed to the cluword vocabulary so
# that column j corresponds to vocab_cluwords[j].
n_cluwords = len(vocab_cluwords)
tf_vectorizer = CountVectorizer(
    vocabulary=vocab_cluwords,
    max_features=n_words,
    binary=False,
)
tf = tf_vectorizer.fit_transform(dataset)
print('tf shape {}'.format(tf.shape))

tf shape (2609, 2732)


In [17]:
%%time
hyp_aux = []
for w in range(0, n_cluwords):
    hyp_aux.append(np.asarray(list_cluwords[w], dtype=np.float16))

hyp_aux = np.asarray(hyp_aux, dtype=np.float32)
hyp_aux = csr_matrix(hyp_aux, shape=hyp_aux.shape, dtype=np.float32)  # ?test sparse matrix!

cluwords_tf_idf = np.dot(tf, np.transpose(hyp_aux))
cluwords_tf_idf = tf.dot(hyp_aux.transpose())
tf = None

CPU times: user 235 ms, sys: 12 ms, total: 247 ms
Wall time: 246 ms


#### Computing IDF (currently disabled — the cell below only holds the code as a string literal)

In [18]:
# NOTE: everything below is one triple-quoted string literal, so none of it
# executes; the IDF step is effectively disabled and the NMF cell is fitted on
# the cluword-weighted term counts only. The text also refers to `tf`, which the
# previous cell has already set to None, and to `timeit` / `warnings`, which are
# not imported at the top of the notebook.
'''hyp_aux = hyp_aux.todense()

print('Dot tf and hyp_aux')
_dot = np.dot(tf, np.transpose(hyp_aux))  # np.array n_documents x n_cluwords # Correct!
end = timeit.default_timer()

print('Divide hyp_aux by itself')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    # pdb.set_trace()
    # self.hyp_aux = self.hyp_aux.todense()
    # pdb.set_trace()
    bin_hyp_aux = np.nan_to_num(np.divide(hyp_aux, hyp_aux))
    
print('Dot tf and bin hyp_aux')
_dot_bin = np.dot(tf, np.transpose(bin_hyp_aux))

print('Divide _dot and _dot_bin')
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    mu_hyp = np.nan_to_num(np.divide(_dot, _dot_bin))

start = timeit.default_timer()
print('Sum')
cluwords_idf = np.sum(mu_hyp, axis=0)

start = timeit.default_timer()
print('log')
cluwords_idf = np.log10(np.divide(n_documents, cluwords_idf))'''



### Build Topics

In [19]:
%%time
# Factorise the documents x cluwords matrix into N_COMPONENTS topics.
# The message says "tf-idf features", but the matrix passed to fit() is the one
# built from the term counts and cluword weights (no IDF factor is applied above).
print("\nFitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..."
      % (n_documents, n_cluwords))
# random_state is fixed so repeated runs give the same factorisation.
# NOTE(review): the `alpha` keyword was deprecated in scikit-learn 1.0 in favour
# of alpha_W / alpha_H — confirm against the pinned scikit-learn version.
nmf = NMF(n_components=N_COMPONENTS, random_state=1, alpha=.1, l1_ratio=.5, max_iter=1500).fit(cluwords_tf_idf)


Fitting the NMF model (Frobenius norm) with tf-idf features, n_samples=2609 and n_features=2732...
CPU times: user 1min 54s, sys: 1.83 s, total: 1min 55s
Wall time: 1min 52s


In [20]:
# Number of highest-weighted cluwords kept per topic.
n_top_words = 101
# One space-separated string per NMF component, listing its n_top_words
# cluwords from highest to lowest weight. The former `top2` string (a full sort
# of every component's weights) was never read and has been removed.
topics = [
    ' '.join(vocab_cluwords[i] for i in topic.argsort()[:-n_top_words - 1:-1])
    for topic in nmf.components_
]
# Corpus statistics used later by the coherence / PMI metrics (helper comes from `metrics`).
cluwords_freq, cluwords_docs, n_docs = count_tf_idf_repr(topics, vocab_cluwords, cluwords_tf_idf.transpose())
# The fitted model is no longer needed.
nmf = None

In [21]:
# Turn each space-separated topic string into a list of words.
topics_t = [topic_string.split(' ') for topic_string in topics]
topics = topics_t

In [22]:
# Number of leading words of each topic that are marked in its indicator vector.
top = 101
# One 0/1 vector per topic over the cluword vocabulary: position j is 1 when
# vocab_cluwords[j] is among the topic's first `top` words. A single np.isin
# per topic replaces the former np.argwhere scan of the whole vocabulary for
# every individual word; the float dtype of the original np.zeros rows is kept.
one_hot_topics = np.array([
    np.isin(vocab_cluwords, topic[:top]).astype(np.float64)
    for topic in topics
])

In [23]:
# Remove near-duplicate spellings inside each topic: walking the words in rank
# order, every kept word suppresses all later words whose Jaro-Winkler
# similarity to it exceeds 0.75.
topics_t = []
for topic in topics:
    filtered_topic = []
    # keep_word[j] stays True until an earlier kept word suppresses word j.
    keep_word = [True] * len(topic)
    # Iterate over *every* position: the previous range(0, len(topic) - 1)
    # stopped one short, so the last word of a topic could never be kept.
    for w_i in range(len(topic)):
        if not keep_word[w_i]:
            continue
        filtered_topic.append(topic[w_i])
        for w_j in range(w_i + 1, len(topic)):
            # Already-suppressed words need no further comparison.
            if keep_word[w_j] and distance.get_jaro_distance(topic[w_i], topic[w_j], winkler=True, scaling=0.1) > 0.75:
                keep_word[w_j] = False

    topics_t.append(filtered_topic)
topics = topics_t

### Print Topics

In [24]:
# Write one report per cut-off t: the first t words of every topic followed by
# the coherence, PMI, NPMI and W2V-L1 scores returned by the `metrics` helpers.
# os.makedirs replaces the shell `mkdir -p` call so the cell does not depend on
# a POSIX shell.
os.makedirs(RESULTS, exist_ok=True)
for t in [5, 10, 20]:
    # Topics truncated to their first t words.
    topics_t = [topic[:t] for topic in topics]
    with open('{}/result_topic_{}.txt'.format(RESULTS, t), 'w') as f_res:
        f_res.write('Topics {}\n'.format(t))
        f_res.write('Topics:\n')
        for topic_words in topics_t:
            # Every word is followed by a single space, then the line ends.
            f_res.write(''.join('{} '.format(word) for word in topic_words))
            f_res.write('\n')

        coherence = get_coherence(topics_t, cluwords_freq, cluwords_docs)
        f_res.write('Coherence: {} ({})\n'.format(np.round(np.mean(coherence), 4), np.round(np.std(coherence), 4)))
        f_res.write('{}\n'.format(coherence))

        pmi, npmi = get_pmi(topics=topics_t, word_frequency=cluwords_freq, term_docs=cluwords_docs, n_docs=n_docs, n_top_words=t)
        f_res.write('PMI: {} ({})\n'.format(np.round(np.mean(pmi), 4), np.round(np.std(pmi), 4)))
        f_res.write('{}\n'.format(pmi))
        f_res.write('NPMI:\n')
        for score in npmi:
            f_res.write('{}\n'.format(score))

        f_res.write('avg NPMI: {} ({})\n'.format(np.round(np.mean(npmi), 4), np.round(np.std(npmi), 4)))

        # The un-truncated topics are passed here together with t, as before.
        w2v_l1 = get_w2v_metric(topics, t, DATASET_RESULT, 'l1_dist', "{}_{}".format(switcher.get(SELECT_DATA),  SELECT_YEAR))
        f_res.write('W2V-L1: {} ({})\n'.format(np.round(np.mean(w2v_l1), 4), np.round(np.std(w2v_l1), 4)))
        f_res.write('{}\n'.format(w2v_l1))
        # No explicit close(): the `with` block closes the file on exit.