In [1]:
from __future__ import print_function

import os, sys, math, operator
import numpy as np, copy as cp

from keras import backend as K
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Lambda
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dot
from keras.models import Model
from keras.initializers import Constant
from keras.activations import sigmoid

from kutils.file import dump as kdump
from kutils.file import load as kload
from kutils.file import get_files

sys.path.append('/home/bwlee2/research/embedding/word2vec/cbow_update')
#sys.path.append(os.getcwd())
from word_index import Word_dic
from cbow import Cbow

Using TensorFlow backend.


In [2]:
# Load the corpus; only the third element of the loaded tuple is used.
# NOTE(review): absolute, user-specific path — consider a configurable data dir.
# NOTE(review): the '.pk' suffix suggests a pickle; only load trusted files.
f2_s = '/home/bwlee2/work/projects/market_sensing/dict/cbow_update/texts.pk'
_, _, texts = kload(f2_s)

# Hyper-parameters handed to KEmbedding / Cbow further down.
EMBEDDING_DIM = 100
N_WINDOW = 3
N_NEGATIVE = 5

# First batch: the first 1000 documents. The printed types show that the
# corpus is a list of documents, each a list of string tokens.
texts1 = texts[:1000]
print(type(texts1), type(texts1[0]), type(texts1[0][0]))

<class 'list'> <class 'list'> <class 'str'>


In [69]:
class KEmbedding(Cbow):
    """
    Cbow wrapper that remembers its texts and can grow its vocabulary.

    The word dictionary is created lazily by load_text() / add_text().
    When add_text() meets words that are not in the dictionary yet, one
    embedding row per new word is appended to the existing embedding matrix.
    Each new row is the count-weighted average of the embeddings of the
    already-known words that occur around the new word.
    """
    def __init__(self, n_window, n_negative, embed_dim):
        """
        Store the hyper-parameters only.

        Cbow.__init__ is not called here; it is called from load_text() /
        add_text() once a word dictionary exists.
        """
        self.n_window, self.n_negative, self.embed_dim = n_window, n_negative, embed_dim
        self.word_dic = None

    @staticmethod
    def _concat_tuple(texts):
        """
        if tuple of texts(array of array of string) is given,
        it concatenates all tuple in texts format

        Declared as a staticmethod so that it can be called both as
        KEmbedding._concat_tuple(...) and through an instance.
        """
        merged = []
        for text_group in texts:
            merged += text_group
        return merged

    def load_text(self, *texts):
        """
        initialize dictionary from texts

        Every positional argument is one collection of documents; they are
        concatenated, used to build a fresh Word_dic, and kept in self.texts
        for get_train_data().
        """
        texts = KEmbedding._concat_tuple(texts)
        self.word_dic = Word_dic(texts)
        Cbow.__init__(self, self.n_window, self.n_negative, self.embed_dim, self.word_dic)
        self.texts = texts

    def add_text(self, *texts):
        """
        preserve self.word_dic and add words after that

        Without an existing dictionary this behaves like load_text() followed
        by get_network(). Otherwise the dictionary is updated with the new
        texts, Cbow is re-initialised for the new vocabulary size, and the
        embedding matrix is extended with one row per new word.
        self.texts is replaced by the new texts (it is not appended to).
        """
        if self.word_dic is None:
            self.load_text(*texts)
            self.get_network()
            return

        texts = KEmbedding._concat_tuple(texts)

        # Snapshot the current state BEFORE the dictionary and Cbow are rebuilt.
        mat_old = self.get_embed()
        n_words_old = self.n_words
        # NOTE(review): shallow copy — whether word_dic_old stays untouched by
        # update() depends on how Word_dic.update is implemented; confirm.
        word_dic_old = cp.copy(self.word_dic)
        self.new_words = self.word_dic.update(texts)

        Cbow.__init__(self, self.n_window, self.n_negative, self.embed_dim, self.word_dic)
        n_words_new = self.n_words

        if n_words_old == n_words_new:
            mat_new = mat_old
        else:
            # Use the configured window size instead of _contexts_of's default.
            # NOTE: this rescans `texts` once per new word.
            new_word_context_dic = {
                word1: self._contexts_of(word1, texts, self.n_window)
                for word1 in self.new_words
            }
            new_rows = []
            for ix1 in range(n_words_old, n_words_new):
                word2 = self.word_dic.ix2word[ix1]
                new_rows.append(
                    self._get_avg_embed(word2, mat_old, word_dic_old, new_word_context_dic))
            # New rows go below the old ones so that existing indices stay valid.
            mat_new = np.concatenate((mat_old, np.array(new_rows)))

        self.set_embed(mat_new)
        self.texts = texts

    def get_train_data(self):
        """Return Cbow.get_train_data() for the texts stored by load_text()/add_text()."""
        return super().get_train_data(self.texts)

    def clone(self):
        """
        Return a shallow copy (copy.copy) of this object.

        The copy references the same attribute objects (word_dic, network, ...)
        as the original until they are rebound on either side.
        NOTE(review): add_text() on a clone calls word_dic.update() on that
        shared dictionary object — check whether the original may be affected.
        """
        return cp.copy(self)

    def _contexts_of(self, word1a, texts2, n_window=3):
        """
        get top frequent context of word1a from texts2

        Counts every token found within n_window positions on either side of
        each occurrence of word1a. Returns a dict {context word: count} with
        the most frequent entries: 10% of the distinct context words, but
        never fewer than 10 (or all of them if there are fewer than 10).
        """
        counts = {}
        for text1 in texts2:
            n_text1 = len(text1)
            for i, word1 in enumerate(text1):
                if word1a == word1:
                    low_limit = max(i - n_window, 0)
                    high_limit = min(i + n_window + 1, n_text1)
                    # Neighbours on both sides, excluding the word itself.
                    for neighbour in text1[low_limit:i] + text1[i + 1:high_limit]:
                        counts[neighbour] = counts.get(neighbour, 0) + 1

        # sorted() is stable, so ties keep their first-seen order.
        ranked = sorted(counts.items(), key=operator.itemgetter(1), reverse=True)
        n_keep = max(10, int(len(ranked) / 10))
        return dict(ranked[:n_keep])

    def _get_avg_embed(self, word0, embed_mat1, word_dic1, word_context_dic):
        """
        get embedding of oov word0 by averaging its already seen context

        word_context_dic[word0] maps context words to counts. Only context
        words listed in word_dic1.words contribute; each contributes its row
        of embed_mat1 weighted by its count. Returns a zero vector of length
        self.embed_dim when no context word is known.
        """
        known_words = word_dic1.words
        total_count_sum = 0
        embed = np.zeros(self.embed_dim)
        for word1, count1 in word_context_dic[word0].items():
            if word1 in known_words:
                embed += embed_mat1[word_dic1.word2ix[word1]] * count1
                total_count_sum += count1
        if total_count_sum < 1:
            return np.zeros(self.embed_dim)
        return embed / total_count_sum
        

In [70]:
# Create the wrapper and initialise its word dictionary from the first batch.
embed1 = KEmbedding(N_WINDOW, N_NEGATIVE, EMBEDDING_DIM)
embed1.load_text(texts1)

None


In [71]:
# Build the training data and the Keras model for the first batch.
input1, target1 = embed1.get_train_data()
model1 = embed1.get_network()

# Debug print of the network object held by embed1.
print('--------', embed1.network)
# Train for one epoch, then read back the embedding matrix.
model1.compile(optimizer='rmsprop', loss='binary_crossentropy')
score1 = model1.fit(x=input1, y=target1, batch_size=100, epochs=1)
mat1 = embed1.get_embed()

-------- <keras.engine.training.Model object at 0x7fd9938354a8>
Epoch 1/1


In [72]:
# Second batch: documents 1000..1499.
texts2 = texts[1000:1500]
# Debug prints: the recorded output shows that embed1 and its clone refer to
# the very same network object (clone() is a shallow copy).
print('111111', embed1.network)
#embed2 = cp.copy(embed1)
embed2 = embed1.clone()
print('2222222', embed1.network)
print(embed2.network)
# Extend the clone's vocabulary / embedding matrix with the second batch.
embed2.add_text(texts2)

111111 <keras.engine.training.Model object at 0x7fd9938354a8>
2222222 <keras.engine.training.Model object at 0x7fd9938354a8>
<keras.engine.training.Model object at 0x7fd9938354a8>
new~~~~~~~` ['graphics' 'jpeg' 'gif' ... 'blues' 'duck' 'vuw']


In [73]:
# Compare the vocabulary sizes before (embed1) and after (embed2) add_text().
n_words1 = embed1.n_words
n_words2 = embed2.n_words
print(n_words1, n_words2)

9583 12260


In [74]:
# Train the extended model for one epoch on the texts stored by add_text().
input2, target2 = embed2.get_train_data()
model2 = embed2.get_network()
model2.compile(optimizer='rmsprop', loss='binary_crossentropy')
score2 = model2.fit(x=input2, y=target2, batch_size=100, epochs=1)

Epoch 1/1


In [77]:
# Look up two sample words by index; they are used as probes in the cells below.
print( embed1.word_dic.ix2word[950] )
print( embed1.word_dic.ix2word[1950] )

directly
trouble


In [78]:
# Print the embedding of 'directly' in both models, and of 'trouble' in the
# extended model, for a visual comparison.
print(embed1.get_embed('directly'))
print(embed2.get_embed('directly'))
print(embed2.get_embed('trouble'))

[ 0.03662822  0.08640641  0.09149102 -0.12910344  0.07835306  0.10650671
  0.05641901 -0.07046448  0.11508797 -0.12533854 -0.05059731  0.08084092
  0.03880278  0.10370369 -0.06310498 -0.06693341  0.11772046 -0.01862401
  0.07547874 -0.10211251 -0.07356763 -0.11647589  0.08149291  0.10060213
 -0.12544914  0.09936368 -0.1133073   0.0321604   0.04843689  0.06579549
  0.0731416  -0.10536198 -0.08858644 -0.1032197   0.03922247  0.12872247
  0.05878539  0.08730777 -0.05282257  0.05589495 -0.10633729 -0.07055101
  0.13383353  0.07207597 -0.13517264 -0.09601953 -0.09697238 -0.03642629
 -0.10364527  0.01997281 -0.06686973  0.13869137  0.09556331 -0.11943659
 -0.12136271 -0.12421506  0.14105746 -0.1033272   0.08189443  0.06742942
 -0.10946915  0.04581811  0.15215345  0.12930353  0.10357404  0.06815642
  0.04857005 -0.13451771  0.12757763 -0.13269976  0.08387921 -0.05071742
 -0.07927927 -0.06023665  0.04752771 -0.07053507  0.12513182 -0.0772993
 -0.03863328  0.08223251 -0.08222835  0.12718932  0.

In [83]:
import nmslib


def build_cosine_index(embed_mat):
    """Build an nmslib HNSW index with cosine similarity over the rows of embed_mat."""
    index = nmslib.init(method='hnsw', space='cosinesimil')
    index.addDataPointBatch(embed_mat)
    index.createIndex({'post': 2}, print_progress=True)
    return index


def query_neighbours(index, embed_mat, word_dic, word, k=10):
    """Return (ids, distances) of the k rows nearest to the embedding of `word`."""
    return index.knnQuery(embed_mat[word_dic.word2ix[word]], k=k)


def print_words(word_dic, ids):
    """Print the word behind every index in `ids`, one per line."""
    for ix in ids:
        print(word_dic.ix2word[ix])


mat1 = embed1.get_embed()
mat2 = embed2.get_embed()

# One index per model: embed1 (first batch only) and embed2 (extended vocabulary).
index1 = build_cosine_index(mat1)
index2 = build_cosine_index(mat2)

# Probe words whose nearest neighbours are compared between the two models.
word1 = 'directly'
word2 = 'trouble'

ids1, distances1 = query_neighbours(index1, mat1, embed1.word_dic, word1)
ids2a, distances2a = query_neighbours(index2, mat2, embed2.word_dic, word1)
ids2b, distances2b = query_neighbours(index2, mat2, embed2.word_dic, word2)

# Neighbours of word1 in model 1, then of word1 and word2 in model 2.
print_words(embed1.word_dic, ids1)
print('------------------')
print_words(embed2.word_dic, ids2a)
print('------------------')
print_words(embed2.word_dic, ids2b)

# To get the nearest neighbours of all rows at once (pool of 4 threads):
#neighbours = index1.knnQueryBatch(mat1, k=10, num_threads=4)

integrated
stable
displays
1930's
astray
en
warfare
worm
transportation
shifting
------------------
wpd
knowing
improve
rusnews
raise
lunatic
bodies
whatsoever
viability
des
------------------
connecting
accomplish
270
thereby
6th
mock
medicine
357
floor
infants


In [6]:
##############







# `word_dic1` and `cbow1` are not defined anywhere in the notebook as saved, so
# this cell failed on a fresh kernel. Rebuild them from the first batch.
# NOTE(review): reconstructed from the way `cbow2` is built from `word_dic2`
# further down — confirm that this matches the original setup.
word_dic1 = Word_dic(texts1)
cbow1 = Cbow(n_window=N_WINDOW, n_negative=N_NEGATIVE, embed_dim=EMBEDDING_DIM, word_dic=word_dic1)
input1, target1 = cbow1.get_train_data(texts1)

# Train the plain Cbow model for one epoch and read back its embedding matrix.
model1 = cbow1.get_network()
model1.compile(optimizer='rmsprop', loss='binary_crossentropy')
score1 = model1.fit(x=input1, y=target1, batch_size=100, epochs=1)
mat1 = cbow1.get_embed()

Epoch 1/1


In [7]:
# Show the first five dictionary entries; per the recorded output these are
# the special tokens followed by the words 'the' and 'of'.
print([ word_dic1.ix2word[i] for i in range(5) ])
#mat1[:5]

['<PAD>', '<START>', '<UNK>', 'the', 'of']


In [8]:
# Remember the old vocabulary size, then extend a copy of the dictionary with
# a second batch of documents.
# NOTE: this rebinds `n_words1`, `texts2` and `n_words2`, which earlier cells
# also assign (there `texts2` is texts[1000:1500]).
n_words1 = word_dic1.n_words
texts2 = texts[1000:2500]
# NOTE(review): shallow copy — whether word_dic1 is left untouched by update()
# depends on the Word_dic implementation; confirm.
word_dic2 = cp.copy(word_dic1)
new_words = word_dic2.update(texts2)
n_words2 = word_dic2.n_words

In [9]:
# Print the first five entries that sit just past the old vocabulary size,
# i.e. at indices n_words1 .. n_words1+4 of the updated dictionary.
for i in range(5):
    word1 = word_dic2.ix2word[i+n_words1]
    print(word1)

windows
graphics
jpeg
gif
package


In [10]:
def contexts_of(word1a, texts2, n_window=3):
    """
    Count the words that occur around `word1a` and keep the most frequent ones.

    texts2 is an iterable of token sequences. For every occurrence of word1a,
    the tokens up to n_window positions before and after it are counted.
    Returns a dict {context word: count} ordered by descending count, holding
    10% of the distinct context words but never fewer than 10 (or all of them
    when there are fewer than 10).
    """
    tally = {}
    for tokens in texts2:
        n_tokens = len(tokens)
        for pos, token in enumerate(tokens):
            if not (word1a == token):
                continue
            start = max(pos - n_window, 0)
            stop = min(pos + n_window + 1, n_tokens)
            # Neighbours on both sides, the word itself excluded.
            for neighbour in tokens[start:pos] + tokens[pos + 1:stop]:
                tally[neighbour] = tally.get(neighbour, 0) + 1

    # Stable sort: equally frequent words keep their first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    n_keep = max(10, int(len(ranked) / 10))
    return dict(ranked[:n_keep])

In [11]:
# Most frequent context words of 'windows' in the second batch (window of 3).
ccc = contexts_of('windows', texts2, 3)

In [12]:
# Inspect the {context word: count} dict computed above.
print(ccc)

{'the': 273, 'for': 183, 'and': 181, 'a': 175, 'i': 163, '3': 143, 'in': 135, 'to': 128, '1': 119, 'of': 105, 'is': 102, 'that': 91, 'dos': 90, 'under': 72, 'x': 71, 'it': 67, 'with': 65, 'ms': 60, '2': 60, 'on': 59, 'run': 50, 'or': 50, 'you': 47, 'from': 47, 'os': 45, 'will': 45, 'version': 44, 'nt': 44, 'my': 43, 'but': 40, 'using': 38, 'apps': 36, 'does': 35, '0': 35, 'if': 33, 'not': 32, 'windows': 32, 'when': 31, 'driver': 30, 'this': 30, 'use': 30, 'running': 29, 'have': 29, 'are': 26, 'has': 26, 'be': 26, 'would': 25, 'c': 25, 'as': 24, 'only': 24, 'file': 24, 'drivers': 21, 'microsoft': 21, 'up': 21, 'there': 21, 'do': 20, 'system': 20, 'comp': 20, 'into': 20, 'which': 20, 'what': 18, 'program': 18, 'about': 18, 'just': 17, 'mac': 17, 'was': 17, 'work': 17, 'can': 17, 'all': 17, 'directory': 17, 'word': 17, 'than': 16, 'so': 15, 'programs': 15, 'within': 15, 'time': 15, 'fine': 14, 'linux': 14, 'new': 14, 'like': 14, 'at': 13, 'thanks': 13, 'am': 13, 'an': 13, 'set': 13, 'more

In [13]:
# Context counts for every newly added word; contexts_of() scans texts2 once
# per word, so this cell can be slow for many new words.
new_word_context_dic = { word1: contexts_of(word1, texts2) for word1 in new_words }

In [14]:
def get_avg_embed(word0, embed_mat1, word_dic1, word_context_dic):
    """
    Estimate an embedding for the unseen word `word0` from its context words.

    word_context_dic[word0] maps context words to counts. Every context word
    listed in word_dic1.words contributes its row of embed_mat1
    (row index word_dic1.word2ix[word]) weighted by its count; context words
    that are not listed are skipped.

    The vector length is taken from embed_mat1 itself rather than from a
    notebook-level constant, so the function works for any embedding width.

    Returns the count-weighted average, or a zero vector when none of the
    context words is known.
    Raises KeyError when word0 is missing from word_context_dic.
    """
    embed_mat1 = np.asarray(embed_mat1)
    embed_dim = embed_mat1.shape[-1]
    known_words = word_dic1.words
    total_count_sum = 0
    embed = np.zeros(embed_dim)
    for word1, count1 in word_context_dic[word0].items():
        if word1 in known_words:
            embed += embed_mat1[word_dic1.word2ix[word1]] * count1
            total_count_sum += count1
    if total_count_sum < 1:
        return np.zeros(embed_dim)
    return embed / total_count_sum
        
# TODO (important): need to skip OOV words.
# TODO: what if the count is smaller than 10?

# Sanity check: averaged initial embedding for the new word 'windows'
# (first ten components only).
temp = get_avg_embed('windows', mat1, word_dic1, new_word_context_dic)
print(temp[:10])

[-0.40497088 -0.4164376   0.48043264 -0.39178229 -0.3831789   0.36858604
  0.36149348  0.37907459  0.39572365 -0.42370758]


In [15]:
# One initial embedding row per newly added word (indices n_words1 .. n_words2-1).
# A comprehension is used so that no loop temporaries leak into the notebook
# namespace; the original loop rebound `embed2`, which is also the name of the
# extended KEmbedding model.
new_rows = [
    get_avg_embed(word_dic2.ix2word[new_ix], mat1, word_dic1, new_word_context_dic)
    for new_ix in range(n_words1, n_words2)
]
# reshape keeps the array 2-D even when there are no new words, so that the
# concatenation below still works.
mat2 = np.array(new_rows).reshape(-1, mat1.shape[1])
# New rows go below the old ones so that existing word indices stay valid.
mat2b = np.concatenate((mat1, mat2))

In [16]:
# Shapes of the old matrix, the new rows, and the combined matrix.
print(mat1.shape)
print(mat2.shape)
print(mat2b.shape)

(9583, 100)
(5072, 100)
(14655, 100)


In [20]:
# Build a Cbow model on the extended dictionary and its training data from
# the second batch.
cbow2 = Cbow(n_window=N_WINDOW, n_negative=N_NEGATIVE, embed_dim=EMBEDDING_DIM, word_dic=word_dic2)
input2, target2 = cbow2.get_train_data(texts2)
model2 = cbow2.get_network()

# Embedding of 'windows' before the pre-computed matrix is installed.
print(cbow2.get_embed('windows'))
cbow2.set_embed(mat2b)
# NOTE(review): the network is fetched again after set_embed(); presumably
# set_embed() rebuilds it — confirm against Cbow.
model2 = cbow2.get_network()
# Embedding of 'windows' after installing mat2b, before training.
print(cbow2.get_embed('windows'))
model2.compile(optimizer='rmsprop', loss='binary_crossentropy')
score2 = model2.fit(x=input2, y=target2, batch_size=100, epochs= 10)
#score2 = model2.fit(x=[cbow2.data_x, cbow2.data_context, cbow2.data_negative], y=[cbow2.target_data, cbow2.target_negative], batch_size=100, epochs= 1)
# Embedding of 'windows' after ten epochs of training.
print(cbow2.get_embed('windows'))

[ 0.00670594 -0.01757532  0.00031279  0.02134672 -0.02010387  0.04631103
  0.04829497  0.02501355  0.04532603  0.03562606 -0.00941096  0.04452807
 -0.00683808  0.02831235 -0.02694577  0.04215329 -0.02857791  0.01319558
  0.03325592  0.02229548 -0.00657592  0.00780135 -0.01975346  0.00806315
 -0.04731954 -0.03876078 -0.00159435  0.01121148 -0.04123883  0.03608469
  0.03906335 -0.03284369 -0.00498239 -0.02178704 -0.01848469  0.03944755
 -0.03038275 -0.00640755 -0.01900774  0.01959025 -0.01430454  0.04742635
 -0.00796503 -0.04415531 -0.00662602 -0.00692859  0.04312536  0.04147445
  0.00102197 -0.02642416  0.02213148 -0.01989819  0.02945893  0.03492831
 -0.02178444 -0.00913808 -0.00861005  0.03066489 -0.04676073  0.04719165
 -0.03696543  0.03976751  0.03821139 -0.00107444  0.01274296  0.00871147
 -0.01314527  0.00583049  0.02715626  0.01377214  0.03178188  0.01826954
 -0.04220107  0.0061397   0.03155948  0.02434016  0.03245208 -0.04002186
 -0.00912543  0.02826928 -0.03327229  0.03838834 -0

In [40]:
# Check the container type of the training inputs (the recorded output is `tuple`).
type(input2)

tuple