In [1]:
from __future__ import division
from __future__ import absolute_import
from __future__ import print_function
import tensorflow as tf 
from keras.datasets import imdb
import numpy as np
import pandas as pd
import threading
from abc import ABCMeta, abstractmethod

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the IMDB train/test splits from a local .npz file with num_words=10000
# (the same value is later passed as vocabulary_size to the network).
# NOTE(review): the path is absolute and machine-specific.
(X_train, y_train), (X_test, y_test) = imdb.load_data(path="E:/paper/stackingmodel/imdb/imdb.npz", num_words=10000)

In [3]:
def getWordsStatistics(sentences, sample = 1e-4):
    """Count word occurrences and derive a subsampling threshold for every word.

    Args:
        sentences: iterable of sentences, each an iterable of hashable words
            (word ids in this notebook).
        sample: down-sampling ratio; a word whose count exceeds
            ``sample * total_word_count`` gets a keep-probability below 1.

    Returns:
        A plain ``dict`` mapping each word to ``WordStat(count, sample_int)``,
        where ``sample_int`` is the keep-probability scaled to ``[0, 2**32]``.
        A plain dict is returned (rather than a ``defaultdict``) so that looking
        up an unknown word raises ``KeyError`` instead of silently inserting a
        malformed entry.
    """
    from collections import Counter, namedtuple
    WordStat = namedtuple("WordStat", ["count", "sample_int"])

    counts = Counter()
    for sent in sentences:
        counts.update(sent)

    retain_total = sum(counts.values())
    threshold_count = sample * retain_total

    wordsStat = {}
    for word, v in counts.items():
        # Keep-probability: (sqrt(v / t) + 1) * (t / v), capped at 1.
        word_probability = (np.sqrt(v / threshold_count) + 1) * (threshold_count / v)
        if word_probability > 1.0:
            word_probability = 1.0
        wordsStat[word] = WordStat(count=v, sample_int=int(round(word_probability * 2**32)))
    return wordsStat

In [4]:
# Word counts and subsampling thresholds computed over the train and test reviews together.
wordsStat = getWordsStatistics(np.hstack([X_train, X_test]))

In [5]:
# Disabled one-off export of the IMDB splits to CSV (columns "description" and "label").
# The train file path is the one read by the out-of-memory pool further down.
# X_train_df = pd.DataFrame({"description": X_train, "label": y_train})
# X_train_df.to_csv("E:/kaggle/avito/imdb_testset/train.data", index=False)
# X_test_df = pd.DataFrame({"description": X_test, "label": y_test})
# X_test_df.to_csv("E:/kaggle/avito/imdb_testset/test.data", index=False)

In [6]:
class SamplePool(metaclass=ABCMeta):
    """Abstract source of sample chunks.

    Subclasses implement ``reset`` and ``__next__``; ``__next__`` returns the
    next chunk of samples. The pool is its own iterator, so it can be used
    with ``next(pool)`` and in ``for`` loops.
    """

    def __init__(self, chunk_size = 100):
        """Store the chunk size and allocate the per-chunk index buffer.

        Args:
            chunk_size: number of samples per chunk.
        """
        self.__chunk_size = chunk_size
        # Buffer with one slot per chunk item; it starts as zeros here and is
        # meant to be filled by subclasses when they produce a chunk.
        self.chunk_indices_list = np.zeros(chunk_size)

    @property
    def chunk_size(self):
        """Read-only number of samples per chunk."""
        return self.__chunk_size

    @abstractmethod
    def reset(self):
        """Rewind the pool to its first chunk."""
        pass

    def extend(self, samplepool_to_extend):
        """Validate the pool to append; subclasses perform the actual merge.

        Raises:
            TypeError: if ``samplepool_to_extend`` is not a ``SamplePool``.
        """
        if not isinstance(samplepool_to_extend, SamplePool):
            raise TypeError("Illegal pool type")

    def __iter__(self):
        """Return the pool itself so that it satisfies the iterator protocol."""
        return self

    @abstractmethod
    def __next__(self):
        """Return the next chunk of samples."""
        pass

class InMemorySamplePool(SamplePool):
    """Sample pool backed by an in-memory ndarray, read cyclically in fixed-size chunks."""

    def __init__(self, samples, chunk_size = 1000):
        """
        Args:
            samples: ndarray of samples, indexed along its first axis.
            chunk_size: number of samples returned by each ``__next__`` call.

        Raises:
            TypeError: if ``samples`` is not an ndarray.
        """
        super(InMemorySamplePool, self).__init__(chunk_size=chunk_size)
        if not isinstance(samples, np.ndarray):
            raise TypeError("samples must be an ndarray")
        self.samples = samples
        # Position in ``samples`` at which the next chunk starts.
        self.iter_index = 0

    def reset(self):
        """Restart reading from the first sample."""
        self.iter_index = 0

    def extend(self, samplepool_to_extend):
        """Append the samples of another in-memory pool to this pool.

        Raises:
            TypeError: if the argument is not a ``SamplePool`` or has no ``samples``.
        """
        super(InMemorySamplePool, self).extend(samplepool_to_extend)
        if not hasattr(samplepool_to_extend, "samples"):
            raise TypeError("Illegal pool type")
        self.samples = np.concatenate([self.samples, samplepool_to_extend.samples])

    def __next__(self):
        """Return the next ``chunk_size`` samples, wrapping around at the end.

        The positions of the returned samples within ``samples`` are written to
        ``chunk_indices_list``. Modular indexing keeps the chunk exactly
        ``chunk_size`` long even when the chunk is larger than the pool.

        Raises:
            ValueError: if the pool holds no samples.
        """
        total = len(self.samples)
        if total == 0:
            raise ValueError("sample pool is empty")
        positions = (self.iter_index + np.arange(self.chunk_size)) % total
        self.chunk_indices_list[:] = positions
        self.iter_index = (self.iter_index + self.chunk_size) % total
        return self.samples[positions]

# Input files must be CSV files with a header row.
class OutMemorySamplePool(SamplePool):
    """Sample pool that streams one column from a cyclic list of CSV files in fixed-size chunks."""

    def __init__(self, sample_filepath_list, target_col_name, chunk_size = 1000):
        """
        Args:
            sample_filepath_list: non-empty list of CSV file paths, read in order and then cyclically.
            target_col_name: name of the CSV column holding the samples.
            chunk_size: number of samples returned by each ``__next__`` call.

        Raises:
            Exception: if ``sample_filepath_list`` is not a non-empty list.
        """
        super(OutMemorySamplePool, self).__init__(chunk_size=chunk_size)
        if sample_filepath_list is None or not isinstance(sample_filepath_list, list) or len(sample_filepath_list) == 0:
            raise Exception("filepath list is empty or not a list")
        # Copy so that extend() never mutates the list owned by the caller.
        self.sample_filepath_list = list(sample_filepath_list)
        self.target_col_name = target_col_name
        # Open pandas chunk reader for the current file, or None when a file must be opened.
        self.dataframe_iter = None
        # Index of the file that will be opened next.
        self.dataframe_iter_index = 0
        # Partial chunk carried over from the end of the previous file.
        self.__current_chunk = None
        # Running sample index of the start of the next chunk within one pass over all files.
        self.chunk_cumulation = 0

    def _close_reader(self):
        """Close and forget the current chunk reader, if one is open."""
        if self.dataframe_iter is not None:
            self.dataframe_iter.close()
            self.dataframe_iter = None

    def reset(self):
        """Restart reading from the first file and the first sample index."""
        self._close_reader()
        self.dataframe_iter_index = 0
        self.__current_chunk = None
        # Reading restarts at the first row, so the index counter restarts too.
        self.chunk_cumulation = 0

    def extend(self, samplepool_to_extend):
        """Append the file list of another out-of-memory pool to this pool."""
        super(OutMemorySamplePool, self).extend(samplepool_to_extend)
        self.sample_filepath_list.extend(samplepool_to_extend.sample_filepath_list)

    def __get_chunk__(self):
        """Read the next full chunk, stitching the tail of one file to the head of the next.

        Also fills ``chunk_indices_list`` with the running sample indices of the
        returned items; indices restart at 0 when reading wraps to the first file.
        """
        current_chunk_size = None
        file_count = len(self.sample_filepath_list)
        while(1):
            if self.dataframe_iter is None:
                self.dataframe_iter = pd.read_csv(self.sample_filepath_list[self.dataframe_iter_index], usecols=[self.target_col_name], iterator=True, chunksize=self.chunk_size, encoding="utf-8")
                self.dataframe_iter_index = (self.dataframe_iter_index + 1) % file_count
            try:
                if self.__current_chunk is not None:
                    # Top up the carried-over partial chunk from the newly opened file.
                    extra_chunk_size = self.chunk_size - len(self.__current_chunk)
                    extra_chunk = self.dataframe_iter.get_chunk(extra_chunk_size).to_dict("list")[self.target_col_name]
                    current_chunk = np.concatenate([self.__current_chunk, extra_chunk])
                    current_chunk_size = len(self.__current_chunk)
                    self.__current_chunk = None
                else:
                    current_chunk = self.dataframe_iter.get_chunk().to_dict("list")[self.target_col_name]
                if len(current_chunk) < self.chunk_size:
                    # Short read: keep the partial chunk and move on to the next file.
                    self.__current_chunk = current_chunk
                    self._close_reader()
                    continue
                else:
                    # The file just opened is the first one exactly when the (already advanced)
                    # index minus one is 0 modulo the file count; the modulo makes this hold
                    # for a single-file pool as well.
                    if (self.dataframe_iter_index - 1) % file_count == 0 and current_chunk_size is not None:
                        chunk_indices_1 = list(range(self.chunk_cumulation, self.chunk_cumulation + current_chunk_size))
                        chunk_indices_2 = list(range(0, extra_chunk_size))
                        self.chunk_indices_list[:] = np.concatenate([chunk_indices_1, chunk_indices_2])
                        self.chunk_cumulation = extra_chunk_size
                        current_chunk_size = None
                    else:
                        self.chunk_indices_list[:] = list(range(self.chunk_cumulation, self.chunk_cumulation + self.chunk_size))
                        self.chunk_cumulation += self.chunk_size
            except StopIteration:
                # The current file ended exactly on a chunk boundary.
                self._close_reader()
                if self.dataframe_iter_index == 0:
                    self.chunk_cumulation = 0
                continue
            return current_chunk

    def __next__(self):
        """Return the next chunk of samples."""
        return self.__get_chunk__()

In [7]:
class AbstractGenerator(metaclass=ABCMeta):
    """Interface of a resettable batch generator.

    ``batch_index`` is declared as an abstract property because the concrete
    generators in this notebook expose it as a property.
    """

    @property
    @abstractmethod
    def batch_index(self):
        """Number of batches produced so far."""
        pass

    @abstractmethod
    def reset(self):
        """Rewind the generator to its initial state."""
        pass

    @abstractmethod
    def __iter__(self):
        """Return the iterator object."""
        pass

    @abstractmethod
    def __next__(self):
        """Return the next batch."""
        pass

class ParagraphVectorGenerator(AbstractGenerator):
    """Base class for generators that slide a word window over paragraphs from a sample pool.

    Subclasses implement ``generate_batch``; this class handles pool access,
    optional frequent-word subsampling, padding and the batch counter.
    """

    def __init__(self, sample_pool, window_size = 10, batch_size = 10, max_nbatch = 5, unkown_word_indicator = 0, 
                 shuffle = False, idtype = np.int32, wordsStat = None, seed = 0):
        """
        Args:
            sample_pool: ``SamplePool`` that supplies chunks of paragraphs.
            window_size: number of words per window (at least 2).
            batch_size: number of windows per batch.
            max_nbatch: number of batches before ``StopIteration``; -1 means unlimited.
            unkown_word_indicator: word id used to pad paragraphs shorter than the window.
            shuffle: whether subclasses shuffle words and rows of a batch.
            idtype: dtype of the produced index arrays.
            wordsStat: optional mapping word -> object with ``sample_int``, used for subsampling.
            seed: seed of the generator's private ``RandomState``.

        Raises:
            Exception: if ``sample_pool`` is not a ``SamplePool`` or ``window_size < 2``.
        """
        if not isinstance(sample_pool, SamplePool):
            raise Exception("sample_pool must be an instance of SamplePool")
        if window_size < 2:
            raise Exception("window_size must be greater than or equal to 2")
        self.sample_pool = sample_pool
        self.window_size = window_size
        self.batch_size = batch_size
        self.max_nbatch = max_nbatch
        self._batch_index = 0
        self._chunks = None
        self._paragraph_index = 0
        self._word_index = 0
        self.unkown_word_indicator = unkown_word_indicator
        self.shuffle = shuffle
        self.idtype = idtype
        self.wordsStat = wordsStat
        self.seed = seed
        self.random =  np.random.RandomState(self.seed)
        # Cache of the most recently prepared paragraph, keyed by (chunk object, index).
        self.__last_chunks = None
        self.__last_paragraph_index = -1
        self.__last_paragraph_words = None
    
    def extend(self, generator_to_extend):
        """Append the sample pool of another generator to this generator's pool."""
        if not isinstance(generator_to_extend, ParagraphVectorGenerator):
            raise Exception("Illegal generator type")
        self.sample_pool.extend(generator_to_extend.sample_pool)
        
    @property
    def batch_index(self):
        """Number of batches requested since the last reset (counted only when ``max_nbatch != -1``)."""
        return self._batch_index
    
    def reset(self):
        """Rewind the pool and all cursors, and drop the cached paragraph."""
        self.sample_pool.reset()
        self._batch_index = 0
        self._chunks = None
        self._paragraph_index = 0
        self._word_index = 0
        self.__last_chunks = None
        self.__last_paragraph_index = -1
        self.__last_paragraph_words = None
        
    def check_and_prepad(self):
        """Return the current paragraph, subsampled and padded to at least ``window_size`` words.

        The result is cached per (chunk, paragraph index) so that every window
        taken from one paragraph sees the same randomly subsampled words. The
        chunk object is part of the key so a new chunk never reuses words
        cached for the same index of an earlier chunk.
        """
        if self.__last_chunks is self._chunks and self.__last_paragraph_index == self._paragraph_index:
            return self.__last_paragraph_words
        
        current_chunk_item = self._chunks[self._paragraph_index]
        if isinstance(current_chunk_item, str):
            if current_chunk_item.startswith("nan"):
                # A missing value that was stringified; treat it as an empty paragraph.
                current_chunk_item = []
            else:
                # NOTE(security): eval() executes arbitrary code contained in the data.
                # If the strings are plain list literals, ast.literal_eval would be a
                # safe replacement — TODO confirm the stored format before switching.
                current_chunk_item = eval(current_chunk_item)
        # Detect NaN by value: not every NaN float is the np.nan singleton.
        if isinstance(current_chunk_item, float) and np.isnan(current_chunk_item):
            current_chunk_item = []
        if self.wordsStat is not None:
            # Keep a word when its threshold exceeds a uniform draw scaled to [0, 2**32).
            current_chunk_item = [w for w in current_chunk_item if w in self.wordsStat and 
                                  self.wordsStat[w].sample_int > self.random.rand() * 2 ** 32]
        if len(current_chunk_item) < self.window_size:
            # padding with unkown word indicator
            padding_size = self.window_size - len(current_chunk_item)
            current_chunk_item = np.pad(current_chunk_item, [(0, padding_size)], "constant", constant_values=[self.unkown_word_indicator] * 2)
        
        self.__last_chunks = self._chunks
        self.__last_paragraph_index = self._paragraph_index
        self.__last_paragraph_words = current_chunk_item
        
        return current_chunk_item
    
    def generate_batch(self):
        """Build and return one batch; must be provided by subclasses."""
        raise NotImplementedError("subclasses must implement generate_batch")
    
    def __iter__(self):
        return self
    
    def __next__(self):
        """Return the next batch, or stop after ``max_nbatch`` batches unless it is -1."""
        if self.max_nbatch != -1:
            self._batch_index += 1
            if self._batch_index > self.max_nbatch:
                raise StopIteration()
        return self.generate_batch()
    
# distributed memory model generator
class DMGenerator(ParagraphVectorGenerator):
    """Distributed-memory batches: ``window_size - 1`` context words predict the word that follows them."""

    def generate_batch(self):
        """Build one batch of sliding windows.

        Returns:
            ``(contexts, labels, paragraph_indices)`` with shapes
            ``(batch_size, window_size - 1)``, ``(batch_size, 1)`` and ``(batch_size, 1)``.
        """
        context_size = self.window_size - 1
        current_batches = np.zeros((self.batch_size, context_size), dtype=self.idtype)
        current_labels = np.zeros((self.batch_size, 1), dtype=self.idtype)
        current_paragraph_indices = np.zeros((self.batch_size, 1), dtype=self.idtype)
        for loop_i in range(self.batch_size):
            # Both cursors at zero means a fresh chunk is due.
            if self._paragraph_index == 0 and self._word_index == 0:
                self._chunks = next(self.sample_pool)
            current_paragraph = self.check_and_prepad()
            current_batch = current_paragraph[self._word_index: self._word_index + context_size].copy()
            if self.shuffle:
                # Use the generator's seeded RandomState so that `seed` controls shuffling.
                self.random.shuffle(current_batch)
            current_batches[loop_i, :] = current_batch
            current_labels[loop_i, :] = current_paragraph[self._word_index + context_size]
            current_paragraph_indices[loop_i, :] = self.sample_pool.chunk_indices_list[self._paragraph_index]
            self._word_index += 1
            # Move to the next paragraph once the label position would fall past its end.
            if self._word_index + context_size == len(current_paragraph):
                self._paragraph_index = (self._paragraph_index + 1) % self.sample_pool.chunk_size
                self._word_index = 0
        if self.shuffle:
            perm = self.random.permutation(current_batches.shape[0])
            current_batches = current_batches[perm]
            current_labels = current_labels[perm]
            current_paragraph_indices = current_paragraph_indices[perm]
        return current_batches, current_labels, current_paragraph_indices

# bag-of-words model generator
class BOWGenerator(ParagraphVectorGenerator):
    """Bag-of-words batches: each row is a full window of ``window_size`` words, without a label."""

    def generate_batch(self):
        """Build one batch of sliding windows.

        Returns:
            ``(windows, paragraph_indices)`` with shapes
            ``(batch_size, window_size)`` and ``(batch_size, 1)``.
        """
        current_batches = np.zeros((self.batch_size, self.window_size), dtype=self.idtype)
        current_paragraph_indices = np.zeros((self.batch_size, 1), dtype=self.idtype)
        for loop_i in range(self.batch_size):
            # Both cursors at zero means a fresh chunk is due.
            if self._paragraph_index == 0 and self._word_index == 0:
                self._chunks = next(self.sample_pool)
            current_paragraph = self.check_and_prepad()
            current_batch = current_paragraph[self._word_index: self._word_index + self.window_size].copy()
            if self.shuffle:
                # Use the generator's seeded RandomState so that `seed` controls shuffling.
                self.random.shuffle(current_batch)
            current_batches[loop_i, :] = current_batch
            current_paragraph_indices[loop_i, :] = self.sample_pool.chunk_indices_list[self._paragraph_index]
            self._word_index += 1
            # Move to the next paragraph once a full window no longer fits.
            if self._word_index + self.window_size - 1 == len(current_paragraph):
                self._paragraph_index = (self._paragraph_index + 1) % self.sample_pool.chunk_size
                self._word_index = 0
        if self.shuffle:
            perm = self.random.permutation(current_batches.shape[0])
            current_batches = current_batches[perm]
            current_paragraph_indices = current_paragraph_indices[perm]
        return current_batches, current_paragraph_indices
    
# continuous bag-of-words model generator
class CBOWGenerator(ParagraphVectorGenerator):
    """Continuous bag-of-words batches: the middle word of each window is predicted from the others."""

    def generate_batch(self):
        """Build one batch of sliding windows.

        Returns:
            ``(contexts, labels, paragraph_indices)`` with shapes
            ``(batch_size, window_size - 1)``, ``(batch_size, 1)`` and ``(batch_size, 1)``.
        """
        center = self.window_size // 2
        current_batches = np.zeros((self.batch_size, self.window_size - 1), dtype=self.idtype)
        current_labels = np.zeros((self.batch_size, 1), dtype=self.idtype)
        current_paragraph_indices = np.zeros((self.batch_size, 1), dtype=self.idtype)
        for loop_i in range(self.batch_size):
            # Both cursors at zero means a fresh chunk is due.
            if self._paragraph_index == 0 and self._word_index == 0:
                self._chunks = next(self.sample_pool)
            current_paragraph = self.check_and_prepad()
            current_batch = current_paragraph[self._word_index: self._word_index + self.window_size]
            target_word = current_batch[center]
            # np.delete returns a new array, so the cached paragraph is left untouched.
            current_batch = np.delete(current_batch, [center])
            if self.shuffle:
                # Use the generator's seeded RandomState so that `seed` controls shuffling.
                self.random.shuffle(current_batch)
            current_batches[loop_i, :] = current_batch
            current_labels[loop_i, :] = target_word
            current_paragraph_indices[loop_i, :] = self.sample_pool.chunk_indices_list[self._paragraph_index]
            self._word_index += 1
            # Move to the next paragraph once a full window no longer fits.
            if self._word_index + self.window_size - 1 == len(current_paragraph):
                self._paragraph_index = (self._paragraph_index + 1) % self.sample_pool.chunk_size
                self._word_index = 0
        if self.shuffle:
            perm = self.random.permutation(current_batches.shape[0])
            current_batches = current_batches[perm]
            current_labels = current_labels[perm]
            current_paragraph_indices = current_paragraph_indices[perm]
        return current_batches, current_labels, current_paragraph_indices

In [491]:
# Toy pools for manual testing: a small in-memory pool of ragged id lists (including one NaN
# entry) and two CSV-backed pools.
# NOTE(review): recent NumPy versions are understood to reject ragged nested sequences unless
# dtype=object is given — confirm against the installed version.
# NOTE(review): the CSV paths are absolute and machine-specific.
imPool = InMemorySamplePool(np.array([[5],[0,1,2,3,4],[1,2,3,4,5],[2,3,4,5,6],[3,4,5,6,7,8],[4,5,6],
                                      np.nan, [5,6,7,8],[6,7,8,9,10,11],[7,8,9,10,11],[8,9,10],[9,10,11,12]]), chunk_size=5)
omPoolNumber = OutMemorySamplePool(["E:/kaggle/avito/preprocessing/test.data", "E:/kaggle/avito/preprocessing/test2.data"], target_col_name="c", chunk_size=5)
omPool = OutMemorySamplePool(["E:/kaggle/avito/preprocessing/train_descriptions.data", "E:/kaggle/avito/preprocessing/train_active_descriptions.data"], target_col_name="description", chunk_size=10)

In [492]:
# One generator of each kind over the same CSV-backed pool; max_nbatch=-1 means no batch limit.
# All three hold the same pool object, so a chunk read by one is not seen by the others.
dmgener = DMGenerator(omPoolNumber, window_size=3, batch_size=5, max_nbatch=-1)
bowgener = BOWGenerator(omPoolNumber, window_size=3, batch_size=5, max_nbatch=-1)
cbowgener = CBOWGenerator(omPoolNumber, window_size=3, batch_size=5, max_nbatch=-1, shuffle=True)

In [None]:
# Manual check: pull one shuffled CBOW batch from the toy pool.
cbowgener.__next__()

In [498]:
# IMDB training data as an in-memory pool and as a CSV-backed pool, both with 1000-sample chunks.
imdbImTrainingPool = InMemorySamplePool(X_train, chunk_size=1000)
imdbOmTrainingPool = OutMemorySamplePool(["E:/kaggle/avito/imdb_testset/train.data"], target_col_name="description", chunk_size=1000)

In [499]:
# Generators over the in-memory IMDB pool; the three share one pool object and therefore its read position.
imdbDmGener = DMGenerator(imdbImTrainingPool, window_size=8, batch_size=256, max_nbatch=-1)
imdbBowGener = BOWGenerator(imdbImTrainingPool, window_size=8, batch_size=256, max_nbatch=-1)
# Single-window batches with frequent-word subsampling enabled through wordsStat.
imdbCBowGener = CBOWGenerator(imdbImTrainingPool, window_size=11, batch_size=1, max_nbatch=-1, shuffle=False, wordsStat=wordsStat)

# The same three generator kinds over the CSV-backed IMDB pool.
imdbDmGener_Om = DMGenerator(imdbOmTrainingPool, window_size=8, batch_size=256, max_nbatch=-1)
imdbBowGener_Om = BOWGenerator(imdbOmTrainingPool, window_size=8, batch_size=256, max_nbatch=-1)
imdbCBowGener_Om = CBOWGenerator(imdbOmTrainingPool, window_size=8, batch_size=256, max_nbatch=-1)

In [501]:
# Manual check: one CBOW batch; the output is (context words, target word, paragraph index).
imdbCBowGener.__next__()

(array([[ 973, 1622, 1385,   65,  458,   66, 3941,  173,  256,  100]]),
 array([[4468]]),
 array([[0]]))

In [8]:
# paragraph vector distributed memory
class ParagraphVectorDmNet(object):
    """Paragraph-vector network (TensorFlow 1.x graph mode) trained with an NCE loss.

    Batches of (context words, target word, paragraph index) are pushed through
    a FIFO queue and consumed by the training op. ``mode`` selects how the
    paragraph vector and the context word vectors are combined:

    * ``"concat"``: paragraph vector followed by all flattened word vectors.
    * ``"average"``: paragraph vector followed by the mean word vector.
    * ``"sum_concat"``: paragraph vector followed by the summed word vector.
    * ``"total_average"``: mean over the paragraph vector and all word vectors
      (requires equal word and paragraph embedding sizes).
    """

    def __init__(self, word_embedding_size = 400, paragraph_embedding_size = 400, q_batch_size = 256, batch_size = 256,
                 vocabulary_size = 10000, paragraph_size = 25000, window_size = 8, mode = "concat", wordsStat = None, worker = -1,
                 contrast_num_samples = 64, optimizer_type = tf.train.AdagradOptimizer, optimizer_kwargs = {}, eval_epochs = 1000):
        """
        Args:
            word_embedding_size: dimension of the word vectors.
            paragraph_embedding_size: dimension of the paragraph vectors.
            q_batch_size: number of examples enqueued per generator batch.
            batch_size: number of examples dequeued per training step.
            vocabulary_size: number of rows of the word embedding / NCE matrices.
            paragraph_size: number of rows of the paragraph embedding matrix.
            window_size: generator window size; the network sees ``window_size - 1`` context words.
            mode: combination mode, see the class docstring.
            wordsStat: word statistics handed to the generator for subsampling.
            worker: number of training threads; -1 (or any value <= 0) trains in the calling thread.
            contrast_num_samples: number of negative samples for the NCE loss.
            optimizer_type: optimizer class, called as ``optimizer_type(learning_rate, **optimizer_kwargs)``.
            optimizer_kwargs: extra keyword arguments for the optimizer (copied, never mutated).
            eval_epochs: number of steps between two loss reports.

        Raises:
            Exception: on inconsistent batch sizes or embedding sizes.
        """
        self.word_embedding_size = word_embedding_size
        self.paragraph_embedding_size = paragraph_embedding_size
        self.vocabulary_size = vocabulary_size
        self.paragraph_size = paragraph_size
        self.window_size = window_size
        self.q_batch_size = q_batch_size
        self.batch_size = batch_size
        if worker == -1 and q_batch_size != batch_size:
            raise Exception("q_batch_size must be equal to batch_size when worker = -1")
        self.mode = mode
        self.wordsStat = wordsStat
        if self.mode == "total_average" and self.word_embedding_size != self.paragraph_embedding_size:
            raise Exception("word_embedding_size != paragraph_embedding_size in total_average mode")
        self.contrast_num_samples = contrast_num_samples
        self.nce_weights = None
        self.nce_biases = None
        # Initialised here so that transform() before fit() reaches the explicit check in build_net.
        self.word_embeddings = None
        self.optimizer_type = optimizer_type
        # Copy: the default dict object is shared between all calls of __init__.
        self.optimizer_kwargs = dict(optimizer_kwargs) if optimizer_kwargs else {}
        self.worker = worker
        self.eval_epochs = eval_epochs
        self.force_build_net = True
        self.tensor_map = None
        
    def build_net(self, learning_rate = 1.0, training = True, nce_weights = None, nce_biases = None, word_embeddings = None):
        """Build a fresh graph and return its tensors and ops in a dict.

        Args:
            learning_rate: learning rate handed to the optimizer.
            training: when True, word embeddings and NCE parameters are trainable
                variables; otherwise they are constants built from the given arrays
                and only the paragraph embeddings are trained.
            nce_weights, nce_biases, word_embeddings: arrays required when ``training`` is False.

        Returns:
            dict with the keys of the original implementation plus ``"enqueue_op"`` and
            ``"enqueue_placeholders"`` (inputs, labels, paragraph indices), which allow
            feeding the queue without adding new nodes to the graph on every step.

        Raises:
            Exception: if a required array is missing when ``training`` is False.
            NotImplementedError: for an unknown ``mode``.
        """
        graph = tf.Graph()
        with graph.as_default():
            tf_queue = tf.FIFOQueue(self.q_batch_size * 2, 
                                    dtypes=[tf.int32, tf.int32, tf.int32], 
                                    shapes=[[self.window_size - 1], [1], [1]])
            # A single reusable enqueue op fed through placeholders.
            queue_inputs = tf.placeholder(tf.int32, shape=[None, self.window_size - 1])
            queue_labels = tf.placeholder(tf.int32, shape=[None, 1])
            queue_paragraphs = tf.placeholder(tf.int32, shape=[None, 1])
            enqueue_op = tf_queue.enqueue_many([queue_inputs, queue_labels, queue_paragraphs])
            train_inputs, train_labels, paragraph_inputs = tf_queue.dequeue_many(self.batch_size)
            
            if training:
                word_embeddings = tf.Variable(tf.random_uniform([self.vocabulary_size, self.word_embedding_size], -1.0, 1.0))
            else:
                if word_embeddings is None:
                    raise Exception("word_embeddings cannot be None in testing stage")
                word_embeddings = tf.constant(word_embeddings)
            paragraph_embeddings = tf.Variable(tf.random_uniform([self.paragraph_size, self.paragraph_embedding_size], -1.0, 1.0))
            
            # shape: (None, window_size - 1, w_embed_size)
            word_embed = tf.nn.embedding_lookup(word_embeddings, train_inputs)
            # shape: (None, 1, p_embed_size)
            paragraph_embed = tf.nn.embedding_lookup(paragraph_embeddings, paragraph_inputs)
            
            if self.mode == "concat":
                # shape: (None, (window_size - 1) * w_embed_size)
                word_embed = tf.reshape(word_embed, shape=(-1, word_embed.shape[1] * word_embed.shape[2]))
                # shape: (None, p_embed_size)
                paragraph_embed = tf.squeeze(paragraph_embed, [1])
                # shape: (None, (window_size - 1) * w_embed_size + p_embed_size)
                train_embed = tf.concat([paragraph_embed, word_embed], axis=1)
            elif self.mode == "average":
                # shape: (None, w_embed_size)
                word_embed = tf.reduce_mean(word_embed, axis=1)
                # shape: (None, p_embed_size)
                paragraph_embed = tf.squeeze(paragraph_embed, [1])
                train_embed = tf.concat([paragraph_embed, word_embed], axis=1)
            elif self.mode == "total_average":
                # shape: (None, window_size, embed_size), then averaged over the window axis
                train_embed = tf.concat([paragraph_embed, word_embed], axis=1)
                train_embed = tf.reduce_mean(train_embed, axis=1)
            elif self.mode == "sum_concat":
                # shape: (None, w_embed_size)
                word_embed = tf.reduce_sum(word_embed, axis=1)
                # shape: (None, p_embed_size)
                paragraph_embed = tf.squeeze(paragraph_embed, [1])
                train_embed = tf.concat([paragraph_embed, word_embed], axis=1)
            else:
                raise NotImplementedError("unknown mode")
            
            embedding_size = train_embed.shape[1].value
            if training:
                nce_weights = tf.Variable(tf.truncated_normal([self.vocabulary_size, embedding_size], stddev=1.0 / np.sqrt(embedding_size)))
                nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
            else:
                if nce_weights is None or nce_biases is None:
                    raise Exception("nce_weights and nce_biases cannot be None in testing stage")
                nce_weights = tf.constant(nce_weights)
                nce_biases = tf.constant(nce_biases)
            losses = tf.nn.nce_loss(weights=nce_weights,
                                                 biases=nce_biases,
                                                 labels=train_labels,
                                                 inputs=train_embed,
                                                 num_sampled=self.contrast_num_samples,
                                                 num_classes=self.vocabulary_size)
            loss = tf.reduce_mean(losses)
            optimizer = self.optimizer_type(learning_rate, **self.optimizer_kwargs)
            trainable_vars = tf.trainable_variables()
            # NOTE(review): gradients are computed separately for every example and all of
            # them are applied in one apply_gradients call, so each variable appears
            # batch_size times in the update list. The commented-out minimize(loss) below
            # is the conventional alternative — confirm that the per-example form is intended.
            gradients = []
            for i in range(self.batch_size):
                grads_and_vars = optimizer.compute_gradients(losses[i], trainable_vars)
                gradients.extend(grads_and_vars)
            train_op = optimizer.apply_gradients(gradients)
#             train_op = optimizer.minimize(loss)
            init = tf.global_variables_initializer()
        return {"train_inputs": train_inputs, "train_labels": train_labels, "paragraph_inputs": paragraph_inputs,
               "word_embeddings": word_embeddings, "paragraph_embeddings": paragraph_embeddings,
               "train_op": train_op, "init": init, "graph": graph, "nce_weights": nce_weights, "nce_biases": nce_biases,
               "loss": loss, "tf_queue": tf_queue, "enqueue_op": enqueue_op,
               "enqueue_placeholders": (queue_inputs, queue_labels, queue_paragraphs)}
    
    def _enqueue_batch(self, session, tensor_map, batches, labels, paragraph_indices):
        """Push one generator batch into the queue through the prebuilt enqueue op."""
        inputs_ph, labels_ph, paragraphs_ph = tensor_map["enqueue_placeholders"]
        session.run(tensor_map["enqueue_op"],
                    feed_dict={inputs_ph: batches, labels_ph: labels, paragraphs_ph: paragraph_indices})
    
    def _train_op(self, trainingGenerator, session, tensor_map, mode = "single_thread"):
        """Dispatch to the single-threaded or multi-threaded training loop."""
        if mode == "single_thread":
            return self._train_op_singlethread(trainingGenerator, session, tensor_map)
        else:
            return self._train_op_multithread(trainingGenerator, session, tensor_map)
    
    def _train_op_singlethread(self, trainingGenerator, session, tensor_map):
        """Enqueue each generator batch and train on it in the calling thread."""
        inner_steps = int(np.ceil(self.q_batch_size / self.batch_size))
        average_loss = 0
        loss_count = 0
        for step, (current_batches, current_labels, current_paragraph_indices) in enumerate(trainingGenerator):
            self._enqueue_batch(session, tensor_map, current_batches, current_labels, current_paragraph_indices)
            for train_inner_step in range(inner_steps):
                _, loss_val = session.run([tensor_map["train_op"], tensor_map["loss"]])
                average_loss += loss_val
                loss_count += 1
            if (step % self.eval_epochs == 0 and step > 0) :
                # Divide by the number of losses actually accumulated since the last report.
                average_loss /= loss_count
                print("Average loss at step/%s" % trainingGenerator.max_nbatch, step, ": ", average_loss)
                average_loss = 0
                loss_count = 0
    
    def _train_op_multithread(self, trainingGenerator, session, tensor_map):
        """Train in ``worker`` threads while the calling thread feeds the queue."""
        def _multithread_do_train(thread_id, coord, model, session, tensor_map):
            step = 0
            log_step = 0
            average_loss = 0
            while 1:
                try:
                    _, loss_val = session.run([tensor_map["train_op"], tensor_map["loss"]])
                    average_loss += loss_val
                    step += 1
                    if step * model.batch_size >= model.q_batch_size * model.eval_epochs:
                        average_loss /= step
                        log_step += 1
                        print("[Thread-%s] Average loss at step %s: %s" % (thread_id, log_step, average_loss))
                        average_loss = 0
                        step = 0
                except tf.errors.OutOfRangeError:
                    # queue closed, and no more elements left
                    break
                except Exception as exc:
                    # Still stop the worker, but report the failure instead of hiding it.
                    print("[Thread-%s] training stopped by error: %r" % (thread_id, exc))
                    break
        coord = tf.train.Coordinator()
        threads = [threading.Thread(target=_multithread_do_train, args=(i, coord, self, session, tensor_map)) for i in range(self.worker)]
        for t in threads:
            t.daemon = True
            t.start()
        for step, (current_batches, current_labels, current_paragraph_indices) in enumerate(trainingGenerator):
            self._enqueue_batch(session, tensor_map, current_batches, current_labels, current_paragraph_indices)
        coord.request_stop()
        # Closing the queue makes the workers' dequeue fail once it is drained, which ends them.
        tensor_map["tf_queue"].close().run()
        coord.join(threads)
    
    def _fit(self, trainingGenerator, learning_rate = 1.0, epochs = None, return_embeddings = False, training = True):
        """Configure the generator, (re)build the graph and run the training loop.

        The generator is reset and its ``batch_size``, ``wordsStat`` and ``window_size``
        are overwritten with the values of this model; ``max_nbatch`` is overwritten
        when ``epochs`` is given.

        Returns:
            dict with ``"paragraph_embeddings"`` and ``"word_embeddings"`` arrays when
            ``return_embeddings`` is True, otherwise None.
        """
        trainingGenerator.reset()
        if epochs is not None:
            trainingGenerator.max_nbatch = epochs
        trainingGenerator.batch_size = self.q_batch_size
        trainingGenerator.wordsStat = self.wordsStat
        trainingGenerator.window_size = self.window_size
        if self.force_build_net or self.tensor_map is None:
            if training:
                self.tensor_map = self.build_net(learning_rate=learning_rate)
            else:
                self.tensor_map = self.build_net(learning_rate=learning_rate, training=False, 
                                        nce_weights=self.nce_weights, nce_biases=self.nce_biases,
                                        word_embeddings=self.word_embeddings)
        tensor_map = self.tensor_map
        session = tf.Session(graph=tensor_map["graph"])
        self._session = session
        with session:
            tensor_map["init"].run()
            print("Initialized")
            if self.worker > 0:
                self._train_op(trainingGenerator, session, tensor_map, mode="multi_thread")
            else:
                self._train_op(trainingGenerator, session, tensor_map, mode="single_thread")
            if training:
                self.nce_weights = tensor_map["nce_weights"].eval()
                self.nce_biases = tensor_map["nce_biases"].eval()
                self.word_embeddings = tensor_map["word_embeddings"].eval()
            if return_embeddings:
                embeddings = {"paragraph_embeddings": tensor_map["paragraph_embeddings"].eval(),
                              "word_embeddings": tensor_map["word_embeddings"].eval()}
                return embeddings
    
    def fit(self, trainingGenerator, learning_rate = 1.0, epochs = None):
        """Train all parameters; stores the learned word embeddings and NCE parameters on the model."""
        self._fit(trainingGenerator=trainingGenerator, learning_rate=learning_rate, epochs=epochs, return_embeddings=False, training=True)
    
    def transform(self, testingGenerator, learning_rate = 1.0, epochs = None):
        """Train paragraph vectors only, with frozen word embeddings and NCE parameters; return the embeddings."""
        return self._fit(trainingGenerator=testingGenerator, learning_rate=learning_rate, epochs=epochs, return_embeddings=True, training=False)
    
    def fit_transform(self, trainingGenerator, learning_rate = 1.0, epochs = None):
        """Train all parameters and return the learned embeddings."""
        return self._fit(trainingGenerator=trainingGenerator, learning_rate=learning_rate, epochs=epochs, return_embeddings=True, training=True)
    
    def save(self, filepath):
        """Save NCE weights, NCE biases and word embeddings as one matrix with ``np.save``.

        ``np.save`` appends ``.npy`` to ``filepath`` when that suffix is missing.

        Raises:
            Exception: if the model has not been fitted or loaded yet.
        """
        if self.nce_weights is None or self.nce_biases is None or self.word_embeddings is None:
            raise Exception("nothing to save: the model has not been fitted or loaded")
        np.save(file=filepath, arr=np.concatenate([self.nce_weights, self.nce_biases.reshape((-1, 1)), self.word_embeddings], axis=1))
    
    def load(self, filepath):
        """Load parameters written by ``save``; accepts the path with or without the ``.npy`` suffix.

        The columns are split according to ``mode``, ``window_size`` and the embedding
        sizes of this model, which therefore must match those used when saving.
        """
        import os
        # save() goes through np.save, which appends ".npy"; accept the same path string here.
        if isinstance(filepath, str) and not os.path.exists(filepath) and os.path.exists(filepath + ".npy"):
            filepath = filepath + ".npy"
        nce_weights_and_bias_and_wordembed = np.load(file=filepath)
        if self.mode == "concat":
            embedding_size = self.paragraph_embedding_size + (self.window_size - 1) * self.word_embedding_size
        elif self.mode == "average" or self.mode == "sum_concat":
            embedding_size = self.paragraph_embedding_size + self.word_embedding_size
        elif self.mode == "total_average":
            embedding_size = self.paragraph_embedding_size
        else:
            raise NotImplementedError("unknown mode")
        self.nce_weights = nce_weights_and_bias_and_wordembed[:, :embedding_size]
        self.nce_biases = nce_weights_and_bias_and_wordembed[:, embedding_size]
        self.word_embeddings = nce_weights_and_bias_and_wordembed[:, embedding_size + 1: ]

In [9]:
# CBOW generators over train and test reviews, then the test samples are appended to the
# training pool so that paragraph vectors are learned for all reviews.
# window_size, batch_size and wordsStat are left at their defaults here because the network's
# _fit overwrites them on the generator.
imdbDmTrainingGener = CBOWGenerator(InMemorySamplePool(X_train, chunk_size=1000))
imdbDmTestingGener = CBOWGenerator(InMemorySamplePool(X_test, chunk_size=1000))
imdbDmTrainingGener.extend(imdbDmTestingGener)

In [10]:
# Number of samples in the combined pool (train + test).
imdbDmTrainingGener.sample_pool.samples.shape[0]

50000

In [11]:
def total_epochs(sample_num, total_sample_len, window_size, batch_size, round_num):
    """Number of batches needed to cover every full window ``round_num`` times.

    A sample of length ``n`` contributes ``n - window_size + 1`` windows, so all
    samples together contribute ``total_sample_len - window_size * sample_num + sample_num``.
    The result is rounded up and returned as a float (``np.ceil``).
    """
    windows_per_round = total_sample_len - window_size * sample_num + sample_num
    batches = windows_per_round / batch_size * round_num
    return np.ceil(batches)

# Batch count for two passes over the combined pool with window 11 and batch size 50.
# Subsampling through wordsStat is not accounted for in this estimate.
print(total_epochs(
    sample_num=imdbDmTrainingGener.sample_pool.samples.shape[0], 
    total_sample_len=np.sum(list(map(lambda x:len(x), imdbDmTrainingGener.sample_pool.samples))),
    window_size=11,
    batch_size=50,
    round_num=2
))

449518.0


In [12]:
# One paragraph vector per sample of the combined pool.
paragraph_size = imdbDmTrainingGener.sample_pool.samples.shape[0]
# "total_average" mode needs equal word and paragraph embedding sizes (100 each here);
# worker=-1 trains in the calling thread and requires q_batch_size == batch_size.
dmNet = ParagraphVectorDmNet(vocabulary_size=10000, paragraph_size=paragraph_size, window_size=11, mode="total_average", batch_size=50, 
                             word_embedding_size=100, paragraph_embedding_size=100, contrast_num_samples=5, wordsStat=wordsStat,
                             optimizer_type=tf.train.AdamOptimizer, q_batch_size=50, eval_epochs=100, worker=-1)

In [13]:
# Train on the combined pool and keep the learned embeddings.
# NOTE(review): in the recorded output the reported average loss rises steadily and the run
# ends with a KeyboardInterrupt, so training_vectors was not produced by this recorded run.
%time training_vectors = dmNet.fit_transform(imdbDmTrainingGener, epochs=450000, learning_rate=0.001)

Initialized
Average loss at step/450000 100 :  25.38000499725342
Average loss at step/450000 200 :  25.6086026096344
Average loss at step/450000 300 :  28.968923416137695
Average loss at step/450000 400 :  29.61231204986572
Average loss at step/450000 500 :  33.719533653259276
Average loss at step/450000 600 :  35.50120421409607
Average loss at step/450000 700 :  38.29745830535889
Average loss at step/450000 800 :  42.62316241264343
Average loss at step/450000 900 :  45.284265270233156
Average loss at step/450000 1000 :  47.68463788986206
Average loss at step/450000 1100 :  49.852623777389525
Average loss at step/450000 1200 :  50.44102087020874
Average loss at step/450000 1300 :  54.059986419677735
Average loss at step/450000 1400 :  53.488341655731205
Average loss at step/450000 1500 :  60.843355655670166
Average loss at step/450000 1600 :  60.42912744522095
Average loss at step/450000 1700 :  58.352783641815186
Average loss at step/450000 1800 :  64.35190250396728
Average loss at st

KeyboardInterrupt: 

In [None]:
# Disabled: infer paragraph vectors for the test generator with frozen word embeddings.
# %time testing_vectors = dmNet.transform(imdbDmTestingGener, epochs=90001, learning_rate=0.5)

In [None]:
# Persist NCE weights, biases and word embeddings through ParagraphVectorDmNet.save (np.save).
dmNet.save("E:/kaggle/avito/imdb_testset/dmnet_weights")

In [None]:
def check_acc_on_model(classifier, X, y, split_ratio = 0.3):
    """Fit ``classifier`` on a stratified split of ``(X, y)`` and print its validation accuracy.

    Args:
        classifier: estimator providing ``fit`` and ``predict_proba``.
        X: feature matrix.
        y: class labels.
        split_ratio: fraction of the data held out for validation.

    The most probable column of ``predict_proba`` is mapped back to a label through
    ``classifier.classes_`` when that attribute exists, so labels need not be
    ``0..k-1``; without it the column index itself is compared with the labels.
    """
    from sklearn.model_selection import train_test_split
    train_x, valid_x, train_y, valid_y = train_test_split(X, y, test_size=split_ratio, random_state=0, stratify=y)
    classifier.fit(train_x, train_y)
    proba_y = classifier.predict_proba(valid_x)
    best_columns = np.argmax(proba_y, axis=1)
    classes = getattr(classifier, "classes_", None)
    predicted = best_columns if classes is None else np.asarray(classes)[best_columns]
    print("acc: ", np.mean(predicted == np.asarray(valid_y)))

def check_acc_on_rf(X, y, split_ratio = 0.3):
    """Print the validation accuracy of a 500-tree random forest on ``(X, y)``."""
    from sklearn.ensemble import RandomForestClassifier
    forest = RandomForestClassifier(
        n_jobs=-1,
        n_estimators=500,
        max_depth=None,
        max_features="sqrt",
        random_state=0,
    )
    check_acc_on_model(forest, X, y, split_ratio)
    
def check_acc_on_logit(X, y, split_ratio = 0.3):
    """Print the validation accuracy of a default logistic regression on ``(X, y)``."""
    from sklearn.linear_model import LogisticRegression
    check_acc_on_model(LogisticRegression(), X, y, split_ratio)
    
def check_acc_on_mlp(X, y, split_ratio = 0.3):
    """Print the validation accuracy of a one-hidden-layer (1024 units) MLP on ``(X, y)``."""
    import sklearn.neural_network
    mlp = sklearn.neural_network.MLPClassifier(
        hidden_layer_sizes=[1024],
        learning_rate="adaptive",
    )
    check_acc_on_model(mlp, X, y, split_ratio)

In [None]:
# Shape of the learned paragraph embedding matrix.
training_vectors["paragraph_embeddings"].shape

In [None]:
# The first 25000 rows are passed with y_train: the pool was built as train samples
# followed by test samples.
check_acc_on_logit(training_vectors["paragraph_embeddings"][:25000], y_train)

In [None]:
# Disabled: the same check on vectors from dmNet.transform (testing_vectors is never created above).
# check_acc_on_logit(testing_vectors["paragraph_embeddings"][:25000], y_test)

In [19]:
# Persist the learned paragraph embeddings; np.save appends ".npy" to the given path.
np.save("E:/kaggle/avito/imdb_testset/training_paragraph_embeddings", training_vectors["paragraph_embeddings"])

In [49]:
def plot_vectors(vectors, labels):
    """Project ``vectors`` to 2-D with t-SNE and show a scatter plot coloured by ``labels``.

    ``labels`` indexes a two-entry colour array ("b", "g"), so it presumably holds
    integer labels 0/1 — TODO confirm against callers.
    """
    import matplotlib.pyplot as plt
    # IPython magic: this function only works when defined inside an IPython/Jupyter session.
    %matplotlib inline
    from sklearn.manifold import TSNE
    tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
    low_dim_embs = tsne.fit_transform(vectors)
    plt.figure(figsize=(12, 12))
    plt.scatter(low_dim_embs[:, 0], low_dim_embs[:, 1], c=np.array(["b", "g"])[labels])
    plt.show()

In [2]:
# Scratch session on the default graph for the FIFOQueue experiments below.
sess = tf.Session()

In [3]:
# Scratch queue holding pairs of float32 tensors with element shapes [1] and [2].
tf_queue = tf.FIFOQueue(10, dtypes=[tf.float32, tf.float32], shapes=[[1], [2]])
# Dequeue one element per run.
tf_a, tf_b = tf_queue.dequeue_many(1)
# Join both parts along axis 1, sum along that axis, then divide the sum by 2.
tf_ab = tf.concat([tf_a, tf_b], axis=1)
tf_sum = tf.reduce_sum(tf_ab, axis=1)
tf_mean = tf.div(tf_sum, 2)

In [4]:
# Enqueue three elements at once.
sess.run(tf_queue.enqueue_many(([[1], [2], [3]], [[2,3],[3,4],[4,5]])))

In [7]:
# Close the queue so that no further elements can be enqueued.
sess.run(tf_queue.close())

In [None]:
# Try to dequeue a single element from the queue after it has been closed.
sess.run(tf_queue.dequeue())

In [23]:
# Release the scratch session.
sess.close()