# similarity_unseen_docsの内容

In [1]:
import math

import numpy as np
import scipy
import scipy.linalg as LA
import scipy.sparse

In [20]:
def similarity_unseen_docs(self, doc_words1, doc_words2, alpha=None, min_alpha=None, steps=None):
    """Compute cosine similarity between two post-bulk out of training documents.

    A vector is inferred for each document with `self.infer_vector`, both vectors
    are scaled to unit length, and their dot product is returned.

    Parameters
    ----------
    doc_words1 : list of str
        Input document.
    doc_words2 : list of str
        Input document.
    alpha : float, optional
        The initial learning rate.
    min_alpha : float, optional
        Learning rate will linearly drop to `min_alpha` as training progresses.
    steps : int, optional
        Number of epochs to train each new document.

    Returns
    -------
    float
        The cosine similarity between `doc_words1` and `doc_words2`.
    """
    # Forward `steps` under the `epochs` keyword so that the requested number
    # of inference passes is actually passed on to `infer_vector`.
    d1 = self.infer_vector(doc_words=doc_words1, alpha=alpha, min_alpha=min_alpha, epochs=steps)
    d2 = self.infer_vector(doc_words=doc_words2, alpha=alpha, min_alpha=min_alpha, epochs=steps)
    # The dot product of two unit-length vectors is their cosine similarity.
    return np.dot(matutils.unitvec(d1), matutils.unitvec(d2))

In [4]:
def unitvec(vec, norm='l2', return_norm=False):
    """Scale a vector to unit length.

    Parameters
    ----------
    vec : {numpy.ndarray, scipy.sparse, list of (int, float)}
        Input vector in any format.
    norm : {'l1', 'l2', 'unique'}, optional
        Metric to normalize in.
    return_norm : bool, optional
        Return the length of vector `vec`, in addition to the normalized vector itself?

    Returns
    -------
    {numpy.ndarray, scipy.sparse, list of (int, float)}
        Normalized vector in same format as `vec`.
    float
        Length of `vec` before normalization, if `return_norm` is set.
        For a zero-length or empty vector the reported length is 1.0.

    Raises
    ------
    ValueError
        If `norm` is not one of the supported norms, or `vec` is of an unknown type.

    Notes
    -----
    Zero-vector will be unchanged.
    """
    supported_norms = ('l1', 'l2', 'unique')
    if norm not in supported_norms:
        raise ValueError("'%s' is not a supported norm. Currently supported norms are %s." % (norm, supported_norms))

    if scipy.sparse.issparse(vec):  # True when `vec` is a scipy sparse matrix
        vec = vec.tocsr()  # convert to CSR format
        if norm == 'l1':
            veclen = np.sum(np.abs(vec.data))
        if norm == 'l2':
            veclen = np.sqrt(np.sum(vec.data ** 2))
        if norm == 'unique':
            veclen = vec.nnz
        if veclen > 0.0:
            if np.issubdtype(vec.dtype, np.integer):
                # Integer data cannot hold the scaled values. `np.float64` replaces
                # the `np.float` alias, which no longer exists in current NumPy.
                vec = vec.astype(np.float64)
            vec /= veclen
            return (vec, veclen) if return_norm else vec
        return (vec, 1.0) if return_norm else vec

    if isinstance(vec, np.ndarray):  # True when `vec` is a dense numpy array
        if norm == 'l1':
            veclen = np.sum(np.abs(vec))  # sum of absolute values
        if norm == 'l2':
            if vec.size == 0:
                veclen = 0.0
            else:
                veclen = blas_nrm2(vec)
                # Trace output kept from the notebook: shows the computed L2 norm.
                print('veclen:',veclen)
        if norm == 'unique':
            veclen = np.count_nonzero(vec)
        if veclen > 0.0:
            if np.issubdtype(vec.dtype, np.integer):
                # Same integer-to-float promotion as in the sparse branch.
                vec = vec.astype(np.float64)
            scaled = blas_scal(1.0 / veclen, vec).astype(vec.dtype)
            return (scaled, veclen) if return_norm else scaled
        return (vec, 1.0) if return_norm else vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except StopIteration:
        # Empty iterable: nothing to scale.
        return (vec, 1.0) if return_norm else vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format
        if norm == 'l1':
            length = float(sum(abs(val) for _, val in vec))
        if norm == 'l2':
            length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        if norm == 'unique':
            length = 1.0 * len(vec)
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        if return_norm:
            return ret_normalized_vec(vec, length), length
        return ret_normalized_vec(vec, length)

    raise ValueError("unknown input type")

In [8]:
def blas(name, ndarray):
    """Helper for getting the appropriate BLAS function, using :func:`scipy.linalg.get_blas_funcs`.
    Parameters
    ----------
    name : str
        Name(s) of BLAS functions, without the type prefix.
    ndarray : numpy.ndarray
        Arrays can be given to determine optimal prefix of BLAS routines.
    Returns
    -------
    object
        BLAS function for the needed operation on the given data type.
    """
    return scipy.linalg.get_blas_funcs((name,), (ndarray,))[0] #名前から使用可能なBLAS関数オブジェクトを返します。
# BLAS routines looked up once, using an empty float64 array to select the variant.
blas_nrm2 = blas('nrm2', np.empty(0, dtype=np.float64))
blas_scal = blas('scal', np.empty(0, dtype=np.float64))

疎行列（そぎょうれつ、英: sparse matrix）とは、成分のほとんどが零である行列のことをいう。スパース行列とも言う。 有限差分法、有限体積法、有限要素法などで離散化された偏微分方程式は一般に疎行列を係数行列とした連立一次方程式となる。

In [9]:
from gensim.models.doc2vec import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
# Load a saved Doc2Vec model from the working directory.
# NOTE(review): the file name suggests Japanese Wikipedia, DBOW, 300 dimensions — confirm.
model_doc = Doc2Vec.load("jawiki.doc2vec.dbow300d.model")

In [10]:
# Infer a vector for a one-token document (the token is the Japanese word for "car").
z = model_doc.infer_vector(['車'])

In [12]:
# Normalize the inferred vector with the default 'l2' norm; the notebook shows the result.
unitvec(z)

veclen: 0.7907881717694959


array([ 0.08972666, -0.0092322 , -0.04256337,  0.05482487,  0.10632727,
        0.02062949, -0.0560747 ,  0.06268494, -0.0562828 ,  0.02607832,
       -0.03893636, -0.04824613,  0.01007342, -0.00947893,  0.0035056 ,
        0.02899972, -0.02408149,  0.03620663, -0.00597921,  0.01082833,
        0.05455308,  0.01392252,  0.04128278,  0.03102316, -0.07902006,
        0.02307848, -0.04648721, -0.11421375, -0.05605326,  0.0289881 ,
       -0.14291629, -0.08959747,  0.04422378,  0.07865747, -0.06083259,
        0.01160509, -0.08184171, -0.01392379,  0.04465479,  0.01285873,
        0.02368837,  0.01367635, -0.09993049, -0.11114352,  0.02746045,
        0.01767351,  0.02436406, -0.06267492, -0.05237596,  0.14076704,
       -0.01748081, -0.01877484, -0.04558789, -0.142673  , -0.05778985,
        0.0200805 ,  0.00302748,  0.00614667,  0.05105071,  0.02074428,
        0.07087641,  0.02082591,  0.07308887,  0.02439552, -0.13460341,
       -0.05065057, -0.03573213,  0.05318211,  0.03366949,  0.00

In [40]:
# Check whether z has an integer dtype — the condition under which unitvec casts to float.
np.issubdtype(z.dtype, np.integer)

False

In [None]:
def infer_vector(self, doc_words, alpha=None, min_alpha=None, epochs=None, steps=None):
    """Infer a vector for given post-bulk training document.

    Notes
    -----
    Subsequent calls to this function may infer different representations for the same document.
    For a more stable representation, increase the number of epochs to assert a stricter convergence.

    Parameters
    ----------
    doc_words : list of str
        A document for which the vector representation will be inferred.
    alpha : float, optional
        The initial learning rate. If unspecified, value from model initialization will be reused.
    min_alpha : float, optional
        Learning rate will linearly drop to `min_alpha` over all inference epochs. If unspecified,
        value from model initialization will be reused.
    epochs : int, optional
        Number of times to train the new document. Larger values take more time, but may improve
        quality and run-to-run stability of inferred vectors. If unspecified, the `epochs` value
        from model initialization will be reused.
    steps : int, optional
        Alternative spelling of `epochs`; used only when `epochs` is not given.

    Returns
    -------
    np.ndarray
        The inferred paragraph vector for the new document.

    Raises
    ------
    TypeError
        If `doc_words` is a single string instead of a list of strings.
    """
    if isinstance(doc_words, str):  # a common mistake; fail with a nicer error
        raise TypeError("Parameter doc_words of infer_vector() must be a list of strings (not a single string).")

    alpha = alpha or self.alpha
    min_alpha = min_alpha or self.min_alpha
    # Honour `steps` when `epochs` is not supplied, instead of silently dropping it.
    epochs = epochs or steps or self.epochs

    # Starting vector, seeded from the document text, shaped as a single-row matrix.
    doctag_vectors = pseudorandom_weak_vector(self.dv.vector_size, seed_string=' '.join(doc_words))
    doctag_vectors = doctag_vectors.reshape(1, self.dv.vector_size)

    doctags_lockf = np.ones(1, dtype=REAL)
    doctag_indexes = [0]
    work = zeros(self.layer1_size, dtype=REAL)  # notebook annotation: layer1_size = vector_size
    if not self.sg:
        # Notebook annotation: sg = (1 + dm) % 2, where dm=1 selects 'distributed memory'
        # (PV-DM) and otherwise 'distributed bag of words' (PV-DBOW) is used.
        # `neu1` is only passed to the two DM training routines below.
        neu1 = matutils.zeros_aligned(self.layer1_size, dtype=REAL)

    # Per-epoch decrement taking alpha linearly down to min_alpha;
    # max(..., 1) avoids dividing by zero when there is a single epoch.
    alpha_delta = (alpha - min_alpha) / max(epochs - 1, 1)

    for i in range(epochs):
        if self.sg:
            train_document_dbow(
                self, doc_words, doctag_indexes, alpha, work,
                learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf
            )
        elif self.dm_concat:
            train_document_dm_concat(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf
            )
        else:
            train_document_dm(
                self, doc_words, doctag_indexes, alpha, work, neu1,
                learn_words=False, learn_hidden=False, doctag_vectors=doctag_vectors, doctags_lockf=doctags_lockf
            )
        alpha -= alpha_delta

    return doctag_vectors[0]

In [None]:
def pseudorandom_weak_vector(size, seed_string=None, hashfxn=hash):
    """Return a small random vector of length `size`.

    When `seed_string` is truthy, the generator is seeded from the low 32 bits of
    `hashfxn(seed_string)`; otherwise `utils.default_prng` is used.
    Useful for initializing KeyedVectors that will be the starting projection/input layers of _2Vec models.
    """
    if not seed_string:
        rng = utils.default_prng
    else:
        seed = hashfxn(seed_string) & 0xffffffff
        rng = np.random.Generator(np.random.SFC64(seed))
    # Draw from [0, 1), center around zero, then shrink by the vector length.
    draws = rng.random(size).astype(REAL)  # notebook annotation: REAL = np.float32
    return (draws - 0.5) / size


In [None]:
def zeros_aligned(shape, dtype, order='C', align=128):
    """Allocate a zero-filled array whose data starts on an `align`-byte boundary.

    Parameters
    ----------
    shape : int or (int, int)
        Shape of array.
    dtype : data-type
        Data type of array.
    order : {'C', 'F'}, optional
        Whether to store multidimensional data in C- or Fortran-contiguous (row- or column-wise) order in memory.
    align : int, optional
        Boundary for alignment in bytes.

    Returns
    -------
    numpy.ndarray
        Aligned array.
    """
    item_bytes = np.dtype(dtype).itemsize
    # np.prod multiplies all entries of `shape`, giving the total element count.
    payload_bytes = np.prod(shape, dtype=np.int64) * item_bytes
    # Over-allocate by `align` bytes so an aligned window of the needed size always fits.
    # problematic on win64 ("maximum allowed dimension exceeded")
    raw = np.zeros(payload_bytes + align, dtype=np.uint8)
    # Number of bytes to skip to reach the next multiple of `align`.
    offset = -raw.ctypes.data % align
    window = raw[offset: offset + payload_bytes]
    return window.view(dtype).reshape(shape, order=order)