Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
694 lines (556 sloc) 25.5 KB
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2011 Radim Rehurek <radimrehurek@seznam.cz>
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html
"""
This module contains math helper functions.
"""
from __future__ import with_statement
import logging
import math
from gensim import utils
import numpy
import scipy.sparse
import scipy.linalg
from scipy.linalg.lapack import get_lapack_funcs
from six import iteritems, itervalues, string_types
from six.moves import xrange, zip as izip
# scipy is not a stable package yet, locations change, so try to work
# around differences (currently only concerns location of 'triu' in scipy 0.7 vs. 0.8)
try:
    from scipy.linalg.basic import triu
except ImportError:
    try:
        from scipy.linalg.special_matrices import triu
    except ImportError:
        # neither scipy location provides `triu` (it was dropped from recent
        # scipy releases); numpy ships an equivalent function
        from numpy import triu
try:
    from numpy import triu_indices
except ImportError:
    # numpy < 1.4 does not provide `triu_indices`, so emulate it
    def triu_indices(n, k=0):
        """
        Return the (row indices, column indices) of the non-zero entries left
        after applying `triu(..., k)` to an `n` x `n` matrix of ones.
        """
        upper = triu(numpy.ones((n, n), int), k)
        return numpy.where(upper != 0)
def blas(name, ndarray):
    """
    Return the BLAS routine called `name`, as selected by
    `scipy.linalg.get_blas_funcs` for the array `ndarray`.
    """
    routines = scipy.linalg.get_blas_funcs((name,), (ndarray,))
    return routines[0]
try:
    # with bottleneck installed, we can use faster partial sorting
    import bottleneck
    if not hasattr(bottleneck, 'argpartsort'):
        # this bottleneck build has no `argpartsort`; behave as if bottleneck
        # was not installed, instead of failing later at call time
        raise ImportError("bottleneck.argpartsort is not available")

    def argsort(x, topn=None):
        """
        Return indices of the `topn` greatest elements in numpy array `x`, in
        descending order of value.

        `topn=None` means all elements. A non-positive `topn` yields an empty list.
        """
        if topn is None:
            topn = x.size
        if topn <= 0:
            return []
        if topn >= x.size:
            return numpy.argsort(x)[::-1]
        biggest = bottleneck.argpartsort(x, x.size - topn)[-topn:]
        # the indices in `biggest` are not sorted by magnitude => sort & return
        return biggest.take(numpy.argsort(x.take(biggest))[::-1])
except ImportError:
    # no (usable) bottleneck => fall back to numpy
    def argsort(x, topn=None):
        """
        Return indices of the `topn` greatest elements in numpy array `x`, in
        descending order of value.

        `topn=None` means all elements. A non-positive `topn` yields an empty list.
        """
        if topn is None:
            topn = x.size
        if topn <= 0:
            # a negative `topn` would otherwise slice off the tail instead of
            # returning nothing
            return []
        return numpy.argsort(x)[::-1][:topn]
# module-level logger used by the functions and classes below
logger = logging.getLogger("gensim.matutils")
def corpus2csc(corpus, num_terms=None, dtype=numpy.float64, num_docs=None, num_nnz=None, printprogress=0):
    """
    Build a `scipy.sparse.csc_matrix` out of a streamed corpus; every document
    becomes one column of the result.

    When `num_terms`, `num_docs` and `num_nnz` are all known (passed in, or
    available as attributes of `corpus`), the output arrays are preallocated,
    which needs much less memory. Otherwise they are determined while iterating.

    The corpus is iterated over exactly once, so it may be a generator.
    A progress message is logged every `printprogress` documents (0 = never).

    This is the mirror function to `Sparse2Corpus`.
    """
    try:
        # corpora such as MmCorpus expose their dimensions as attributes; use
        # them to fill in whatever the caller did not supply
        if num_terms is None:
            num_terms = corpus.num_terms
        if num_docs is None:
            num_docs = corpus.num_docs
        if num_nnz is None:
            num_nnz = corpus.num_nnz
    except AttributeError:
        pass  # not a MmCorpus...
    if printprogress:
        logger.info("creating sparse matrix from corpus")

    dimensions_known = all(dim is not None for dim in (num_terms, num_docs, num_nnz))
    if dimensions_known:
        # preallocated version: faster and much more memory-friendly
        col_starts = [0]
        filled = 0
        row_ids = numpy.empty((num_nnz,), dtype=numpy.int32)  # HACK assume feature ids fit in 32bit integer
        weights = numpy.empty((num_nnz,), dtype=dtype)
        for docno, doc in enumerate(corpus):
            if printprogress and docno % printprogress == 0:
                logger.info("PROGRESS: at document #%i/%i" % (docno, num_docs))
            upto = filled + len(doc)
            row_ids[filled: upto] = [feature_id for feature_id, _ in doc]
            weights[filled: upto] = [feature_weight for _, feature_weight in doc]
            col_starts.append(upto)
            filled = upto
        assert filled == num_nnz, "mismatch between supplied and computed number of non-zeros"
        return scipy.sparse.csc_matrix((weights, row_ids, col_starts), shape=(num_terms, num_docs), dtype=dtype)

    # growing-lists version: the matrix dimensions are discovered on the fly
    weights, row_ids, col_starts = [], [], [0]
    num_nnz = 0
    for docno, doc in enumerate(corpus):
        if printprogress and docno % printprogress == 0:
            logger.info("PROGRESS: at document #%i" % (docno))
        row_ids.extend([feature_id for feature_id, _ in doc])
        weights.extend([feature_weight for _, feature_weight in doc])
        num_nnz += len(doc)
        col_starts.append(num_nnz)
    if num_terms is None:
        num_terms = max(row_ids) + 1 if row_ids else 0
    num_docs = len(col_starts) - 1
    # at this point num_docs, num_terms and num_nnz all hold their final values
    weights = numpy.asarray(weights, dtype=dtype)
    row_ids = numpy.asarray(row_ids)
    return scipy.sparse.csc_matrix((weights, row_ids, col_starts), shape=(num_terms, num_docs), dtype=dtype)
def pad(mat, padrow, padcol):
    """
    Return `mat` (a numpy.matrix) extended by `padrow` extra rows at the bottom
    and `padcol` extra columns on the right, all filled with zeros.

    Negative padding amounts are treated as zero.
    """
    padrow = max(padrow, 0)
    padcol = max(padcol, 0)
    nrows, ncols = mat.shape
    right_block = numpy.matrix(numpy.zeros((nrows, padcol)))
    bottom_block = numpy.matrix(numpy.zeros((padrow, ncols + padcol)))
    return numpy.bmat([[mat, right_block], [bottom_block]])
def zeros_aligned(shape, dtype, order='C', align=128):
    """Like `numpy.zeros()`, but the array's data starts at a multiple of `align` bytes."""
    itemsize = numpy.dtype(dtype).itemsize
    nbytes = numpy.prod(shape, dtype=numpy.int64) * itemsize
    # over-allocate by `align` bytes, so that an aligned window of `nbytes` always fits
    raw = numpy.zeros(nbytes + align, dtype=numpy.uint8)  # problematic on win64 ("maximum allowed dimension exceeded")
    offset = -raw.ctypes.data % align
    window = raw[offset : offset + nbytes]
    return window.view(dtype).reshape(shape, order=order)
def ismatrix(m):
    """Return True if `m` is a 2-dimensional numpy array or any scipy.sparse matrix."""
    if isinstance(m, numpy.ndarray) and m.ndim == 2:
        return True
    return scipy.sparse.issparse(m)
def any2sparse(vec, eps=1e-9):
    """
    Convert a numpy array, a scipy.sparse vector or a sequence of (id, weight)
    pairs into gensim document format (=list of 2-tuples).

    Weights whose magnitude does not exceed `eps` are left out.
    """
    if isinstance(vec, numpy.ndarray):
        return dense2vec(vec, eps)
    if scipy.sparse.issparse(vec):
        return scipy2sparse(vec, eps)
    # anything else is assumed to already be an iterable of (id, weight) pairs
    result = []
    for fid, fw in vec:
        if numpy.abs(fw) > eps:
            result.append((int(fid), float(fw)))
    return result
def scipy2sparse(vec, eps=1e-9):
    """
    Convert a scipy.sparse row vector (shape 1 x N) into gensim document
    format (=list of 2-tuples), skipping weights of magnitude <= `eps`.
    """
    row = vec.tocsr()
    assert row.shape[0] == 1
    result = []
    for pos, val in zip(row.indices, row.data):
        if numpy.abs(val) > eps:
            result.append((int(pos), float(val)))
    return result
class Scipy2Corpus(object):
    """
    Present a sequence of dense/sparse vectors as a streamed gensim corpus.

    This is the mirror function to `corpus2csc`.
    """

    def __init__(self, vecs):
        """
        `vecs` is a sequence of dense and/or sparse vectors, such as a 2d numpy array,
        or a scipy.sparse.csc_matrix, or any sequence containing a mix of 1d numpy/scipy vectors.
        """
        self.vecs = vecs

    def __iter__(self):
        """Yield each vector of `vecs` converted to gensim document format."""
        for vec in self.vecs:
            # numpy arrays are dense; everything else is handed to the scipy.sparse converter
            convert = full2sparse if isinstance(vec, numpy.ndarray) else scipy2sparse
            yield convert(vec)

    def __len__(self):
        """Return the number of vectors in `vecs`."""
        return len(self.vecs)
def sparse2full(doc, length):
    """
    Convert a document in sparse document format (=sequence of 2-tuples) into a dense
    numpy array (of size `length`).

    Ids are coerced to `int` and weights to `float`, so that ids given as floats
    or numpy scalars can still be used as array positions. If an id occurs more
    than once, the last weight wins. Returns a 1d float32 array.

    This is the mirror function to `full2sparse`.
    """
    result = numpy.zeros(length, dtype=numpy.float32)  # fill with zeroes (default value)
    doc = dict((int(termid), float(weight)) for termid, weight in doc)
    if doc:
        # overwrite some of the zeroes with explicit values
        result[list(doc)] = list(doc.values())
    return result
def full2sparse(vec, eps=1e-9):
    """
    Turn a dense numpy array into the sparse document format (sequence of
    (position, value) 2-tuples).

    Only entries whose magnitude exceeds `eps` are kept; the rest count as zero.

    This is the mirror function to `sparse2full`.
    """
    dense = numpy.asarray(vec, dtype=float)
    kept = numpy.nonzero(abs(dense) > eps)[0]
    values = dense.take(kept)
    return list(zip(kept, values))


# alias for `full2sparse`
dense2vec = full2sparse
def full2sparse_clipped(vec, topn, eps=1e-9):
    """
    Like `full2sparse`, but only return the `topn` elements of the greatest magnitude (abs).

    Entries of magnitude <= `eps` are never returned. The 2-tuples carry the
    original (signed) values. A non-positive `topn` yields an empty list.
    """
    # use numpy.argsort and only form tuples that are actually returned.
    # this is about 40x faster than explicitly forming all 2-tuples to run sort() or heapq.nlargest() on.
    if topn <= 0:
        return []
    vec = numpy.asarray(vec, dtype=float)
    magnitudes = abs(vec)
    nnz = numpy.nonzero(magnitudes > eps)[0]
    # rank by absolute value: ranking by the signed value would push large
    # negative weights to the end and clip them away
    biggest = nnz.take(argsort(magnitudes.take(nnz), topn))
    return list(zip(biggest, vec.take(biggest)))
def corpus2dense(corpus, num_terms, num_docs=None, dtype=numpy.float32):
    """
    Convert corpus into a dense numpy array (documents will be columns). You
    must supply the number of features `num_terms`, because dimensionality
    cannot be deduced from the sparse vectors alone.

    You can optionally supply `num_docs` (=the corpus length) as well, so that
    a more memory-efficient code path is taken. In that case the corpus must
    contain exactly `num_docs` documents.

    Returns an array of shape `(num_terms, number of documents)` and type `dtype`;
    an empty corpus gives an array with zero columns.

    This is the mirror function to `Dense2Corpus`.
    """
    if num_docs is not None:
        # we know the number of documents => don't bother column_stacking
        docno, result = -1, numpy.empty((num_terms, num_docs), dtype=dtype)
        for docno, doc in enumerate(corpus):
            result[:, docno] = sparse2full(doc, num_terms)
        assert docno + 1 == num_docs
    else:
        # column_stack needs a real sequence, not a generator
        columns = [sparse2full(doc, num_terms) for doc in corpus]
        if not columns:
            # nothing to stack; column_stack would raise on an empty list
            return numpy.empty((num_terms, 0), dtype=dtype)
        result = numpy.column_stack(columns)
    return result.astype(dtype)
class Dense2Corpus(object):
    """
    Present a dense numpy array as a sparse, streamed gensim corpus.

    The matrix is not copied, so later changes to it are visible through the corpus.

    This is the mirror function to `corpus2dense`.
    """

    def __init__(self, dense, documents_columns=True):
        """
        `dense` holds one document per column when `documents_columns` is set,
        and one document per row otherwise.
        """
        # store the matrix so that iterating over it goes document by document
        self.dense = dense.T if documents_columns else dense

    def __iter__(self):
        """Yield each document in sparse gensim format."""
        for row in self.dense:
            yield full2sparse(row.flat)

    def __len__(self):
        """Return the number of documents."""
        return len(self.dense)
#endclass DenseCorpus
class Sparse2Corpus(object):
    """
    Convert a matrix in scipy.sparse format into a streaming gensim corpus.

    This is the mirror function to `corpus2csc`.
    """

    def __init__(self, sparse, documents_columns=True):
        """
        `sparse` holds one document per column when `documents_columns` is set,
        and one document per row otherwise.
        """
        if documents_columns:
            self.sparse = sparse.tocsc()
        else:
            self.sparse = sparse.tocsr().T  # make sure shape[1]=number of docs (needed in len())

    def __iter__(self):
        """Yield each document (matrix column) as a list of (feature id, weight) 2-tuples."""
        indptr = self.sparse.indptr
        for indprev, indnow in zip(indptr, indptr[1:]):
            # build a real list: a lazy zip object can be consumed only once
            # and has no len(), which breaks consumers such as `corpus2csc`
            yield list(zip(self.sparse.indices[indprev:indnow], self.sparse.data[indprev:indnow]))

    def __len__(self):
        """Return the number of documents."""
        return self.sparse.shape[1]
#endclass Sparse2Corpus
def veclen(vec):
    """
    Return the Euclidean length of `vec`, a document in sparse gensim format
    (sequence of (id, value) 2-tuples). An empty document has length 0.0.
    """
    if len(vec) == 0:
        return 0.0
    sum_of_squares = sum(val ** 2 for _, val in vec)
    length = 1.0 * math.sqrt(sum_of_squares)
    assert length > 0.0, "sparse documents must not contain any explicit zero entries"
    return length
# BLAS routines for float (double precision) arrays, looked up once at import
# time; `unitvec` uses them to compute a vector's norm and to rescale it
blas_nrm2 = blas('nrm2', numpy.array([], dtype=float))
blas_scal = blas('scal', numpy.array([], dtype=float))
def unitvec(vec):
    """
    Scale a vector to unit length. The only exception is the zero vector, which
    is returned back unchanged.

    Output will be in the same format as input (i.e., gensim vector=>gensim vector,
    or numpy array=>numpy array, scipy.sparse=>scipy.sparse).

    Empty or non-iterable input is returned as is. Raises ValueError if the
    input is an iterable whose first item is not a 2-item tuple or list.
    """
    if scipy.sparse.issparse(vec):
        vec = vec.tocsr()
        norm = numpy.sqrt(numpy.sum(vec.data ** 2))
        if norm > 0.0:
            return vec / norm
        return vec

    if isinstance(vec, numpy.ndarray):
        vec = numpy.asarray(vec, dtype=float)
        norm = blas_nrm2(vec)
        if norm > 0.0:
            # NOTE(review): the BLAS `scal` routine may rescale a float64 input
            # array in place rather than return a copy -- confirm before relying on it
            return blas_scal(1.0 / norm, vec)
        return vec

    try:
        first = next(iter(vec))  # is there at least one element?
    except Exception:
        # empty or non-iterable input is passed through untouched; unlike a bare
        # `except:`, this lets KeyboardInterrupt and SystemExit propagate
        return vec

    if isinstance(first, (tuple, list)) and len(first) == 2:  # gensim sparse format?
        length = 1.0 * math.sqrt(sum(val ** 2 for _, val in vec))
        assert length > 0.0, "sparse documents must not contain any explicit zero entries"
        if length != 1.0:
            return [(termid, val / length) for termid, val in vec]
        return list(vec)
    raise ValueError("unknown input type")
def cossim(vec1, vec2):
    """
    Compute the cosine similarity of two documents in sparse gensim format.

    The similarity is a number between <-1.0, 1.0>, higher is more similar.
    If either document is empty, the result is 0.0.
    """
    vec1, vec2 = dict(vec1), dict(vec2)
    if not (vec1 and vec2):
        return 0.0
    norm1 = 1.0 * math.sqrt(sum(weight * weight for weight in vec1.values()))
    norm2 = 1.0 * math.sqrt(sum(weight * weight for weight in vec2.values()))
    assert norm1 > 0.0 and norm2 > 0.0, "sparse documents must not contain any explicit zero entries"
    # walk over the shorter document and look its ids up in the longer one
    if len(vec2) < len(vec1):
        shorter, longer = vec2, vec1
    else:
        shorter, longer = vec1, vec2
    dot = sum(weight * longer.get(termid, 0.0) for termid, weight in shorter.items())
    return dot / (norm1 * norm2)  # rescale by vector lengths
def qr_destroy(la):
    """
    Return QR decomposition of `la[0]`. Content of `la` gets destroyed in the process.

    Using this function should be less memory intense than calling `scipy.linalg.qr(la[0])`,
    because the memory used in `la[0]` is reclaimed earlier.

    `la` is a list whose first item is the 2d matrix to decompose; that item is
    deleted from the list. Returns the pair `(q, r)`, with `q` in Fortran
    (column-major) memory order.
    """
    a = numpy.asfortranarray(la[0])
    del la[0], la # now `a` is the only reference to the input matrix
    m, n = a.shape
    # perform q, r = QR(a); code hacked out of scipy.linalg.qr
    logger.debug("computing QR of %s dense matrix" % str(a.shape))
    geqrf, = get_lapack_funcs(('geqrf',), (a,))
    # the routine is called twice: first with lwork=-1, then with lwork=work[0]
    # taken from the first call (presumably the usual LAPACK workspace-size
    # query followed by the actual factorization -- confirm against LAPACK docs)
    qr, tau, work, info = geqrf(a, lwork=-1, overwrite_a=True)
    qr, tau, work, info = geqrf(a, lwork=work[0], overwrite_a=True)
    del a # free up mem
    assert info >= 0
    # R is the upper triangle of the leading n x n part of the factorized matrix
    r = triu(qr[:n, :n])
    if m < n: # rare case, #features < #topics
        qr = qr[:, :m] # retains fortran order
    # same two-call pattern as above, this time for `orgqr`, which produces `q`
    gorgqr, = get_lapack_funcs(('orgqr',), (qr,))
    q, work, info = gorgqr(qr, tau, lwork=-1, overwrite_a=True)
    q, work, info = gorgqr(qr, tau, lwork=work[0], overwrite_a=True)
    assert info >= 0, "qr failed"
    assert q.flags.f_contiguous
    return q, r
class MmWriter(object):
    """
    Store a corpus in Matrix Market format.

    Note that the output is written one document at a time, not the whole
    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
    which are larger than the available RAM.

    NOTE: the output file is created in a single pass through the input corpus, so
    that the input can be a once-only stream (iterator).
    To achieve this, a fake MM header is written first, statistics are collected
    during the pass (shape of the matrix, number of non-zeroes), followed by a seek
    back to the beginning of the file, rewriting the fake header with proper values.
    """

    HEADER_LINE = b'%%MatrixMarket matrix coordinate real general\n'  # the only supported MM format

    def __init__(self, fname):
        """
        Open `fname` for writing.

        Raises NotImplementedError for `.gz` and `.bz2` file names, because the
        header is rewritten in place at the end, which needs a seekable plain file.
        """
        self.fname = fname
        if fname.endswith(('.gz', '.bz2')):
            raise NotImplementedError("compressed output not supported with MmWriter")
        self.fout = open(self.fname, 'wb+')  # open for both reading and writing
        self.headers_written = False

    def write_headers(self, num_docs, num_terms, num_nnz):
        """
        Write the Matrix Market header line followed by the stats line.

        With a negative `num_nnz`, the stats line is a 50-space placeholder that
        `fake_headers` overwrites later.
        """
        self.fout.write(MmWriter.HEADER_LINE)

        if num_nnz < 0:
            # we don't know the matrix shape/density yet, so only log a general line
            logger.info("saving sparse matrix to %s", self.fname)
            self.fout.write(utils.to_utf8(' ' * 50 + '\n'))  # 48 digits must be enough for everybody
        else:
            logger.info("saving sparse %sx%s matrix with %i non-zero entries to %s",
                        num_docs, num_terms, num_nnz, self.fname)
            self.fout.write(utils.to_utf8('%s %s %s\n' % (num_docs, num_terms, num_nnz)))
        self.last_docno = -1
        self.headers_written = True

    def fake_headers(self, num_docs, num_terms, num_nnz):
        """
        Overwrite the placeholder stats line with the real matrix statistics.

        Raises ValueError if the statistics do not fit into the 50 reserved characters.
        """
        stats = '%i %i %i' % (num_docs, num_terms, num_nnz)
        if len(stats) > 50:
            raise ValueError('Invalid stats: matrix too large!')
        self.fout.seek(len(MmWriter.HEADER_LINE))
        self.fout.write(utils.to_utf8(stats))

    def write_vector(self, docno, vector):
        """
        Write a single sparse vector to the file.

        Sparse vector is any iterable yielding (field id, field value) pairs.
        Documents must be written in strictly increasing `docno` order.

        Returns the pair (greatest field id written, number of entries written),
        or (-1, 0) if nothing was written.
        """
        assert self.headers_written, "must write Matrix Market file headers before writing data!"
        assert self.last_docno < docno, "documents %i and %i not in sequential order!" % (self.last_docno, docno)
        vector = sorted((i, w) for i, w in vector if abs(w) > 1e-12)  # ignore near-zero entries
        for termid, weight in vector:  # write term ids in sorted order
            # +1 because MM format starts counting from 1
            self.fout.write(utils.to_utf8("%i %i %s\n" % (docno + 1, termid + 1, weight)))
        self.last_docno = docno
        return (vector[-1][0], len(vector)) if vector else (-1, 0)

    @staticmethod
    def write_corpus(fname, corpus, progress_cnt=1000, index=False, num_terms=None, metadata=False):
        """
        Save the vector space representation of an entire corpus to disk.

        Note that the documents are processed one at a time, so the whole corpus
        is allowed to be larger than the available RAM.

        Progress is logged every `progress_cnt` documents (0 or None disables it).
        With `index` set, the list of file offsets of the individual documents is
        returned (-1 marks an empty document); otherwise nothing is returned.
        `num_terms` overrides the number of features detected in the corpus.
        With `metadata` set and a corpus that has a `metadata` attribute, each
        document is expected to be a (bow, data) pair and the collected data is
        pickled into `fname + '.metadata.cpickle'`.
        """
        mw = MmWriter(fname)
        restore_metadata = False
        try:
            # write empty headers to the file (with enough space to be overwritten later)
            mw.write_headers(-1, -1, -1)  # will print 50 spaces followed by newline on the stats line

            # calculate necessary header info (nnz elements, num terms, num docs) while writing out vectors
            _num_terms, num_nnz = 0, 0
            docno, poslast = -1, -1
            offsets = []
            if hasattr(corpus, 'metadata'):
                orig_metadata = corpus.metadata
                corpus.metadata = metadata
                restore_metadata = True
                if metadata:
                    docno2metadata = {}
            else:
                metadata = False
            for docno, doc in enumerate(corpus):
                if metadata:
                    bow, data = doc
                    docno2metadata[docno] = data
                else:
                    bow = doc
                if progress_cnt and docno % progress_cnt == 0:
                    logger.info("PROGRESS: saving document #%i", docno)
                if index:
                    posnow = mw.fout.tell()
                    if posnow == poslast:
                        # the previous document wrote nothing => it was empty
                        offsets[-1] = -1
                    offsets.append(posnow)
                    poslast = posnow
                max_id, veclen = mw.write_vector(docno, bow)
                _num_terms = max(_num_terms, 1 + max_id)
                num_nnz += veclen
            if metadata:
                utils.pickle(docno2metadata, fname + '.metadata.cpickle')

            num_docs = docno + 1
            num_terms = num_terms or _num_terms

            if num_docs * num_terms != 0:
                logger.info("saved %ix%i matrix, density=%.3f%% (%i/%i)",
                            num_docs, num_terms,
                            100.0 * num_nnz / (num_docs * num_terms),
                            num_nnz,
                            num_docs * num_terms)

            # now write proper headers, by seeking and overwriting the spaces written earlier
            mw.fake_headers(num_docs, num_terms, num_nnz)
        finally:
            # always hand the corpus back in its original state and release the
            # file, even if writing failed half-way
            if restore_metadata:
                corpus.metadata = orig_metadata
            mw.close()
        if index:
            return offsets

    def __del__(self):
        """
        Automatic destructor which closes the underlying file.

        There must be no circular references contained in the object for __del__
        to work! Closing the file explicitly via the close() method is preferred
        and safer.
        """
        self.close()  # does nothing if called twice (on an already closed file), so no worries

    def close(self):
        """Close the output file, if it was opened."""
        logger.debug("closing %s", self.fname)
        if hasattr(self, 'fout'):
            self.fout.close()
#endclass MmWriter
class MmReader(object):
    """
    Wrap a term-document matrix on disk (in matrix-market format), and present it
    as an object which supports iteration over the rows (~documents).

    Note that the file is read into memory one document at a time, not the whole
    matrix at once (unlike scipy.io.mmread). This allows us to process corpora
    which are larger than the available RAM.
    """

    def __init__(self, input, transposed=True):
        """
        Initialize the matrix reader.

        The `input` refers to a file on local filesystem, which is expected to
        be in the sparse (coordinate) Matrix Market format. Documents are assumed
        to be rows of the matrix (and document features are columns).

        `input` is either a string (file path) or a file-like object that supports
        `seek()` (e.g. gzip.GzipFile, bz2.BZ2File).

        Raises ValueError if the first line is not the expected Matrix Market header.
        """
        logger.info("initializing corpus reader from %s", input)
        self.input, self.transposed = input, transposed
        with utils.file_or_filename(self.input) as lines:
            try:
                header = utils.to_unicode(next(lines)).strip()
                if not header.lower().startswith('%%matrixmarket matrix coordinate real general'):
                    raise ValueError("File %s not in Matrix Market format with coordinate real general; instead found: \n%s" %
                                     (self.input, header))
            except StopIteration:
                pass  # empty file => corpus with no documents

            self.num_docs = self.num_terms = self.num_nnz = 0
            for line in lines:
                line = utils.to_unicode(line)
                if not line.startswith('%'):
                    # the first non-comment line holds the matrix statistics
                    self.num_docs, self.num_terms, self.num_nnz = map(int, line.split())
                    if not self.transposed:
                        self.num_docs, self.num_terms = self.num_terms, self.num_docs
                    break

        logger.info("accepted corpus with %i documents, %i features, %i non-zero entries",
                    self.num_docs, self.num_terms, self.num_nnz)

    def __len__(self):
        """Return the number of documents, as read from the file header."""
        return self.num_docs

    def __str__(self):
        return ("MmCorpus(%i documents, %i features, %i non-zero entries)" %
                (self.num_docs, self.num_terms, self.num_nnz))

    def skip_headers(self, input_file):
        """
        Skip file headers that appear before the first document.

        Consumes the leading `%` comment lines of `input_file` plus the first
        line after them.
        """
        for line in input_file:
            if line.startswith(b'%'):
                continue
            break

    def __iter__(self):
        """
        Iteratively yield vectors from the underlying file, in the format (row_no, vector),
        where vector is a list of (col_no, value) 2-tuples.

        Note that the total number of vectors returned is always equal to the
        number of rows specified in the header; empty documents are inserted and
        yielded where appropriate, even if they are not explicitly stored in the
        Matrix Market file.
        """
        with utils.file_or_filename(self.input) as lines:
            self.skip_headers(lines)

            previd = -1
            for line in lines:
                docid, termid, val = utils.to_unicode(line).split()  # needed for python3
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    # change of document: return the document read so far (its id is previd)
                    if previd >= 0:
                        yield previd, document

                    # return implicit (empty) documents between previous id and new id
                    # too, to keep consistent document numbering and corpus length
                    for previd in xrange(previd + 1, docid):
                        yield previd, []

                    # from now on start adding fields to a new document, with a new id
                    previd = docid
                    document = []

                document.append((termid, val,))  # add another field to the current document

        # handle the last document, as a special case
        if previd >= 0:
            yield previd, document

        # return empty documents between the last explicit document and the number
        # of documents as specified in the header
        for previd in xrange(previd + 1, self.num_docs):
            yield previd, []

    def docbyoffset(self, offset):
        """Return document at file offset `offset` (in bytes)"""
        # empty documents are not stored explicitly in MM format, so the index marks
        # them with a special offset, -1.
        if offset == -1:
            return []
        if isinstance(self.input, string_types):
            # we open the file ourselves => we are responsible for closing it again;
            # binary mode, because `offset` is a position in bytes
            fin, close_fin = open(self.input, 'rb'), True
        else:
            fin, close_fin = self.input, False

        try:
            fin.seek(offset)  # works for gzip/bz2 input, too
            previd, document = -1, []
            for line in fin:
                docid, termid, val = utils.to_unicode(line).split()  # same parsing as in __iter__
                if not self.transposed:
                    termid, docid = docid, termid
                # -1 because matrix market indexes are 1-based => convert to 0-based
                docid, termid, val = int(docid) - 1, int(termid) - 1, float(val)
                assert previd <= docid, "matrix columns must come in ascending order"
                if docid != previd:
                    if previd >= 0:
                        break  # reached the next document => the requested one is complete
                    previd = docid

                document.append((termid, val,))  # add another field to the current document
        finally:
            if close_fin:
                fin.close()
        return document
#endclass MmReader