## TF-IDF ##

In [77]:
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg.distributed import RowMatrix, DistributedMatrix
from pyspark.mllib.linalg import Vectors
import numpy as np
import math

** Peso de um termo dado um documento **

In [78]:
def termDocWeight(termFrequencyInDoc, totalTermsInDoc, termFreqInCorpus, totalDocs):
    tf = termFrequencyInDoc / totalTermsInDoc
    docFreq = totalDocs/termFreqInCorpus
    idf = math.log(docFreq)
    return tf * idf

** Entrando arquivo **

In [79]:
#este arquivo é pré-processado

documents = sc.textFile("data_works.txt").map(lambda line: line.split(" "))

** Criando dicionário de termos por documento **

In [80]:
#função q retorna um dicionário de termos/frequencia de um documento
def caclDocTermFreq(doc):
    terms = dict()
    for term in doc:
        if term in terms:
            terms[term] += 1
        else:
            terms[term] = 1
    return terms
         
#para cada documento, seu dicionário
docTermFreqs = documents.map(caclDocTermFreq)

#como será usado ao menos mais duas vezes, manteremos em memória
docTermFreqs.cache()

#
docFreqs = docTermFreqs.flatMap(lambda _: _.keys()).map(lambda _: (_, 1)).reduceByKey(lambda x1, x2: x1 + x2)


** Calculando a inversa das frequencias do documento **

In [186]:
#numero de documentos
numDocs = docTermFreqs.count()

def inverseDocFreq((term, count)):
    return (term, math.log(numDocs / count))

#idfs é um dicionário termo/inversa
idfs = docFreqs.map(inverseDocFreq).collectAsMap()

#tdicionário id/termo e o reverso
termIds = dict(enumerate(idfs.keys()))
idTerms = dict(map(reversed, termIds.iteritems()))

docIds = dict(enumerate(['doc'+str(x) for x in range(0,numDocs)]))
idDocs = dict(map(reversed, docIds.iteritems()))

#Because the term ID map is fairly large and we’ll use it in a few different places, let’s broadcast it along with the IDFs:
bIdfs = sc.broadcast(idfs).value
bIdTerms = sc.broadcast(idTerms).value
bIdDocs = sc.broadcast(idDocs).value


{0: 'doc0', 1: 'doc1', 2: 'doc2', 3: 'doc3', 4: 'doc4', 5: 'doc5', 6: 'doc6', 7: 'doc7', 8: 'doc8', 9: 'doc9', 10: 'doc10', 11: 'doc11', 12: 'doc12', 13: 'doc13', 14: 'doc14'}


** TF-IDF para cada documento **

In [82]:
def generateVectors(termFreqs):
    docTotalTerms = sum(termFreqs.values())
    
    def calcScores(TF):
        filterTF = dict((k,v) for k, v in TF.iteritems() if bIdTerms.has_key(k))
        return dict((bIdTerms[k], bIdfs[k] * TF[k] / docTotalTerms) for k, v in filterTF.iteritems())
           
    return Vectors.sparse(len(bIdTerms), calcScores(termFreqs)) 
    
vecs = docTermFreqs.map(generateVectors)
vecs.cache()

PythonRDD[145] at RDD at PythonRDD.scala:43

** SVD **



hashingTF = HashingTF()
tf = hashingTF.transform(documents)

tf.cache()
idf = IDF().fit(tf)
tfidf = idf.transform(tf)

In [83]:
class SVD(JavaModelWrapper):
    """Wrapper around the SVD scala case class"""
    @property
    def U(self):
        """ Returns a RowMatrix whose columns are the left singular vectors of the SVD if computeU was set to be True."""
        u = self.call("U")
        if u is not None:
            return RowMatrix(u)

    @property
    def s(self):
        """Returns a DenseVector with singular values in descending order."""
        return self.call("s")

    @property
    def V(self):
        """ Returns a DenseMatrix whose columns are the right singular vectors of the SVD."""
        return self.call("V")

def computeSVD(row_matrix, k, computeU=True, rCond=1e-9):
    """
    Computes the singular value decomposition of the RowMatrix.
    The given row matrix A of dimension (m X n) is decomposed into U * s * V'T where
    * s: DenseVector consisting of square root of the eigenvalues (singular values) in descending order.
    * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the eigenvectors of (A X A')
    * v: (n X k) (right singular vectors) is a Matrix whose columns are the eigenvectors of (A' X A)
    :param k: number of singular values to keep. We might return less than k if there are numerically zero singular values.
    :param computeU: Whether of not to compute U. If set to be True, then U is computed by A * V * sigma^-1
    :param rCond: the reciprocal condition number. All singular values smaller than rCond * sigma(0) are treated as zero, where sigma(0) is the largest singular value.
    :returns: SVD object
    """
    
    java_model = row_matrix._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
    return SVD(java_model)

In [157]:
mat = RowMatrix (vecs) 
svd = computeSVD(mat, 15, True)


**Encontrando conceitos importantes**

In [151]:
v = svd.V
numConcepts = v.numCols
arr = v.toArray().flatten()

numTerms = 500
topTerms = []

def arrayWithIndex(arr):
    tupleList = [ (index,item) for (index,item) in enumerate( arr ) ]
    return sorted(tupleList, key=lambda tup: tup[1])[::-1]

def mapList(l):
    return [(termIds[x], y) for (x,y) in l]
        
for i in range(0, numConcepts):
    offs = i * v.numRows
    termWeights = arrayWithIndex(arr[offs : offs + v.numRows])
    topTerms.append(mapList(termWeights[:numTerms]))

**Top Documents in Top Concepts**

In [190]:
u = svd.U

topDocs = []

def mapListDoc(l):
    return [(docIds[y], x) for (x,y) in l]

for i in range(0, u.numCols()):
    docWeights = u.rows.map(lambda x: x.toArray()[i]).zipWithUniqueId()
    topDocs.append(mapListDoc(docWeights.top(numDocs)))


In [191]:
print topDocs

[[('doc7', -2.4237389563359696e-07), ('doc14', -3.922928851859077e-07), ('doc5', -6.2919680657957415e-07), ('doc1', -9.4336882817787148e-06), ('doc2', -1.5064183054833892e-05), ('doc8', -3.8958484174650644e-05), ('doc11', -9.5836023893182783e-05), ('doc9', -0.00010840152253518779), ('doc13', -0.000150086275358205), ('doc3', -0.00033900569760214881), ('doc4', -0.00091665002746759013), ('doc6', -0.07075411955019921), ('doc10', -0.10626098917921036), ('doc0', -0.12230649997271191), ('doc12', -0.98424721302221629)], [('doc12', 0.1503316708022252), ('doc7', -3.2732460143766834e-07), ('doc5', -7.3774618333364265e-07), ('doc14', -1.5988818257447671e-06), ('doc4', -3.7943280015833294e-05), ('doc1', -8.7666548584013933e-05), ('doc3', -0.00012187934947024443), ('doc2', -0.0001421907521860537), ('doc9', -0.00021317549435223262), ('doc8', -0.00034428371910892998), ('doc11', -0.0004215654701204005), ('doc13', -0.00095070533391661478), ('doc6', -0.080552027463366371), ('doc10', -0.23836381448721797)