## Trabalho Final

## Latent Semantic Analysis

** Importando libs **

In [78]:
from pyspark.mllib.feature import HashingTF, IDF
from pyspark.mllib.common import callMLlibFunc, JavaModelWrapper
from pyspark.mllib.linalg.distributed import RowMatrix, DistributedMatrix
from pyspark.mllib.linalg import Vectors, DenseMatrix
from nltk.stem import SnowballStemmer
import numpy as np
import math

** Peso de um termo dado um documento **

In [2]:
def termDocWeight(termFrequencyInDoc, totalTermsInDoc, termFreqInCorpus, totalDocs):
    tf = termFrequencyInDoc / totalTermsInDoc
    docFreq = totalDocs/termFreqInCorpus
    idf = math.log(docFreq)
    return tf * idf

** Entrando arquivo **

In [77]:
#este arquivo é pré-processado
documents = sc.textFile("data_works.txt").map(lambda line: line.split(" "))
stemmer = SnowballStemmer('english')

** Criando dicionário de termos por documento **

In [4]:
#função q retorna um dicionário de termos/frequencia de um documento
def caclDocTermFreq(doc):
    terms = dict()
    for term in doc:
        if term in terms:
            terms[term] += 1
        else:
            terms[term] = 1
    return terms
         
#para cada documento, seu dicionário
docTermFreqs = documents.map(caclDocTermFreq)

#como será usado ao menos mais duas vezes, manteremos em memória
docTermFreqs.cache()

#
docFreqs = docTermFreqs.flatMap(lambda _: _.keys()).map(lambda _: (_, 1)).reduceByKey(lambda x1, x2: x1 + x2)

** Calculando a inversa das frequencias do documento **

In [5]:
#numero de documentos
numDocs = docTermFreqs.count()

def inverseDocFreq((term, count)):
    return (term, math.log(numDocs / count))

#idfs é um dicionário termo/inversa
idfs = docFreqs.map(inverseDocFreq).collectAsMap()

#tdicionário id/termo e o reverso
termIds = dict(enumerate(idfs.keys()))
idTerms = dict(map(reversed, termIds.iteritems()))

docIds = dict(enumerate(['doc'+str(x) for x in range(0,numDocs)]))
idDocs = dict(map(reversed, docIds.iteritems()))

#Because the term ID map is fairly large and we’ll use it in a few different places, let’s broadcast it along with the IDFs:
bIdfs = sc.broadcast(idfs).value
bIdTerms = sc.broadcast(idTerms).value
bIdDocs = sc.broadcast(idDocs).value

** TF-IDF para cada documento **

In [6]:
def generateVectors(termFreqs):
    docTotalTerms = sum(termFreqs.values())
    
    def calcScores(TF):
        filterTF = dict((k,v) for k, v in TF.iteritems() if bIdTerms.has_key(k))
        return dict((bIdTerms[k], bIdfs[k] * TF[k] / docTotalTerms) for k, v in filterTF.iteritems())
           
    return Vectors.sparse(len(bIdTerms), calcScores(termFreqs)) 
    
vecs = docTermFreqs.map(generateVectors)
vecs.cache()

PythonRDD[9] at RDD at PythonRDD.scala:43

** SVD **

In [7]:
class SVD(JavaModelWrapper):
    """Wrapper around the SVD scala case class"""
    @property
    def U(self):
        """ Returns a RowMatrix whose columns are the left singular vectors of the SVD if computeU was set to be True."""
        u = self.call("U")
        if u is not None:
            return RowMatrix(u)

    @property
    def s(self):
        """Returns a DenseVector with singular values in descending order."""
        return self.call("s")

    @property
    def V(self):
        """ Returns a DenseMatrix whose columns are the right singular vectors of the SVD."""
        return self.call("V")

def computeSVD(row_matrix, k, computeU=True, rCond=1e-9):
    """
    Computes the singular value decomposition of the RowMatrix.
    The given row matrix A of dimension (m X n) is decomposed into U * s * V'T where
    * s: DenseVector consisting of square root of the eigenvalues (singular values) in descending order.
    * U: (m X k) (left singular vectors) is a RowMatrix whose columns are the eigenvectors of (A X A')
    * v: (n X k) (right singular vectors) is a Matrix whose columns are the eigenvectors of (A' X A)
    :param k: number of singular values to keep. We might return less than k if there are numerically zero singular values.
    :param computeU: Whether of not to compute U. If set to be True, then U is computed by A * V * sigma^-1
    :param rCond: the reciprocal condition number. All singular values smaller than rCond * sigma(0) are treated as zero, where sigma(0) is the largest singular value.
    :returns: SVD object
    """
    
    java_model = row_matrix._java_matrix_wrapper.call("computeSVD", int(k), computeU, float(rCond))
    return SVD(java_model)

In [8]:
mat = RowMatrix (vecs) 
svd = computeSVD(mat, 50, True)


**Encontrando conceitos importantes**

In [9]:
def topTermsInTopConcepts(svd, numConcepts, numTerms, termIds):
    v = svd.V
    arr = v.toArray().flatten()
    topTerms = []

    def arrayWithIndex(arr):
        tupleList = [ (index,item) for (index,item) in enumerate( arr ) ]
        return sorted(tupleList, key=lambda tup: tup[1])[::-1]

    def mapList(l):
        return [(termIds[x], y) for (x,y) in l]

    for i in range(0, numConcepts):
        offs = i * v.numRows
        termWeights = arrayWithIndex(arr[offs : offs + v.numRows])
        topTerms.append(mapList(termWeights[:numTerms]))
        
    return topTerms

topTerms = topTermsInTopConcepts(svd, 50, 400, termIds)

**Top Documents in Top Concepts**

In [10]:
def topDocsInTopConcepts(svd, numConcepts, numDocs, docIds):
    u = svd.U
    topDocs = []

    def mapListDoc(l):
        return [(docIds[y], x) for (x,y) in l]

    for i in range(0, numConcepts):
        docWeights = u.rows.map(lambda x: x.toArray()[i]).zipWithUniqueId()
        topDocs.append(mapListDoc(docWeights.top(numDocs)))
    
    return topDocs
        
topDocs = topDocsInTopConcepts(svd, 50, 50, docIds)

**Consultas**

In [234]:
def multiplyByDiagonalMatrix(mat, diag):
    sArr = diag.toArray()
    mult = np.empty([mat.numRows, mat.numCols])
    for i in range(0, mat.numRows):
        for j in range(0, mat.numCols):
            mult[i,j] = mat[i,j]*diag[j]
            
    return DenseMatrix(mat.numRows, mat.numCols, mult.flatten())

In [235]:
def multiplyByDiagonalRowMatrix(mat, diag):
    sArr = diag.toArray()
    matAux = mat.rows.collect()
    mult = np.empty([mat.numRows(), mat.numCols()])
    for i in range(0, mat.numRows()):
        for j in range(0, mat.numCols()):
                mult[i,j] = matAux[i][j]*diag[j]
            
    return RowMatrix(sc.parallelize(mult))

In [236]:
def rowsNormalized(mat):
    def calcLenght(row):
        sumCols = 0
        for col in range(0, mat.numCols):
            sumCols += mat[row,col] * mat[row,col]
        return math.sqrt(sumCols)
    
    nomrMat = np.empty([mat.numRows, mat.numCols])
    for row in range(0, mat.numRows):
        lenght = calcLenght(row)
        for col in range(0, mat.numCols):
            nomrMat[row, col] = mat[row, col] / lenght
    return nomrMat      

In [237]:
def distributedRowsNormalized(mat):
    matAux = mat.rows.collect()
    def calcLenght(row):
        sumCols = 0
        for col in range(0, mat.numCols()):
            sumCols += matAux[row][col] * matAux[row][col]
        return math.sqrt(sumCols)
    
    
    nomrMat = np.empty([mat.numRows(), mat.numCols()])
    for row in range(0, mat.numRows()):
        lenght = calcLenght(row)
        for col in range(0, mat.numCols()):
            nomrMat[row, col] = matAux[row][col] / lenght
    return nomrMat  

In [238]:
def topTermsForTerm(normalizedVS, termId):
    rowVec = normalizedVS[termId,:]
    termScores = [(i,score) for i,score in zip(range(len(rowVec)),normalizedVS.dot(rowVec))]
    return sorted(termScores,key = lambda tup: tup[1], reverse=True)[:50]

In [239]:
def topDocsForDoc(normalizedUS, docId):
    docRowArr = normalizedUS[docId,:]
    docScores = normalizedUS.dot(docRowArr)
    allDocsWeights = [(i,score) for i,score in zip(range(len(docScores)),docScores)]
    return sorted(allDocsWeights, key=lambda tup: tup[1], reverse = True)[:50]

In [240]:
def topDocsForTerm(termId):
    rowArr = svd.V.toArray()[termId]
    US = multiplyByDiagonalRowMatrix(svd.U,svd.s)
    normalizedUS = distributedRowsNormalized(US)
    docScores = normalizedUS.dot(rowArr)
    allDocsWeights = [(i,score) for i,score in zip(range(len(docScores)),docScores)]
    return sorted(allDocsWeights, key=lambda tup: tup[1], reverse = True)[:50]

In [290]:
def topDocsForTermQuery(query):
    termRowVec = svd.V.toArray().T.dot(query.toArray())
    US = multiplyByDiagonalRowMatrix(svd.U, svd.s)
    docScores = np.array(US.rows.collect()).dot(termRowVec)
    allDocWeights = sorted([(i, docScores[i]) for i in range(docScores.shape[0])], key=lambda tup: tup[1], reverse=True)
    return allDocWeights[:50]

In [291]:
def termsToQueryVector(terms):
    indices = [idTerms[stemmer.stem(t)] for t in terms if stemmer.stem(t) in idTerms]
    values = [idfs[stemmer.stem(t)] for t in terms if stemmer.stem(t) in idTerms] 
    return Vectors.sparse(len(idTerms),zip(indices,values))

In [292]:
def printRelevantTermsForTerm(term):
    stemmed = stemmer.stem(term)
    if stemmed not in idTerms:
        print ("Termo desconhecido")
    else:
        termId = idTerms[stemmed]
        VS = multiplyByDiagonalMatrix(svd.V,svd.s)
        normalizedVS = rowsNormalized(VS)
        topTerms = topTermsForTerm(normalizedVS,termId)

        for t in topTerms:
            print(termIds[t[0]],' relevancia: ',t[1])

In [293]:
def printRelevantDocsForDoc(doc):
    US = multiplyByDiagonalRowMatrix(svd.U, svd.s)
    normalizedUS = distributedRowsNormalized(US)
    
    topDocs = topDocsForDoc(normalizedUS, doc)
    for d in topDocs:
        print("Documento: " + str(d[0]) + " relevancia: " + str(d[1]))

In [294]:
def printRelevantDocsforTerm(term):
    stemmed = stemmer.stem(term)
    if stemmed not in idTerms:
        print ("Termo desconhecido")
    else:
        termId = idTerms[stemmed]
        topDocs = topDocsForTerm(termId)
        for d in topDocs:
            print('Documento: ' + str(d[0]) + '  relevancia: '+ str(d[1]))

In [295]:
def printRelevantDocsForTermQuery(terms):
    queryVec = termsToQueryVector(terms)
    print("Query: " + str(terms))
    for doc, relev in topDocsForTermQuery(queryVec):
        print("Documento {}, possui relevancia {}".format(doc, relev))

In [297]:

printRelevantDocsForTermQuery(['hulk', 'incred'])

Query: ['hulk', 'incred']
Documento 10, possui relevancia 0.296274418708
Documento 175, possui relevancia 0.21748956848
Documento 27, possui relevancia 0.183700128148
Documento 246, possui relevancia 0.157522349283
Documento 343, possui relevancia 0.155993612002
Documento 35, possui relevancia 0.149376805463
Documento 373, possui relevancia 0.141669907733
Documento 374, possui relevancia 0.12518879738
Documento 370, possui relevancia 0.119144769545
Documento 366, possui relevancia 0.0948684925432
Documento 152, possui relevancia 0.0679804141064
Documento 51, possui relevancia 0.0661146122938
Documento 365, possui relevancia 0.0641240306588
Documento 283, possui relevancia 0.0579771753193
Documento 210, possui relevancia 0.0562385801076
Documento 163, possui relevancia 0.051166573553
Documento 379, possui relevancia 0.0452439093538
Documento 123, possui relevancia 0.0435555481412
Documento 377, possui relevancia 0.0417546404629
Documento 0, possui relevancia 0.0410603663446
Documento 21