## Taller Nro. 2: Buscador de texto por palabras clave usando VSM y Similitud Coseno utilizando MongoDB para almacenamiento

### Autores: Yenner Robayo, Wilmar Martín

In [1]:
# Autores: Yenner Robayo
#          Wilmar Martín

import json
import math
from collections import Counter
from time import time
from pymongo import MongoClient

### Definición de métodos utilizados en el algoritmo de búsqueda

In [4]:
def readDatabase(db):
    """Return handles to the six collections used by the search code.

    Parameters
    ----------
    db : object
        Database handle. Each collection is read from it as an attribute.

    Returns
    -------
    tuple
        ``(inverdIndex, stopwords, words, allTfidf, documents,
        inv_frec_vector)``, in exactly that order.
    """
    collection_names = (
        'inverdIndex',
        'stopwords',
        'words',
        'allTfidf',
        'documents',
        'inv_frec_vector',
    )
    # getattr(db, name) is the same lookup as writing db.<name> for each one.
    return tuple(getattr(db, name) for name in collection_names)


def findCoincidences(doc, find_term):
    """Count the items produced by iterating ``doc`` that equal ``find_term``.

    Parameters
    ----------
    doc : iterable
        Items to scan. Iterating a mapping yields its keys, so each distinct
        key can match at most once.
    find_term : object
        Value compared against every item with ``==``.

    Returns
    -------
    int
        Number of matching items; 0 when ``doc`` is empty.
    """
    return sum(1 for token in doc if token == find_term)


def cosine_similarity(vectorSpace1, vectorSpace2):
    """Return the cosine of the angle between two numeric vectors.

    Parameters
    ----------
    vectorSpace1, vectorSpace2 : sequence of numbers
        Vectors to compare. Only the first ``len(vectorSpace1)`` components
        of ``vectorSpace2`` are read.

    Returns
    -------
    float
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0.0`` when either vector has
        zero magnitude (including two empty vectors).

    Raises
    ------
    IndexError
        If ``vectorSpace2`` is shorter than ``vectorSpace1``.
    """
    dot_product = 0
    norm_sq_1, norm_sq_2 = 0, 0
    for i in range(len(vectorSpace1)):
        x = vectorSpace1[i]
        y = vectorSpace2[i]
        norm_sq_1 += x * x
        norm_sq_2 += y * y
        dot_product += x * y

    denominator = math.sqrt(norm_sq_1 * norm_sq_2)
    # A zero-magnitude vector has no direction, so the cosine is undefined.
    # Report "no similarity" instead of raising ZeroDivisionError.
    if denominator == 0:
        return 0.0
    return dot_product / denominator


def createHistogram(query):
    """Count the non-stopword terms of a query string.

    The query is lower-cased and split on whitespace. A term is kept only
    when the module-level ``stopwords`` collection has no document whose
    ``'stopword'`` field equals it (one ``find_one`` call per term).

    Parameters
    ----------
    query : str
        Raw query text.

    Returns
    -------
    collections.Counter
        Mapping of each kept term to its number of occurrences in the query.
    """
    tokens = query.lower().split()
    kept_terms = [
        token
        for token in tokens
        if stopwords.find_one({'stopword': token}) is None
    ]
    return Counter(kept_terms)


def createVectorSpace(histogram):
    """Build the query's term-frequency vector over the stored vocabulary.

    One component is produced per document yielded by ``words.find()`` (the
    module-level vocabulary collection), in cursor order. Each component is
    the number of times that document's ``'words'`` value occurs in the query.

    Parameters
    ----------
    histogram : dict or iterable
        Either a term -> count mapping (such as the ``Counter`` returned by
        ``createHistogram``) or a plain iterable of tokens, which is counted
        here.

    Returns
    -------
    list of int
        Term frequencies aligned with the vocabulary order; 0 for vocabulary
        terms that are absent from the query.
    """
    # Iterating a Counter yields each distinct term once, so counting matches
    # while iterating it can only ever give 0 or 1. Read the stored count
    # instead so that repeated query terms keep their real frequency. This
    # also replaces one full histogram scan per vocabulary word with a lookup.
    if isinstance(histogram, dict):
        term_counts = histogram
    else:
        term_counts = Counter(histogram)

    # NOTE(review): component positions presumably have to match the order
    # used when the index was built; find() is called without a sort, so this
    # relies on the collection's natural order -- confirm against the indexer.
    return [term_counts.get(entry.get('words'), 0) for entry in words.find()]


def createTdiDf(vectorSpace):
    """Weight a term-frequency vector by the stored inverse frequencies.

    For every position whose frequency is greater than zero, the document
    with ``_id`` equal to that position is fetched from the module-level
    ``inv_frec_vector`` collection and the frequency is multiplied by its
    ``'value'`` field. Positions with a frequency of zero (or less) are not
    looked up and get the integer weight 0.

    Parameters
    ----------
    vectorSpace : sequence of numbers
        Term frequencies, indexed by vocabulary position.

    Returns
    -------
    list
        Weighted vector of the same length as ``vectorSpace``.
    """
    weighted = []
    for position, term_count in enumerate(vectorSpace):
        if term_count > 0:
            idf_entry = inv_frec_vector.find_one({'_id': position})
            weighted.append(term_count * idf_entry.get('value'))
        else:
            weighted.append(0)
    return weighted


def search(tfidf, query_terms=None):
    """Score every document that shares at least one term with the query.

    Each query term is looked up in the module-level ``inverdIndex``
    collection. Every document key listed under ``'docs'`` is scored once, by
    cosine similarity between ``tfidf`` and the ``'words'`` vector stored for
    that key in the module-level ``allTfidf`` collection.

    Parameters
    ----------
    tfidf : sequence of numbers
        Weighted query vector.
    query_terms : iterable of str, optional
        Terms to look up in the inverted index. When omitted, the
        module-level ``histQuery`` is used, as before; passing the terms
        explicitly removes the dependency on a global that is only defined
        in a later cell.

    Returns
    -------
    dict
        Maps document key -> cosine similarity with the query.
    """
    terms = histQuery if query_terms is None else query_terms

    scores = {}
    for term in terms:
        index_entry = inverdIndex.find_one({'word': term})
        if index_entry is None:
            # Term does not appear in any indexed document.
            continue
        # `or ()` tolerates an index entry that has no 'docs' list.
        for doc_key in index_entry.get('docs') or ():
            if doc_key in scores:
                # Already scored via an earlier query term.
                continue
            doc_vector = allTfidf.find_one({'doc': doc_key})
            if doc_vector is None:
                # The index references a document with no stored vector;
                # skip it instead of failing the whole search.
                continue
            scores[doc_key] = cosine_similarity(tfidf, doc_vector.get('words'))
    return scores

### Estableciendo conexión a la base de datos MongoDB

In [5]:
# Database client; MongoClient() is called with no arguments, so the driver's
# default connection settings apply.
client = MongoClient()
# Item access is equivalent to attribute access (client.text) in pymongo.
db = client["text"]

# Bind every collection of the database to a module-level name; the search
# functions read these globals directly.
(
    inverdIndex,
    stopwords,
    words,
    allTfidf,
    documents,
    inv_frec_vector,
) = readDatabase(db)

### Algoritmo de Búsqueda

In [10]:
# Read the search query from the user.
query = input('Ingrese texto a buscar: ')

# --- Search algorithm ---

# Start the clock after the prompt so typing time is not measured.
t0 = time()

# Term histogram of the query (stopwords removed).
histQuery = createHistogram(query)

# Query vector over the stored vocabulary.
vectorSpace = createVectorSpace(histQuery)

# Weight the query vector with the stored inverse frequencies (TF-IDF).
tfidf = createTdiDf(vectorSpace)

# Score the candidate documents against the query.
docs = search(tfidf)

# Elapsed time of the whole search.
totalTime = time() - t0

# Report a summary of the search.
print()
print("Tiempo total de la busqueda %0.3fs." % totalTime)
print("Total de documentos encontrados: %d" % len(docs))
print("La consulta es: " + query)
print()

# Document keys ordered from highest to lowest cosine similarity.
ranked_keys = sorted(docs, key=docs.get, reverse=True)

# NOTE(review): ranks #0 through #10 are printed, i.e. up to 11 documents --
# confirm whether a top-10 was intended.
i = 0
for i, key in enumerate(ranked_keys[:11], start=1):
    document = documents.find_one({'_id': key})
    # `i` counts the documents shown so far; the printed rank is zero-based.
    print("Documento encontrado: #%s, cs: %f" % (i - 1, docs[key]))
    print('Documento #%s: %s' % (key, document.get('doc')))
    print()

Ingrese texto a buscar: company

Tiempo total de la busqueda 1.709s.
Total de documentos encontrados: 242
La consulta es: company

Documento encontrado: #0, cs: 0.190667
Documento #411: electronic mail <emca> financing attempt fails electronic mail corp of america said its efforts to secure additional financing for expansion have fallen through. the company said that there are no immediate prospects for financing through other sources, though efforts will continue. the company said operations wil continue while further efforts are made. negotiations with an undisclosed company had been onging for four months, the company said. reuter 

Documento encontrado: #1, cs: 0.170725
Documento #751: united merchants <umm> to buy its own stock united merchants and manufactuerers inc said its board has authorized the repurchase of up to one mln shares of the company's common stock. the company now has about 9.1 mln shares outstanding. it said the stock will be acquired from time to time on the ope