In [1]:
import numpy as np

In [2]:
import pandas as pd

In [3]:
from math import log10

In [4]:
from nltk import wordpunct_tokenize

In [5]:
news = pd.read_csv('estadao_noticias_eleicao.csv', sep=',', index_col='idNoticia')

In [6]:
news = news.sort_index(ascending=True)

In [7]:
news = news.replace(np.nan, '', regex=True)

# Converte o texto de cada celula para minusculo, e divide as frases em listas e as palavras em sublistas.

In [8]:
for i in range(1, len(news) + 1):
    for j in range(len(news.ix[i])):        
        news.ix[i][j] = str(news.ix[i][j]).lower() #força que a celula seja uma string, para uso da função lower()
        news.ix[i][j] = wordpunct_tokenize(news.ix[i][j])

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  


# Cria um índice invertido das palavras dos documentos, identificando em quantos e quais documentos a palavra aparece, e quantas vezes por documento:

# reference_list[word] = [doc_frequency,[[doc,word_frequency]]]

In [None]:
reference_list = {}

for i in range(1, len(news) + 1):
    for j in range(len(news.ix[i])):
        for k in news.ix[i][j]:
            if(reference_list.setdefault(k, None) == None):
                reference_list[k] = [1,[[i,1]]]
            elif(reference_list[k][1][-1][0] == i):
                reference_list[k][1][-1][1] += 1
            else:
                reference_list[k][0] += 1
                reference_list[k][1].append([i,1])

# FUNÇÕS DE BUSCA

In [None]:
def smaller_terms(words):
    smaller = 0
    
    for i in range(1, len(words)):
            if (reference_list[words[smaller]][0] > reference_list[words[i]][0]): 
                    smaller = i # posição da palavra com menos documentos
                    
    words[0], words[smaller] = words[smaller], words[0]
    #coloca o termo presente em menos documentos na posição inicial da lista
                    
    return None # função com efeito colateral

In [None]:
def docs_containing_word(word):
    docs = list()
    
    for i in range(len(reference_list[word][1])):
        docs.append(reference_list[word][1][i][0])
        
    return docs

In [None]:
def search_and_n_terms_bit_vector(words):
    if (len(words) > 2):
        smaller = smaller_terms(words)
            
    result = docs_containing_word(words[0])
    
    for i in range(1, len(words)):
                result = np.intersect1d(result, docs_containing_word(words[i]))
                # result é o resultado de sucessivas buscas OR das palavras
                
    return list(result)

In [None]:
def docs_containing_word_frequency(word):
    docs = list()
    
    for i in range(len(reference_list[word][1])):
        docs.append(reference_list[word][1][i])
        
    return docs

In [None]:
def docs_frequency_decreasing(docs_and_weights):
    frequency_decreasing = sorted(list(docs_and_weights.values()), key=int, reverse=True)
    docs_frequency_decreasing = list()
    
    for i in frequency_decreasing:
        doc = list(docs_and_weights.values()).index(i)
        docs_frequency_decreasing.append(list(docs_and_weights.keys())[doc])
        
        docs_and_weights[list(docs_and_weights.keys())[doc]] = None
        
    return docs_frequency_decreasing

In [None]:
def search_and_n_terms_tf(words):
    docs = search_and_n_terms_bit_vector(words)
    docs_and_weights = {}
    stop = 0
    
    for i in range(len(words)):
        for j in range(len(reference_list[words[i]][1])):
            if(reference_list[words[i]][1][j][0] in docs):
                docs_and_weights[reference_list[words[i]][1][j][0]] = reference_list[words[i]][1][j][1]
                stop += 1

                if(stop == len(docs)):
                    break
                        
        stop = 0
        
    return docs_frequency_decreasing(docs_and_weights)

In [None]:
def search_and_n_terms_tf_idf(words):
    docs = search_and_n_terms_bit_vector(words)
    docs_and_weights = {}
    stop = 0
    
    for i in range(len(words)):
        for j in range(len(reference_list[words[i]][1])):
            if(reference_list[words[i]][1][j][0] in docs):
                docs_and_weights[reference_list[words[i]][1][j][0]] = reference_list[words[i]][1][j][1]*log10((len(news)+1)/reference_list[words[i]][0])
                stop += 1

                if(stop == len(docs)):
                    break
                        
        stop = 0
        
    return docs_frequency_decreasing(docs_and_weights)

In [None]:
def search_and_n_terms_bm_25(words):
    coefficient_bm_25 = 1.5
    docs = search_and_n_terms_bit_vector(words)
    docs_and_weights = {}
    stop = 0
    
    for i in range(len(words)):
        for j in range(len(reference_list[words[i]][1])):
            if(reference_list[words[i]][1][j][0] in docs):
                docs_and_weights[reference_list[words[i]][1][j][0]] = (((coefficient_bm_25 + 1) * reference_list[words[i]][1][j][1])/(reference_list[words[i]][1][j][1] + coefficient_bm_25))*log10((len(news)+1)/reference_list[words[i]][0])
                stop += 1

                if(stop == len(docs)):
                    break
                        
        stop = 0
        
    return docs_frequency_decreasing(docs_and_weights)

# Função que trata a entrada, e identifica as saidas das funções de busca.

In [None]:
def search(terms):
    words = wordpunct_tokenize(terms.lower())
       
    print ("busca binária: " + str(search_and_n_terms_bit_vector(words)[0:5]))
    print ("tf: " + str(search_and_n_terms_tf(words)[0:5]))
    print ("tf-idf: " + str(search_and_n_terms_tf_idf(words)[0:5]))
    print ("bm25: " + str(search_and_n_terms_bm_25(words)[0:5]))
    
    return None

# TESTES

In [None]:
search("segundo turno")

In [None]:
search("lava jato")

In [None]:
search("projeto de lei")

In [None]:
search("compra de voto")

In [None]:
search("ministério público")