In [5]:
import csv
import pandas as pd
import numpy as np
import nltk
import ast
import re
import math
from unicodedata import normalize
from collections import Counter

In [2]:
# Ground-truth rankings, one row per search string.
gabarito = pd.read_csv('../gabarito/gabarito.csv')
# News articles; missing cells become empty strings so the text columns can be concatenated.
leitura = pd.read_csv('../data/estadao_noticias_eleicao.csv').replace(np.nan, '', regex=True)

In [6]:
def convert_str(lista):
    """Parse the textual form of a Python literal back into the object it denotes.

    Parameters
    ----------
    lista : str
        Source text of a literal, e.g. ``"[1, 2, 3]"``.

    Returns
    -------
    object
        The value produced by ``ast.literal_eval`` (no arbitrary code is executed).
    """
    parsed = ast.literal_eval(lista)
    return parsed

In [7]:
# Every ranking column holds the text of a Python list; parse each one back into a list.
for coluna in ('google', 'busca_binaria', 'tf', 'tfidf', 'bm25'):
    gabarito[coluna] = gabarito[coluna].apply(convert_str)

## Método de limpar frase
Método responsável por limpar o texto e tornar a busca mais eficiente.

In [8]:
def text_clear(text):
    """Strip accents from ``text`` and blank out everything that is not alphanumeric.

    The text is decomposed with NFKD and re-encoded as ASCII, which drops the
    combining accent marks (and any other non-ASCII character). Every remaining
    character other than a letter, a digit or a space is then replaced by a
    single space, so the result has the same length as the ASCII-folded text.

    Parameters
    ----------
    text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text; letter case is left untouched.
    """
    ascii_text = normalize('NFKD', text).encode('ASCII', 'ignore').decode('ASCII')
    return re.sub('[^a-zA-Z0-9 ]', ' ', ascii_text)

In [9]:
# Article identifiers, aligned row by row with the texts below.
ids = leitura['idNoticia']
# One searchable text per article: title + subtitle + body, accent-stripped and lower-cased.
# Float cells (NaN) are mapped to an empty text instead of being cleaned.
conteudos = (leitura['titulo'] + " " + leitura['subTitulo'] + " " + leitura['conteudo']).apply(
    lambda texto: text_clear(texto).lower() if not isinstance(texto, float) else ""
)

## Tokenização do conteúdo.


In [10]:
# Token list of every article, in the same row order as `conteudos`.
noticias = conteudos.apply(lambda texto: nltk.word_tokenize(texto))
# Per-article term frequencies: token -> number of occurrences in that article.
freq_term = noticias.apply(lambda tokens: Counter(tokens))


## Indexação dos termos

In [11]:
# Inverted index: term -> {article id -> occurrences of the term in that article}.
#
# `ids` and `freq_term` are walked positionally with zip, so the pairing does not
# depend on the Series carrying a default 0..n-1 index. Each article's Counter is
# visited once per distinct term instead of once per token occurrence; Counter keeps
# first-occurrence order, so the insertion order of the index is unchanged.
index = {}
for id_noticia, contagem in zip(ids, freq_term):
    for palavra, frequencia in contagem.items():
        # setdefault keeps the first count recorded for a (term, article) pair.
        index.setdefault(palavra.lower(), {}).setdefault(id_noticia, frequencia)

## Método que gera um dicionário com os pesos dos index-terms
Método auxiliar que gera um novo dicionário com o index e os pesos associados. Útil para análise binária ou TF.

In [39]:
def generador_docs_peso(frase, gerador_peso):
    """Build one weight vector per document that contains at least one query term.

    The query is split on single spaces; position ``i`` of every vector belongs to
    the ``i``-th token of that split. For a document containing the token, the
    position holds ``gerador_peso(tf)``, where ``tf`` is the count stored in the
    module-level ``index``; all other positions are 0.

    Tokens missing from ``index`` (unknown words, or the empty tokens produced by
    consecutive spaces) match no document instead of raising ``KeyError``.

    Parameters
    ----------
    frase : str
        Query text, already normalised the same way as the indexed text.
    gerador_peso : callable
        Maps a term frequency to the weight stored in the vector.

    Returns
    -------
    dict
        ``{doc_id: numpy.ndarray}``, in order of first appearance of each document.
    """
    termos = frase.split(" ")
    pesos_por_doc = {}

    for posicao, termo in enumerate(termos):
        for doc_id, tf in index.get(termo, {}).items():
            # Fill the vector in place rather than rebuilding it on every update.
            pesos_por_doc.setdefault(doc_id, [0] * len(termos))[posicao] = gerador_peso(tf)

    return {doc_id: np.array(pesos) for doc_id, pesos in pesos_por_doc.items()}

## Método que gera um vetor com tf dos index.

In [60]:
def generator_tf(phase):
    """Build one raw term-frequency vector per document matching the query.

    The query is split on single spaces; position ``i`` of every vector holds the
    count of the ``i``-th token in that document (taken from the module-level
    ``index``), or 0 when the document does not contain it.

    Tokens missing from ``index`` (unknown words, or the empty tokens produced by
    consecutive spaces) match no document instead of raising ``KeyError``.

    Parameters
    ----------
    phase : str
        Query text, already normalised the same way as the indexed text.

    Returns
    -------
    dict
        ``{doc_id: numpy.ndarray}``, in order of first appearance of each document.
    """
    terms = phase.split(' ')
    counts_by_doc = {}

    for position, term in enumerate(terms):
        for doc_id, tf in index.get(term, {}).items():
            # Fill the vector in place rather than rebuilding it on every update.
            counts_by_doc.setdefault(doc_id, [0] * len(terms))[position] = tf

    return {doc_id: np.array(counts) for doc_id, counts in counts_by_doc.items()}
    

## Método que gera um vetor com valor binario dos index.

In [41]:
def generator_binario(frase):
    """Build one presence vector per document matching the query.

    Delegates to ``generador_docs_peso`` with a weight function that ignores the
    term frequency and always yields 1.

    Parameters
    ----------
    frase : str
        Query text.

    Returns
    -------
    Whatever ``generador_docs_peso`` returns for the constant weight 1.
    """
    return generador_docs_peso(frase, lambda _tf: 1)

## Método que gera um vetor com idf dos index.

In [13]:
def generator_idf(phase):
    """Compute the inverse document frequency of every query token.

    For a token found in ``index`` the weight is ``log((N + 1) / df)``, where ``N``
    is ``len(noticias)`` and ``df`` is the number of documents listed for the token.
    A token with no documents (unknown word, or an empty token produced by
    consecutive spaces) gets weight 0.0 instead of raising ``KeyError``.

    Parameters
    ----------
    phase : str
        Query text, split on single spaces.

    Returns
    -------
    numpy.ndarray
        One weight per token, in query order.
    """
    total_docs = len(noticias) + 1
    weights = []
    for term in phase.split(' '):
        doc_freq = len(index.get(term, ()))
        weights.append(math.log(total_docs / doc_freq) if doc_freq else 0.0)
    return np.array(weights)

## Método que gera um vetor binário de consulta. 
Considerando os 0 ou 1

In [14]:
def generator_query(phase):
    """Build the binary query vector for ``phase``.

    Parameters
    ----------
    phase : str
        Query text, split on single spaces.

    Returns
    -------
    numpy.ndarray
        One entry per token: 1 when ``index`` has a non-empty entry for the token,
        0 otherwise.
    """
    presence = []
    for term in phase.split(' '):
        presence.append(1 if index.get(term) else 0)
    return np.array(presence)

## Método que gera um vetor com o bm25 dos termos

In [62]:
def generato_bm25(phase, k=5):
    """Build one saturated term-frequency vector per document matching the query.

    Each raw count ``tf`` returned by ``generator_tf`` is mapped to
    ``((k + 1) * tf) / (tf + k)``. Only this term-frequency component is computed
    here: no IDF factor and no document-length normalisation are applied.

    Parameters
    ----------
    phase : str
        Query text.
    k : number, optional
        Saturation constant; larger values let repeated occurrences keep adding
        weight for longer. Defaults to 5, the value previously hard-coded.

    Returns
    -------
    dict
        ``{doc_id: numpy.ndarray of floats}``, in the order given by ``generator_tf``.
    """
    docs_tf = generator_tf(phase)
    bm25_vetor = {}
    for doc_id, tf_vetor in docs_tf.items():
        tf_vetor = np.asarray(tf_vetor)
        # Element-wise on the whole vector; same values as mapping count by count.
        bm25_vetor[doc_id] = ((k + 1) * tf_vetor) / (tf_vetor + k)
    return bm25_vetor

## Busca pelo index binario dos termos. 

In [37]:
def seach_bin(phase, top_k=5):
    """Rank documents by how many query positions they match.

    Every document gets a presence vector from ``generator_binario``; its score is
    the dot product with the binary query vector from ``generator_query``.

    Parameters
    ----------
    phase : str
        Query text.
    top_k : int, optional
        Number of document ids to return. Defaults to 5, the cut-off previously
        hard-coded.

    Returns
    -------
    list
        Up to ``top_k`` document ids, best score first. Ties keep the order in
        which the documents were produced (the sort is stable).
    """
    doc_vectors = generator_binario(phase)
    query = generator_query(phase)
    ranked = sorted(doc_vectors, key=lambda doc_id: np.dot(doc_vectors[doc_id], query), reverse=True)
    return ranked[:top_k]

## Busca pelo tf dos termos. 

In [36]:
def seach_tf(phase, top_k=5):
    """Rank documents by the summed raw frequency of the query terms.

    Every document gets a term-frequency vector from ``generator_tf``; its score is
    the dot product with the binary query vector from ``generator_query``.

    Parameters
    ----------
    phase : str
        Query text.
    top_k : int, optional
        Number of document ids to return. Defaults to 5, the cut-off previously
        hard-coded.

    Returns
    -------
    list
        Up to ``top_k`` document ids, best score first. Ties keep the order in
        which the documents were produced (the sort is stable).
    """
    doc_vectors = generator_tf(phase)
    query = generator_query(phase)
    ranked = sorted(doc_vectors, key=lambda doc_id: np.dot(doc_vectors[doc_id], query), reverse=True)
    return ranked[:top_k]

## Busca pelo tf e idf dos termos. 

In [51]:
def seach_tf_idf(phase, top_k=5):
    """Rank documents by the TF-IDF score of the query terms.

    Every document gets a term-frequency vector from ``generator_tf``; its score is
    the dot product with the IDF vector from ``generator_idf``.

    Parameters
    ----------
    phase : str
        Query text.
    top_k : int, optional
        Number of document ids to return. Defaults to 5, the cut-off previously
        hard-coded.

    Returns
    -------
    list
        Up to ``top_k`` document ids, best score first. Ties keep the order in
        which the documents were produced (the sort is stable).
    """
    doc_vectors = generator_tf(phase)
    idf = generator_idf(phase)
    ranked = sorted(doc_vectors, key=lambda doc_id: np.dot(doc_vectors[doc_id], idf), reverse=True)
    return ranked[:top_k]


## Busca pelo BM25

In [58]:
def seach_bm25(phase, top_k=5):
    """Rank documents by the summed saturated term frequency of the query terms.

    Every document gets a weight vector from ``generato_bm25``; its score is the
    dot product with the binary query vector from ``generator_query``.

    Parameters
    ----------
    phase : str
        Query text.
    top_k : int, optional
        Number of document ids to return. Defaults to 5, the cut-off previously
        hard-coded.

    Returns
    -------
    list
        Up to ``top_k`` document ids, best score first. Ties keep the order in
        which the documents were produced (the sort is stable).
    """
    doc_vectors = generato_bm25(phase)
    query = generator_query(phase)
    ranked = sorted(doc_vectors, key=lambda doc_id: np.dot(doc_vectors[doc_id], query), reverse=True)
    return ranked[:top_k]

In [18]:
def apk(actual, predicted, k=10):
    """
    Average precision at k for a single query.

    Walks the first ``k`` predictions in order. Every prediction that is relevant
    and has not appeared earlier in the list counts as a hit and contributes the
    precision measured at its rank. The sum is divided by ``min(len(actual), k)``.

    Parameters
    ----------
    actual : list
             Relevant elements (order is irrelevant)
    predicted : list
                Predicted elements, best first (order matters)
    k : int, optional
        Maximum number of predictions taken into account

    Returns
    -------
    score : double
            Average precision at k; 0.0 when ``actual`` is empty
    """
    top_predictions = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_predictions, start=1):
        # Skip irrelevant items and repeats of an item already seen earlier.
        if item not in actual or item in top_predictions[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / float(rank)

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k over several queries.

    ``actual`` and ``predicted`` are paired element by element; ``apk`` is computed
    for every pair and the scores are averaged with ``numpy.mean``.

    Parameters
    ----------
    actual : list
             One list of relevant elements per query
             (order inside each list is irrelevant)
    predicted : list
                One list of predicted elements per query
                (order inside each list matters)
    k : int, optional
        Maximum number of predictions taken into account per query

    Returns
    -------
    score : double
            Mean of the per-query average precision at k
    """
    scores = []
    for relevant, ranked in zip(actual, predicted):
        scores.append(apk(relevant, ranked, k))
    return np.mean(scores)


## Testes de análise de precisão

## Teste pela busca com o metodo TF

In [32]:
# Queries are normalised like the indexed text (accents stripped, lower-cased) and
# their whitespace is collapsed, so every token can match a key of the index.
busca_tf = [seach_tf(" ".join(text_clear(frase).lower().split())) for frase in gabarito.str_busca]

# MAP@5 against the reference TF ranking and against the Google ranking.
print("Local: %.4f" %(mapk(gabarito.tf, busca_tf, k=5)))
print("Google:%.4f" %(mapk(gabarito.google, busca_tf, k=5)))

Local: 0.6520
Google:0.0480


## Teste pela busca com o metodo binario.

In [45]:
# Queries are normalised like the indexed text (accents stripped, lower-cased) and
# their whitespace is collapsed, so every token can match a key of the index.
busca_bi = [seach_bin(" ".join(text_clear(frase).lower().split())) for frase in gabarito.str_busca]
# MAP@5 against the reference binary ranking and against the Google ranking.
print("Local: %.4f" %(mapk(gabarito.busca_binaria, busca_bi, k=5)))
print("Google:%.4f" %(mapk(gabarito.google, busca_bi, k=5)))

Local: 0.2400
Google:0.0400


## Teste pela busca com o metodo TF-IDF

In [67]:
# Queries are normalised like the indexed text (accents stripped, lower-cased) and
# their whitespace is collapsed, so every token can match a key of the index.
busca_tf_idf = [seach_tf_idf(" ".join(text_clear(frase).lower().split())) for frase in gabarito.str_busca]
# MAP@5 against the reference TF-IDF ranking and against the Google ranking.
print("Local: %.4f" %(mapk(gabarito.tfidf, busca_tf_idf, k=5)))
print("Google:%.4f" %(mapk(gabarito.google, busca_tf_idf, k=5)))

Local: 0.6160
Google:0.0580


## Teste pela busca com o metodo BM25

In [65]:
# Queries are normalised like the indexed text (accents stripped, lower-cased) and
# their whitespace is collapsed, so every token can match a key of the index.
busca_bm25 = [seach_bm25(" ".join(text_clear(frase).lower().split())) for frase in gabarito.str_busca]
# MAP@5 against the reference BM25 ranking and against the Google ranking.
print("Local: %.4f" %(mapk(gabarito.bm25, busca_bm25 , k=5)))
print("Google:%.4f" %(mapk(gabarito.google, busca_bm25 , k=5)))

Local: 0.6787
Google:0.1180
