<a href="https://colab.research.google.com/github/valterlucena/recuperacao-informacao/blob/master/vectorial-model/vectorial_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
DATA_URL = 'https://raw.githubusercontent.com/Benardi/ri_lab_01/master/output/results.csv'
news = pd.read_csv(DATA_URL)
news

Unnamed: 0,title,subtitle,author,date,section,text,url
0,“A sociedade foi Rubens Paiva não os facínora...,A decisão da juíza que proíbe as Forças Armada...,F. M.,30/03/2019 00:11:08,Brasil,A juíza federal Ivani Silva da Luz de Brasíli...,https://brasil.elpais.com/brasil/2019/03/26/po...
1,Justiça suspende decisão que proibia Forças Ar...,Liminar havia sido concedida na sexta-feira a ...,Marina Rossi,30/03/2019 16:17:59,Brasil,Menos de 24 horas depois de a juíza federal Iv...,https://brasil.elpais.com/brasil/2019/03/30/po...
2,Governo Bolsonaro prega “negacionismo históric...,Marcos Napolitano professor da USP diz que o...,Regiane Oliveira,04/04/2019 22:37:48,Brasil,Quando determinou que de 31 de março 1964 u...,https://brasil.elpais.com/brasil/2019/04/05/po...
3,Quando os pais de Gabo perceberam que tinham u...,Gustavo Tatis percorre o universo de García Má...,Jesús Ruiz Mantilla,07/03/2019 16:38:56,Cultura,Quando era pequeno Luisa e Gabriel se preo...,https://brasil.elpais.com/brasil/2019/03/06/cu...
4,Rádios canadenses banem músicas de Michael Jac...,Quebec Cogeco Media toma a decisão após queixa...,Jaime Porras Ferreyra,07/03/2019 16:12:37,Cultura,Desde a manhã da última segunda-feira e ...,https://brasil.elpais.com/brasil/2019/03/06/cu...
5,Rosângela uma das crianças sequestradas por f...,“Roubaram minha identidade” diz Rosângela Par...,Joana Oliveira,30/03/2019 22:21:08,Brasil,Rosângela Serra Paraná não sabe quantos anos t...,https://brasil.elpais.com/brasil/2019/03/29/po...
6,“Lógica de usar torturadores da ditadura no cr...,Aloy Jupiara coautor de 'Os porões da contrav...,Felipe Betim,01/04/2019 13:22:17,Brasil,Trajetória similar tiveram outros vários agent...,https://brasil.elpais.com/brasil/2019/03/29/po...
7,Bolsonaro troca embaixada por escritório em Je...,Autoridade Palestina convoca embaixador no Bra...,Juan Carlos Sanz,01/04/2019 12:26:32,Brasil,A devoção dos cristãos evangélicos em seu apoi...,https://brasil.elpais.com/brasil/2019/03/31/po...
8,O lado mais sombrio de Dickens,Escritor e jornalista tentou internar sua mulh...,Rafa de Miguel,10/03/2019 14:29:37,Cultura,O homem mais famoso da era vitoriana o “poeta...,https://brasil.elpais.com/brasil/2019/03/01/cu...
9,O amor entre mulheres que sacudiu o Paraguai,O drama ganhou três prêmios em Berlim e provoc...,Gregorio Belinchón,09/03/2019 12:10:23,Cultura,Depois de inclusive o da crítica internacion...,https://brasil.elpais.com/brasil/2019/03/07/cu...


In [0]:
def isValid(token):
  return not bool(re.search(r'\d', token)) and len(token) > 2

def total_documents():
  return len(news)

def get_tokens(document):
  toker = RegexpTokenizer('''\w+[-']*\w*''')
  stop_words = stopwords.words('portuguese')
  return [token for token in toker.tokenize(document.lower()) if isValid(token) and token not in stop_words]

In [0]:
def build_index(documents):
  index = {}
  n = 0
  for document in documents:
    n += 1
    tokens = get_tokens(document)
    for token in tokens:
      occurrence = tokens.count(token)
      if token not in index:
        index[token] = {}
      if n not in index[token]:
        index[token][n] = occurrence
  return index

index = build_index(news.text)

for posting in index:
  k = len(index[posting])
  idf = round(np.log(total_documents() + 1 / k), 2)
  index[posting]['idf'] = idf

vocabulary = index.keys()

In [0]:
def binary_representation(query, document):
  terms = query.split()
  doc_tokens = get_tokens(document)
  q = {}
  d = {}
  for term in terms:
    q[term] = 0
    d[term] = 0
    if term in vocabulary:
      q[term] = 1
    if term in doc_tokens:
      d[term] = 1  
  score = 0
  for term in terms:
    if q[term] > 0 and d[term] > 0:
      score += q[term] * d[term]  
  return score
  
def tf_representation(query, document):
  terms = query.split()
  doc_tokens = get_tokens(document)
  q = {}
  d = {}
  for term in terms:
    q[term] = 0
    d[term] = 0
    if term in vocabulary:
      q[term] = terms.count(term)
    if term in doc_tokens:
      d[term] = doc_tokens.count(term)
  score = 0
  for term in terms:
    if q[term] > 0 and d[term] > 0:
      score += q[term] * d[term]
  return score

def tf_idf_representation(query, document):
  terms = query.split()
  doc_tokens = get_tokens(document)
  q = {}
  d = {}
  for term in terms:
    q[term] = 0
    d[term] = 0
    if term in vocabulary:
      q[term] = terms.count(term)
    if term in doc_tokens:
      d[term] = doc_tokens.count(term)
  score = 0
  for term in terms:
    idf = index[term]['idf']
    if q[term] > 0 and d[term] > 0:
      score += q[term] * d[term] * idf
  return round(score, 2)

def bm25_representation(query, document, k):
  terms = query.split()
  doc_tokens = get_tokens(document)
  matched = [term for term in terms if term in doc_tokens]
  score = 0
  for match in matched:
    cwq = terms.count(match)
    cwd = doc_tokens.count(match)
    m = total_documents()
    dfw = len(index[match].keys()) - 1
    score += cwq * (((k + 1) * cwd) / (cwd  + k)) * np.log((m + 1) / dfw)
  return round(score, 2)

In [6]:
queries = ['jair bolsonaro', 'reforma previdência', 'forças armadas']

data = {
    'query': queries,
    'binary': [],
    'tf': [],
    'tf_idf': [],
    'bm25': []
}

for query in queries:
  n = 1
  binary = []
  tf = []
  tf_idf = []
  bm25 = []
  for document in news.text:
    binary.append((n, binary_representation(query, document)))
    tf.append((n, tf_representation(query, document)))
    tf_idf.append((n, tf_idf_representation(query, document)))
    bm25.append((n, bm25_representation(query, document, 10)))
    n += 1
  data['binary'].append(sorted(binary, key = lambda x: x[1], reverse=True)[:5])
  data['tf'].append(sorted(tf, key = lambda x: x[1], reverse=True)[:5])
  data['tf_idf'].append(sorted(tf_idf, key = lambda x: x[1], reverse=True)[:5])
  data['bm25'].append(sorted(bm25, key = lambda x: x[1], reverse=True)[:5])

pd.options.display.max_colwidth = 160
pd.DataFrame(data)

Unnamed: 0,query,binary,tf,tf_idf,bm25
0,jair bolsonaro,"[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2)]","[(1, 52), (1, 48), (1, 39), (1, 26), (1, 12)]","[(1, 287.04), (1, 264.96), (1, 215.28), (1, 143.52), (1, 66.24)]","[(1, 27.29), (1, 22.53), (1, 16.13), (1, 10.46), (1, 10.16)]"
1,reforma previdência,"[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2)]","[(1, 19), (1, 14), (1, 10), (1, 9), (1, 8)]","[(1, 104.88), (1, 77.28), (1, 55.2), (1, 49.68), (1, 44.16)]","[(1, 23.32), (1, 20.03), (1, 16.37), (1, 15.14), (1, 13.04)]"
2,forças armadas,"[(1, 2), (1, 2), (1, 2), (1, 2), (1, 2)]","[(1, 15), (1, 9), (1, 8), (1, 8), (1, 6)]","[(1, 82.8), (1, 49.68), (1, 44.16), (1, 44.16), (1, 33.12)]","[(1, 21.01), (1, 15.1), (1, 14.1), (1, 14.1), (1, 11.39)]"
