<a href="https://colab.research.google.com/github/valterlucena/recuperacao-informacao/blob/master/vectorial-model/vectorial_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK resources at runtime. `stopwords` backs the
# stopwords.words('portuguese') lookup used during tokenization.
# NOTE(review): `punkt` does not appear to be needed by the code in view
# (only RegexpTokenizer is used) — confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Introdução

Nesta atividade iremos exercitar algumas instanciações do modelo vetorial.

Primeiramente, vamos importar nossos dados.

In [0]:
# Location of the CSV holding the document collection.
DATA_URL = 'https://raw.githubusercontent.com/Benardi/ri_lab_01/master/output/results.csv'
# Load the collection into a DataFrame; its `text` column is what the rest of
# the notebook indexes and scores.
news = pd.read_csv(DATA_URL)

Agora iremos criar nosso índice invertido, para ser utilizado mais adiante. Utilizaremos a função tokenize da biblioteca NLTK associada a uma expressão regular, que considerará como token apenas as sequências de caracteres não-especiais (com exceção do hífen e do apóstrofo) que não contenham dígitos, que não formem stopwords e que possuam mais que 2 caracteres.
Além disso, refinaremos nosso índice para que o mesmo contenha o *inverse document frequency* (IDF) de cada termo.

In [0]:
def isValid(token):
  """Tell whether a token should be kept.

  A token is accepted only when it has more than two characters and
  contains no digit.
  """
  if len(token) <= 2:
    return False
  has_digit = re.search(r'\d', token) is not None
  return not has_digit

def total_documents():
  """Return the number of rows (documents) in the loaded `news` collection."""
  corpus_size = len(news)
  return corpus_size

def get_tokens(document):
  """Split a document into the tokens used for indexing and scoring.

  The text is lowercased and tokenized with a regular expression that keeps
  runs of word characters, optionally joined by hyphens or apostrophes.
  Tokens rejected by `isValid` and Portuguese stopwords are dropped.

  Args:
    document: raw text of one document.

  Returns:
    List of tokens in order of appearance, duplicates preserved.
  """
  # Raw string: '\w' inside a plain literal is an invalid escape sequence and
  # triggers a warning on recent Python versions. The pattern is unchanged.
  toker = RegexpTokenizer(r'''\w+[-']*\w*''')
  # A set gives constant-time stopword checks instead of scanning the list
  # once per token.
  stop_words = set(stopwords.words('portuguese'))
  return [
      token
      for token in toker.tokenize(document.lower())
      if isValid(token) and token not in stop_words
  ]

def build_index(documents):
  """Build an inverted index mapping each token to its per-document counts.

  Documents are numbered from 1, following iteration order.

  Args:
    documents: iterable of raw document texts.

  Returns:
    Dict shaped as {token: {document_number: occurrences_in_that_document}}.
  """
  from collections import Counter  # local import keeps this block self-contained

  index = {}
  for doc_id, document in enumerate(documents, start=1):
    # Count all tokens in a single pass. Calling list.count once per token
    # made indexing quadratic in the document length.
    for token, occurrence in Counter(get_tokens(document)).items():
      index.setdefault(token, {})[doc_id] = occurrence
  return index

index = build_index(news.text)

# Store each term's IDF next to its postings, under the 'idf' key.
# The document frequency is read before the key is added, so it counts
# documents only.
corpus_size = total_documents()
for postings in index.values():
  document_frequency = len(postings)
  postings['idf'] = round(np.log((corpus_size + 1) / document_frequency), 2)

Um vocabulário dos termos presentes nos documentos nos auxiliará em algumas das instanciações que faremos.

In [0]:
# Live view over the index keys (the distinct indexed terms); membership tests
# on it are hash lookups and it reflects later changes to `index`.
vocabulary = index.keys()

# Instanciações



In [0]:
def binary_representation(query, document):
  """Score a document against a query with the binary vector-space model.

  Each distinct query term adds 1 to the score when it is present both in
  the vocabulary and in the document. A term repeated in the query counts
  once, since both vectors are binary.

  Query terms come from `str.split` and are compared verbatim against the
  lowercased document tokens, so callers should pass lowercase terms.

  Args:
    query: whitespace-separated query terms.
    document: raw text of the document to score.

  Returns:
    Integer number of distinct query terms matched by the document.
  """
  # Set membership avoids scanning the token list once per query term.
  doc_tokens = set(get_tokens(document))
  measure = 0
  # dict.fromkeys removes repeated query terms while keeping their order.
  for term in dict.fromkeys(query.split()):
    if term in vocabulary and term in doc_tokens:
      measure += 1
  return measure
  
def tf_representation(query, document):
  """Score a document against a query with the TF vector-space model.

  The score is the dot product of the raw term-frequency vectors: for every
  distinct query term found in the vocabulary and in the document, its count
  in the query times its count in the document.

  Query terms come from `str.split` and are compared verbatim against the
  lowercased document tokens, so callers should pass lowercase terms.

  Args:
    query: whitespace-separated query terms.
    document: raw text of the document to score.

  Returns:
    Integer similarity score (0 when nothing matches).
  """
  from collections import Counter  # local import keeps this block self-contained

  query_counts = Counter(query.split())
  doc_counts = Counter(get_tokens(document))
  measure = 0
  # Iterate over distinct terms: looping over the raw term list would add the
  # same product once per repetition and square the query frequency.
  for term, query_tf in query_counts.items():
    if term in vocabulary and term in doc_counts:
      measure += query_tf * doc_counts[term]
  return measure

def tf_idf_representation(query, document):
  """Score a document against a query with the TF-IDF vector-space model.

  For every distinct query term found in the vocabulary and in the document,
  adds (count in query) * (count in document) * (the term's stored IDF).
  Query terms that are not in the vocabulary contribute nothing.

  Query terms come from `str.split` and are compared verbatim against the
  lowercased document tokens, so callers should pass lowercase terms.

  Args:
    query: whitespace-separated query terms.
    document: raw text of the document to score.

  Returns:
    Similarity score rounded to two decimal places.
  """
  from collections import Counter  # local import keeps this block self-contained

  query_counts = Counter(query.split())
  doc_counts = Counter(get_tokens(document))
  measure = 0
  # Iterate over distinct terms so a repeated query term is weighted by its
  # frequency once, not squared.
  for term, query_tf in query_counts.items():
    if term not in vocabulary or term not in doc_counts:
      continue
    # The IDF is looked up only after the vocabulary check, so an unknown
    # query term no longer raises KeyError.
    measure += query_tf * doc_counts[term] * index[term]['idf']
  return round(measure, 2)

def bm25_representation(query, document, k):
  """Score a document against a query with a BM25-style weighting.

  For every distinct query term present in the document and the vocabulary:

    c(w, q) * ((k + 1) * c(w, d)) / (c(w, d) + k) * ln((M + 1) / df(w))

  where M is the number of documents and df(w) is the number of documents
  containing w. No document-length normalization is applied.

  Query terms come from `str.split` and are compared verbatim against the
  lowercased document tokens, so callers should pass lowercase terms.

  Args:
    query: whitespace-separated query terms.
    document: raw text of the document to score.
    k: term-frequency saturation parameter.

  Returns:
    Similarity score rounded to two decimal places.
  """
  from collections import Counter  # local import keeps this block self-contained

  query_counts = Counter(query.split())
  doc_counts = Counter(get_tokens(document))
  m = total_documents()
  measure = 0
  # Iterate over distinct terms so a repeated query term is weighted by its
  # frequency once, not squared.
  for term, cwq in query_counts.items():
    cwd = doc_counts[term]
    # Skip terms absent from the document or from the index, matching the
    # vocabulary guard used by the other scoring functions.
    if cwd == 0 or term not in vocabulary:
      continue
    # The postings dict may also hold the 'idf' entry; count document keys
    # only, so the result is right whether or not it has been added.
    dfw = sum(1 for key in index[term] if key != 'idf')
    measure += cwq * (((k + 1) * cwd) / (cwd + k)) * np.log((m + 1) / dfw)
  return round(measure, 2)

# Consultas

As consultas realizadas serão:

* Jair Bolsonaro
* Reforma previdência
* Forças armadas

Para a consulta utilizando o BM25, utilizaremos k = 10.

In [15]:
queries = [
    'jair bolsonaro',
    'reforma previdência',
    'forças armadas',
]

# One column for the query text plus one per ranking model. Each column starts
# as its own empty list.
data = {column: [] for column in ('query', 'binary', 'tf', 'tf_idf', 'bm25')}

def get_top_5(results):
  """Return the five entries with the highest score, best first.

  Each entry is indexed at position 1 for its score. Entries with equal
  scores keep their original relative order.
  """
  def score_of(entry):
    return entry[1]

  ranked = sorted(results, key=score_of, reverse=True)
  return ranked[:5]

# Score every document against each query under all four models and keep the
# five best (document_number, score) pairs per model.
for query in queries:
  scores = {'binary': [], 'tf': [], 'tf_idf': [], 'bm25': []}
  # Documents are numbered from 1.
  for doc_id, text in enumerate(news.text, start=1):
    scores['binary'].append((doc_id, binary_representation(query, text)))
    scores['tf'].append((doc_id, tf_representation(query, text)))
    scores['tf_idf'].append((doc_id, tf_idf_representation(query, text)))
    scores['bm25'].append((doc_id, bm25_representation(query, text, 10)))
  data['query'].append(query)
  for model, ranking in scores.items():
    data[model].append(get_top_5(ranking))

# Widen the columns so the ranking lists are not truncated when displayed.
pd.options.display.max_colwidth = 160
pd.DataFrame(data)

Unnamed: 0,query,binary,tf,tf_idf,bm25
0,jair bolsonaro,"[(1, 2), (2, 2), (25, 2), (86, 2), (126, 2)]","[(151, 52), (207, 48), (166, 39), (19, 26), (42, 12)]","[(207, 79.74), (151, 76.2), (166, 54.0), (19, 34.32), (216, 17.1)]","[(207, 27.29), (151, 22.53), (166, 16.13), (19, 10.46), (216, 10.16)]"
1,reforma previdência,"[(37, 2), (95, 2), (138, 2), (140, 2), (166, 2)]","[(37, 19), (138, 14), (166, 10), (248, 9), (205, 8)]","[(37, 43.14), (138, 31.0), (166, 22.3), (248, 19.96), (205, 17.4)]","[(37, 23.32), (138, 20.03), (166, 16.37), (248, 15.14), (205, 13.04)]"
2,forças armadas,"[(1, 2), (6, 2), (12, 2), (25, 2), (42, 2)]","[(150, 15), (25, 9), (166, 8), (208, 8), (1, 6)]","[(150, 33.34), (25, 19.87), (166, 17.96), (208, 17.96), (1, 13.47)]","[(150, 21.01), (25, 15.1), (166, 14.1), (208, 14.1), (1, 11.39)]"
