<a href="https://colab.research.google.com/github/valterlucena/recuperacao-informacao/blob/master/vectorial-model/vectorial_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
# Fetch the NLTK data packages at notebook start-up.
# NOTE(review): no visible cell uses punkt-based tokenization (the tokenizer
# below is a RegexpTokenizer) — confirm 'punkt' is still needed.
nltk.download('punkt')
# Backs the stopwords.words('portuguese') call further down.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Location of the news CSV. Despite the name, this is a local filesystem path
# (Colab's /content directory), not a URL.
DATA_URL = '/content/results.csv'
# Load the corpus; later cells read its `text` column and its row count.
news = pd.read_csv(DATA_URL)
# Bare last expression: shows the frame as the cell output.
news

In [25]:
def total_documents():
  """Return the number of documents (rows) in the global ``news`` frame."""
  row_labels = news.index
  return len(row_labels)

249

In [0]:
# A token is a run of word characters, optionally followed by hyphens or
# apostrophes and a further run of word characters (e.g. "guarda-chuva").
# The pattern is a raw string so the regex backslashes are not interpreted as
# (invalid) Python string escape sequences.
toker = RegexpTokenizer(r'''\w+[-']*\w*''')
# Portuguese stop-word list from NLTK; consulted by isValid.
stop_words = stopwords.words('portuguese')

def isValid(token):
  """Tell whether a token should be kept for indexing.

  A token is rejected when it appears in ``stop_words``, when it contains a
  digit, or when it has two characters or fewer.

  Args:
    token: the candidate token string.

  Returns:
    True if the token passes all three checks, False otherwise.
  """
  if token in stop_words:
    return False
  if re.search(r'\d', token) is not None:
    return False
  return len(token) > 2

def build_index(documents):
  """Build an inverted index holding term frequencies and IDF weights.

  Each document is lower-cased, tokenized with ``toker`` and filtered with
  ``isValid``. Documents are numbered from 1 in iteration order.

  Args:
    documents: iterable of document strings.

  Returns:
    dict mapping each term to ``{doc_id: (tf, idf)}``. ``tf`` is the number
    of times the term occurs in that document. ``idf`` is
    ``round(log((N + 1) / df), 5)`` with ``N = total_documents()`` and ``df``
    the number of documents containing the term, so every posting of a term
    carries the same IDF.
  """
  # Pass 1: count occurrences per document in a single sweep over its tokens.
  term_frequencies = {}
  for doc_id, document in enumerate(documents, start=1):
    counts = {}
    for token in toker.tokenize(document.lower()):
      if isValid(token):
        counts[token] = counts.get(token, 0) + 1
    for token, occurrence in counts.items():
      term_frequencies.setdefault(token, {})[doc_id] = occurrence

  if not term_frequencies:
    return {}

  # Pass 2: the document frequency is only known once every document has been
  # seen, so the IDF is attached afterwards.
  corpus_size = total_documents()
  index = {}
  for token, postings in term_frequencies.items():
    idf = round(np.log((corpus_size + 1) / len(postings)), 5)
    index[token] = {doc_id: (occurrence, idf) for doc_id, occurrence in postings.items()}
  return index

# Build the inverted index once over the `text` column of the news frame;
# the representation builders below take it as input.
index = build_index(news.text)

In [0]:
def build_binary_representation(index):
  """Turn the inverted index into 0/1 presence vectors, one per term.

  Args:
    index: dict mapping term -> {doc_id: ...}; only the doc ids are used.

  Returns:
    dict mapping each term to a list of length ``total_documents() + 1``
    holding 1 at the position of every document id in the term's posting
    list and 0 elsewhere.
  """
  vectors = {}
  for term, postings in index.items():
    size = total_documents() + 1
    presence = [0 for _ in range(size)]
    for doc_id in postings:
      presence[doc_id] = 1
    vectors[term] = presence
  return vectors

def build_tf_representation(index):
  """Turn the inverted index into raw term-frequency vectors, one per term.

  Args:
    index: dict mapping term -> {doc_id: pair}, where ``pair[0]`` is the
      term's frequency in that document.

  Returns:
    dict mapping each term to a list of length ``total_documents() + 1``
    holding ``pair[0]`` at each document id in the posting list and 0
    elsewhere.
  """
  vectors = {}
  for term, postings in index.items():
    frequencies = [0] * (total_documents() + 1)
    for doc_id in postings:
      entry = postings[doc_id]
      frequencies[doc_id] = entry[0]
    vectors[term] = frequencies
  return vectors

def build_tf_idf_representation(index):
  """Turn the inverted index into TF-IDF vectors, one per term.

  Args:
    index: dict mapping term -> {doc_id: pair}; the weight of a posting is
      ``pair[0] * pair[1]``.

  Returns:
    dict mapping each term to a list of length ``total_documents() + 1``
    holding that product at each document id in the posting list and 0
    elsewhere.
  """
  vectors = {}
  for term, postings in index.items():
    weights = [0] * (total_documents() + 1)
    for doc_id in postings:
      entry = postings[doc_id]
      term_frequency, inverse_doc_frequency = entry[0], entry[1]
      weights[doc_id] = term_frequency * inverse_doc_frequency
    vectors[term] = weights
  return vectors

def build_bm25_representation(index, k=1.2):
  """Turn the inverted index into BM25-weighted vectors, one per term.

  The weight of a posting is the BM25 term-frequency saturation (without
  document-length normalization) multiplied by the IDF stored in the index:
  ``((k + 1) * tf) / (tf + k) * idf``.

  Args:
    index: dict mapping term -> {doc_id: pair}, where ``pair[0]`` is the term
      frequency and ``pair[1]`` the IDF.
    k: saturation constant of the TF transform; larger values let repeated
      occurrences keep adding weight for longer. It is the same for every
      document.

  Returns:
    dict mapping each term to a list of length ``total_documents() + 1``
    holding the weight at each document id in the posting list and 0
    elsewhere.
  """
  bm25_representation = {}
  for term, postings in index.items():
    bm25_vector = [0] * (total_documents() + 1)
    for doc_id, pair in postings.items():
      tf, idf = pair[0], pair[1]
      # k is a fixed constant: the weight must not depend on the document id.
      bm25_vector[doc_id] = ((k + 1) * tf) / (tf + k) * idf
    # Stored once per term, outside the posting loop, so a term with an empty
    # posting list still gets its all-zero vector.
    bm25_representation[term] = bm25_vector
  return bm25_representation

In [0]:
# Bare expression: the returned dict (one vector per indexed term) is shown in
# full as the cell output.
# NOTE(review): this can be a very large output — consider displaying only a
# few terms.
build_binary_representation(index)