<a href="https://colab.research.google.com/github/valterlucena/recuperacao-informacao/blob/master/evaluation/evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
# Raw CSV holding the news corpus; later cells read its `text` and `url` columns.
DATA_URL = 'https://raw.githubusercontent.com/valterlucena/recuperacao-informacao/master/evaluation/results.csv'
news = pd.read_csv(DATA_URL)

In [0]:
def isValid(token):
  """Return True when `token` is longer than two characters and has no digit."""
  if len(token) <= 2:
    return False
  return re.search(r'\d', token) is None

def total_documents():
  """Return the number of rows (documents) in the global `news` frame."""
  return len(news.index)

def get_tokens(document):
  """Tokenize `document` into lowercase, filtered terms.

  The text is lowercased and split by the regex tokenizer; tokens rejected by
  `isValid` and Portuguese stop words are dropped.

  Args:
    document: raw text to tokenize.

  Returns:
    List of surviving tokens in document order (duplicates are kept).
  """
  # Raw string: in a plain literal `\w` is an invalid escape sequence, which
  # newer Python versions warn about. The resulting pattern is unchanged.
  toker = RegexpTokenizer(r'''\w+[-']*\w*''')
  # A set gives O(1) membership instead of scanning the stop-word list per token.
  stop_words = set(stopwords.words('portuguese'))
  return [
      token
      for token in toker.tokenize(document.lower())
      if isValid(token) and token not in stop_words
  ]

def build_index(documents):
  """Build an inverted index: term -> {document number: occurrences}.

  Documents are numbered from 1 in iteration order. (Note that
  `get_top_documents` numbers documents from 0; the numbers stored here are
  only used to count how many documents contain a term.)

  Args:
    documents: iterable of raw document texts.

  Returns:
    Dict mapping each token to a dict of `{document number: count}`.
  """
  from collections import Counter  # local import keeps this cell self-contained

  index = {}
  for n, document in enumerate(documents, start=1):
    # Count every token once per document instead of calling list.count()
    # for each token occurrence, which was quadratic in document length.
    for token, occurrence in Counter(get_tokens(document)).items():
      index.setdefault(token, {})[n] = occurrence
  return index

# Inverted index over the text of every news article.
index = build_index(news.text)

# Store each term's IDF inside its own postings dict under the extra key 'idf',
# so a postings dict holds one entry per containing document plus this key.
for term, postings in index.items():
  doc_freq = len(postings)
  postings['idf'] = round(np.log((total_documents() + 1) / doc_freq), 2)

# Live view over the indexed terms, used by the scorers for membership tests.
vocabulary = index.keys()

In [0]:
def binary_representation(query, document):
  '''
  Compute the similarity between the query and a document using the
  binary representation of the vector space model.

  The score is the number of whitespace-separated query terms (repetitions
  included) that are both in the vocabulary and among the document's tokens.
  '''
  doc_tokens = get_tokens(document)
  return sum(
      1
      for term in query.split()
      if term in vocabulary and term in doc_tokens
  )
  
def tf_representation(query, document):
  '''
  Compute the similarity between the query and a document using the
  TF (term frequency) representation of the vector space model.

  Each query term found in the vocabulary contributes its count in the query
  times its count in the document; a repeated query term is visited once per
  occurrence.
  '''
  query_terms = query.split()
  doc_tokens = get_tokens(document)
  score = 0
  for term in query_terms:
    if term not in vocabulary:
      continue
    # A term missing from the document has count 0 and adds nothing.
    score += query_terms.count(term) * doc_tokens.count(term)
  return score

def tf_idf_representation(query, document):
  '''
  Compute the similarity between the query and a document using the
  TF-IDF representation of the vector space model.

  Each matching query term contributes (count in query) * (count in document)
  * (the term's stored IDF). Terms with no stored IDF are skipped. The total
  is rounded to two decimals.
  '''
  query_terms = query.split()
  doc_tokens = get_tokens(document)
  score = 0
  for term in query_terms:
    query_tf = query_terms.count(term) if term in vocabulary else 0
    doc_tf = doc_tokens.count(term)
    if query_tf == 0 or doc_tf == 0:
      continue
    try:
      idf = index[term]['idf']
    except KeyError:
      # No IDF recorded for this term: it adds nothing to the score.
      continue
    score += query_tf * doc_tf * idf
  return round(score, 2)

def bm25_representation(query, document, k):
  '''
  Compute the similarity between the query and a document using a
  BM25-style weighting.

  For every query term present in the document the score adds
  count_in_query * ((k + 1) * count_in_doc / (count_in_doc + k))
  * log((M + 1) / df), where M is the corpus size and df the number of
  documents containing the term. The formula has no document-length
  normalization term. The total is rounded to two decimals.
  '''
  query_terms = query.split()
  doc_tokens = get_tokens(document)
  score = 0
  for term in query_terms:
    if term not in doc_tokens:
      continue
    query_count = query_terms.count(term)
    doc_count = doc_tokens.count(term)
    corpus_size = total_documents()
    # The postings dict also holds the 'idf' entry, hence the "- 1".
    doc_freq = len(index[term]) - 1
    saturation = ((k + 1) * doc_count) / (doc_count + k)
    score += query_count * saturation * np.log((corpus_size + 1) / doc_freq)
  return round(score, 2)

In [0]:
def get_top_10(results):
  """Return the ten `(document, score)` pairs with the highest scores.

  Pairs are ordered by descending score; ties keep their input order.
  """
  def score_of(pair):
    return pair[1]

  ranked = sorted(results, key=score_of, reverse=True)
  return ranked[:10]


def get_top_documents(query, documents):
  """Score every document against `query` with the four representations.

  Documents are identified by their 0-based position in `documents`.

  Returns:
    Dict with keys 'binary', 'tf', 'tf_idf' and 'bm25', each mapping to the
    top-10 list of `(position, score)` pairs for that representation.
    BM25 is evaluated with k = 10.
  """
  scored = {'binary': [], 'tf': [], 'tf_idf': [], 'bm25': []}
  for position, text in enumerate(documents):
    scored['binary'].append((position, binary_representation(query, text)))
    scored['tf'].append((position, tf_representation(query, text)))
    scored['tf_idf'].append((position, tf_idf_representation(query, text)))
    scored['bm25'].append((position, bm25_representation(query, text, 10)))
  return {name: get_top_10(pairs) for name, pairs in scored.items()}

In [0]:
# Sample query for the reciprocal-rank experiment; written in lowercase because
# the scoring functions only split it on whitespace.
query = "negacionismo histórico"

In [0]:
def reciprocal_rank(ranking):
  """Return the reciprocal rank `1 / ranking` for a 1-based rank.

  Args:
    ranking: 1-based position of the document, or a non-positive value
      (such as the -1 returned by `get_ranking`) when it was not retrieved.

  Returns:
    `1 / ranking` for positive ranks; 0.0 when the document was not retrieved.
  """
  # A non-positive rank means "not found": score it as 0 instead of returning
  # a negative value or dividing by zero.
  if ranking < 1:
    return 0.0
  return 1 / ranking

def get_ranking(document, results):
  """Return the 1-based position of `document` in a ranked result list.

  Args:
    document: document identifier to look for.
    results: iterable of `(document, score)` pairs, best first.

  Returns:
    The 1-based rank of the first pair whose document equals `document`,
    or -1 when it does not appear in `results`.
  """
  # enumerate advances the rank with every pair; the previous counter was
  # never incremented, so any document that was found reported rank 1.
  for rank, (d, _) in enumerate(results, start=1):
    if document == d:
      return rank
  return -1

In [8]:
# Top-10 (document position, score) pairs, one column per representation.
data = pd.DataFrame(get_top_documents(query, news.text))
data

Unnamed: 0,binary,tf,tf_idf,bm25
0,"(2, 2)","(2, 5)","(2, 19.09)","(2, 16.48)"
1,"(9, 1)","(24, 2)","(247, 4.83)","(247, 4.83)"
2,"(16, 1)","(40, 2)","(24, 4.6)","(24, 4.22)"
3,"(18, 1)","(55, 2)","(40, 4.6)","(40, 4.22)"
4,"(21, 1)","(9, 1)","(55, 4.6)","(55, 4.22)"
5,"(24, 1)","(16, 1)","(9, 2.3)","(9, 2.3)"
6,"(40, 1)","(18, 1)","(16, 2.3)","(16, 2.3)"
7,"(55, 1)","(21, 1)","(18, 2.3)","(18, 2.3)"
8,"(56, 1)","(56, 1)","(21, 2.3)","(21, 2.3)"
9,"(81, 1)","(81, 1)","(56, 2.3)","(56, 2.3)"


In [9]:
# Reciprocal rank of document 2 in the binary-representation results.
ranking_binary = get_ranking(2, data['binary'])
reciprocal_rank(ranking_binary)

1.0

In [10]:
# Reciprocal rank of document 2 in the TF results.
ranking_tf = get_ranking(2, data['tf'])
reciprocal_rank(ranking_tf)

1.0

In [11]:
# Reciprocal rank of document 2 in the TF-IDF results.
ranking_tf_idf = get_ranking(2, data['tf_idf'])
reciprocal_rank(ranking_tf_idf)

1.0

In [12]:
# Reciprocal rank of document 2 in the BM25 results.
ranking_bm25 = get_ranking(2, data['bm25'])
reciprocal_rank(ranking_bm25)

1.0

Para todas as abordagens, o valor do reciprocal rank foi 1, já que o documento escolhido, cujo índice é 2, foi sempre o primeiro colocado nos resultados da consulta.

In [13]:
# Answer key: one row per query, with a `docs` list of {'URL', 'level'} entries.
GABARITO_URL = 'https://raw.githubusercontent.com/valterlucena/recuperacao-informacao/master/evaluation/results_final.json'
gabarito = pd.read_json(GABARITO_URL)
# Widen the column display so the `docs` entries are readable in the output.
pd.options.display.max_colwidth = 160
gabarito

Unnamed: 0,query,docs
0,território palestino,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/31/politica/1554060705_325198.html', 'level': 9}, {'URL': 'https://brasil.elpais.com/brasil/2019/03/28/al..."
1,recessão mundial,"[{'URL': 'https://brasil.elpais.com/brasil/2019/04/02/internacional/1554203421_336330.html', 'level': 3}, {'URL': 'https://brasil.elpais.com/brasil/2019/04/..."
2,ditadura militar,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/29/politica/1553877780_122371.html', 'level': 8}, {'URL': 'https://brasil.elpais.com/brasil/2019/04/01/de..."
3,muro das lamentações,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/31/politica/1554060705_325198.html', 'level': 7}, {'URL': 'https://brasil.elpais.com/brasil/2019/04/01/in..."
4,brasil e argentina,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/30/opinion/1553971198_297214.html', 'level': 9}, {'URL': 'https://brasil.elpais.com/brasil/2019/03/29/pol..."
5,golpe militar,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/26/politica/1553638410_317117.html', 'level': 6}, {'URL': 'https://brasil.elpais.com/brasil/2019/04/01/de..."
6,governo bolsonaro,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/26/politica/1553557825_337887.html', 'level': 6}, {'URL': 'https://brasil.elpais.com/brasil/2019/04/01/po..."
7,ministro da economia,"[{'URL': 'https://brasil.elpais.com/brasil/2019/01/22/economia/1548182020_953667.html', 'level': 8}, {'URL': 'https://brasil.elpais.com/brasil/2019/03/27/po..."
8,prisão de Temer,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/22/opinion/1553273072_697119.html', 'level': 5}, {'URL': 'https://brasil.elpais.com/brasil/2019/03/22/pol..."
9,Congresso Nacional,"[{'URL': 'https://brasil.elpais.com/brasil/2019/03/27/politica/1553642262_663695.html', 'level': 7}, {'URL': 'https://brasil.elpais.com/brasil/2019/03/28/po..."


In [0]:
# Map each document's 0-based position in `news` to its URL, so a ranked
# position can be looked up against the URLs listed in the answer key.
# Iterating the column directly avoids materializing a row per document.
d = dict(enumerate(news['url']))

In [0]:
def is_relevant(document, gabarito):
  """Tell whether the document at position `document` is listed in `gabarito`.

  The document's URL (looked up in the global `d`) is checked against the URLs
  of every entry in `gabarito`, whichever query each entry belongs to.
  """
  return d[document] in get_url_relevant_docs(gabarito)

def get_url_relevant_docs(gabarito):
  """Return the 'URL' of every entry in `gabarito['docs']`, flattened.

  URLs are collected row by row, in order, across all rows; duplicates are kept.
  """
  urls = []
  for doc_list in gabarito['docs']:
    for doc in doc_list:
      urls.append(doc['URL'])
  return urls

def get_relevant_docs(gabarito):
  """Return the positions of all documents whose URL appears in `gabarito`.

  Args:
    gabarito: answer key whose `docs` column lists entries with a 'URL' key.

  Returns:
    List of keys of the global `d` (document positions) whose URL is listed
    in any row of `gabarito`, in `d`'s iteration order.
  """
  # Build the URL lookup once; it was previously rebuilt for every document.
  relevant_urls = set(get_url_relevant_docs(gabarito))
  return [i for i, url in d.items() if url in relevant_urls]

def intersection(a, b):
  """Return the elements of `a` that also occur in `b`.

  Order and repetitions follow `a`.
  """
  shared = []
  for el in a:
    if el in b:
      shared.append(el)
  return shared

def get_doc_indexes(results):
  """Return the document part of each `(document, score)` pair, in order."""
  indexes = []
  for doc, _ in results:
    indexes.append(doc)
  return indexes

def ap(query):
  """Score the top-10 results of `query` for each representation.

  For every representation the score is the fraction of its returned
  documents that are listed in the answer key `gabarito`.

  NOTE(review): relevance is taken from *all* rows of `gabarito`, not only
  the row for `query`, and the value is a precision over the returned list
  rather than a rank-sensitive average precision. Both are kept as-is so
  existing results stay comparable; confirm whether this is intended.

  Args:
    query: whitespace-separated query terms.

  Returns:
    Tuple `(binary, tf, tf_idf, bm25)` of scores in [0, 1].
  """
  results = get_top_documents(query, news.text)
  # A set makes each membership test in `intersection` O(1).
  relevant_docs = set(get_relevant_docs(gabarito))

  def precision(ranking):
    # An empty ranking has no hits; avoid dividing by zero.
    if not ranking:
      return 0.0
    return len(intersection(ranking, relevant_docs)) / len(ranking)

  # Reuse get_doc_indexes instead of repeating the same extraction four times
  # (the repeated comprehensions also reused the name of the global `d`).
  return tuple(
      precision(get_doc_indexes(results[name]))
      for name in ('binary', 'tf', 'tf_idf', 'bm25')
  )

def mean_ap(queries):
  """Average the per-query `ap` scores of each representation.

  Every query is first normalized with `get_tokens` and re-joined with single
  spaces, then scored with `ap`.

  Returns:
    Tuple `(binary, tf, tf_idf, bm25)` with each mean rounded to two decimals.
  """
  scores = {'binary': [], 'tf': [], 'tf_idf': [], 'bm25': []}
  for raw_query in queries:
    cleaned = ' '.join(get_tokens(raw_query))
    # `ap` returns its scores in the same order as the keys of `scores`.
    for name, value in zip(scores, ap(cleaned)):
      scores[name].append(value)
  return tuple(round(np.mean(values), 2) for values in scores.values())

In [0]:
# Mean score of each representation over every query in the answer key.
map_binary, map_tf, map_tf_idf, map_bm25 = mean_ap(gabarito['query'])

In [41]:
# One-row summary table with the mean score of each representation.
data = {
    'Binary': [map_binary],
    'TF': [map_tf],
    'TF-IDF': [map_tf_idf],
    'BM25': [map_bm25]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Binary,TF,TF-IDF,BM25
0,0.19,0.2,0.21,0.22
