In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
import collections
import seaborn as sns
nltk.download('rslp')
from nltk.stem import RSLPStemmer
import heapq as hp
from collections import Counter,OrderedDict
import math

[nltk_data] Downloading package rslp to /home/vinicius/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [2]:
data = pd.read_csv('./results.csv')

### 1. Reconstruir o índice considerando o conjunto de dados que indicamos. Esses são os dados coletados por Bernardi e os estaremos usando para facilitar a correção da atividade. Se você já estiver usando esses dados não precisa reconstruir o índice;

In [6]:
#generate tokens from a document
def parse(doc):
    words = ''.join(str(v) for v in doc).lower()
    return RegexpTokenizer(r'[A-zÀ-ú\d]{4,}').tokenize(words)

# produces: term:[(doc, frequency)]
#build an index for a collection of documents
def build_index_with_frequency(docs):
    index = {}
    n = 0
    for text in docs.text:
        n +=1
        tokens = parse(text)
        for token in tokens:
            if (token in index):
                hasDoc = False
                for i in range(len(index[token])):
                    tup = index[token][i]
                    if (tup[0] == n):
                        index[token][i] = (n, tup[1] + 1)
                        hasDoc = True
                if (not hasDoc):
                    index[token].append((n,1))
            else:
                index[token] = [(n,1)]
    return index
index = build_index_with_frequency(data)

### 2. Refinar o índice invertido de forma a também incluir o IDF (inverse document frequency) de cada termo do dicionário; 

In [11]:
# produces: term:[(doc, frequency), idf]
#build an index for a collection of documents
def build_index_with_frequency_and_idf(docs):
    index = {}
    n = 0
    M = len(docs)
    for text in docs.text:
        n +=1
        tokens = parse(text)
        for token in tokens:
            if (token in index):
                hasDoc = False
                for i in range(len(index[token])):
                    tup = index[token][i]
                    if (tup[0] == n):
                        index[token][i] = (n, tup[1] + 1)
                        hasDoc = True
                if (not hasDoc):
                    index[token].append((n,1))
            else:
                index[token] = [(n,1)]
    
    for elem in index:
        k = len(index[elem])
        idf = math.log((M+1)/k)
        index[elem].append(idf)
    return index

index = build_index_with_frequency_and_idf(data)

### 3. Implementar as seguintes versões do modelo vetorial:

#### Representação binária:

In [26]:
def binary_rep(query, doc):
    score = 0
    query_t = query.split()
    doc_t = doc.split()
    for t in query_t:
        score += (t in doc_t)
    return score
#for el in data.text:
#    print(binary_vsm("deputado federal", el))

#### TF

In [31]:
def tf(query, doc):
    score = 0
    doc_t = doc.split()
    query_t = query.split()
    for t in query_t:
        score += doc_t.count(t)
    return score
#for el in data.text:
#    print(tf("deputado federal", el))

#### TF-IDF

In [35]:
def tfidf(query, doc, index):
    score = 0
    doc_t = doc.split()
    query_t = query.split()
    for t in query_t:
        count = doc_t.count(t)
        if t in index:
            score += count * index[t][-1]
    return round(score, 2)
#index = build_index_with_frequency_and_idf(data)
#for el in data.text:
#    print(tfidf("deputado federal", el, index))

#### BM25

In [40]:
def bm25(query, doc, index):
    k = 1
    M = len(data)
    doc_t = doc.split()
    query_t = query.split()
    words = list(set(doc_t) & set(query_t))
    score = 0
    for word in words:
        cwq = query_t.count(word)
        cwd = doc_t.count(word)
        dfw = len(index[word][:-1])
        y = ((k+1)*cwd)/(cwd+k)
        score += cwq * y * math.log((M+1)/dfw)
    return score
#index = build_index_with_frequency_and_idf(data)
#for el in data.text:
#    print(bm25("deputado federal", el, index))

### 4. Execute os algoritmos separadamente em 3 consultas de sua escolha e retorne os top-5 documentos mais similares à cada consulta

In [53]:
queries = ['campeonato brasileiro', 'ministério público', 'jair bolsonaro']
def top_5(query, index):
    doc_id = 0
    bin_res = []
    tf_res = []
    tdidf_res = []
    bm25_res = []
    for doc in data.text:
        bin_res.append((binary_rep(query, doc), doc_id))
        tf_res.append((tf(query, doc), doc_id))
        tdidf_res.append((tfidf(query, doc, index), doc_id))
        bm25_res.append((bm25(query, doc, index), doc_id))
        doc_id += 1
    bin_res = sorted(bin_res, reverse=True)
    tf_res = sorted(tf_res, reverse=True)
    tdidf_res = sorted(tdidf_res, reverse=True)
    bm25_res = sorted(bm25_res, reverse=True)
    return bin_res[:5], tf_res[:5], tdidf_res[:5], bm25_res[:5]
top5_binary = [0,0,0]
top5_tf = [0,0,0]
top5_tfidf = [0,0,0]
top5_bm25 = [0,0,0]
for i in range(len(queries)):
    q = queries[i]
    top5_binary[i], top5_tf[i], top5_tfidf[i], top5_bm25[i] = top_5(q, index)

    
query_df = pd.DataFrame()

query_df['Query'] = queries
query_df['Binary'] = top5_binary
query_df['TF'] = top5_tf
query_df['TF-IDF'] = top5_tfidf
query_df['BM25'] = top5_bm25

query_df

Unnamed: 0,Query,Binary,TF,TF-IDF,BM25
0,campeonato brasileiro,"[(2, 54), (1, 237), (1, 224), (1, 223), (1, 222)]","[(6, 54), (4, 163), (4, 104), (4, 41), (3, 209)]","[(15.73, 54), (6.06, 163), (6.06, 104), (6.06,...","[(7.86574377189595, 54), (3.7297014486341915, ..."
1,ministério público,"[(2, 216), (2, 208), (1, 246), (1, 245), (1, 2...","[(10, 196), (4, 216), (4, 198), (4, 109), (4, ...","[(12.45, 196), (5.49, 216), (5.28, 123), (4.98...","[(3.6274530004379706, 216), (3.005055601014875..."
2,jair bolsonaro,"[(0, 248), (0, 247), (0, 246), (0, 245), (0, 2...","[(0, 248), (0, 247), (0, 246), (0, 245), (0, 2...","[(0.0, 248), (0.0, 247), (0.0, 246), (0.0, 245...","[(0, 248), (0, 247), (0, 246), (0, 245), (0, 2..."
