In [149]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
import collections
import seaborn as sns
nltk.download('rslp')
from nltk.stem import RSLPStemmer
import heapq as hp

[nltk_data] Downloading package rslp to /home/vinicius/nltk_data...
[nltk_data]   Package rslp is already up-to-date!


In [37]:
data = pd.read_csv('../lab3/results.csv')

In [67]:
#save the index to a csv file with the given name
def save_index(index, filename):    
    words = []
    docs = []
    for key in index:
        words.append(key)
        docs.append(str(index[key]))
    d = {'words': words, 'docs': docs}
    df = pd.DataFrame(d, columns = ["words", "docs"])
    df.to_csv(filename)
    return df

## Questão 1:

In [99]:
#generate tokens from a document
def parse(doc):
    words = ''.join(str(v) for v in doc).lower()
    return RegexpTokenizer(r'[A-zÀ-ú\d]{,}').tokenize(words)

#build an index for a collection of documents
def build_index(docs):
    index = {}
    n = 0
    for text in docs.texto:
        n +=1
        tokens = list(set(parse(text)))
        for token in tokens:
            if (token in index):
                index[token].append(n)
            else:
                index[token] = [n]
    return index

In [70]:
index = build_index(data)
save_index(index, "index.csv")

## Questão 2:

In [100]:
#build an index for a collection of documents
def build_index_with_frequency(docs):
    index = {}
    n = 0
    for text in docs.texto:
        n +=1
        tokens = parse(text)
        for token in tokens:
            if (token in index):
                hasDoc = False
                for i in range(len(index[token])):
                    tup = index[token][i]
                    if (tup[0] == n):
                        index[token][i] = (n, tup[1] + 1)
                        hasDoc = True
                if (not hasDoc):
                    index[token].append((n,1))
            else:
                index[token] = [(n,1)]
    return index
build_index_with_frequency(data)

{'': [(1, 634),
  (2, 98),
  (3, 358),
  (4, 109),
  (5, 465),
  (6, 113),
  (7, 410),
  (8, 233),
  (9, 294),
  (10, 334),
  (11, 378),
  (12, 1210),
  (13, 484),
  (14, 300),
  (15, 1104),
  (16, 1360),
  (17, 397),
  (18, 494),
  (19, 1067),
  (20, 570),
  (21, 823),
  (22, 883),
  (23, 824),
  (24, 1248),
  (25, 1827),
  (26, 1486),
  (27, 1290),
  (28, 1613),
  (29, 1294),
  (30, 929),
  (31, 330),
  (32, 1204),
  (33, 669),
  (34, 926),
  (35, 977),
  (36, 1152),
  (37, 883),
  (38, 1139),
  (39, 1894),
  (40, 1407),
  (41, 1298),
  (42, 906),
  (43, 481),
  (44, 952),
  (45, 844),
  (46, 457),
  (47, 267),
  (48, 1954),
  (49, 2228),
  (50, 856),
  (51, 233),
  (52, 873),
  (53, 2121),
  (54, 1027),
  (55, 2220),
  (56, 874),
  (57, 633),
  (58, 654),
  (59, 421),
  (60, 499),
  (61, 787),
  (62, 861),
  (63, 2169),
  (64, 1057),
  (65, 1527),
  (66, 1847),
  (67, 1358),
  (68, 799),
  (69, 767),
  (70, 1324),
  (71, 550),
  (72, 801),
  (73, 922),
  (74, 692),
  (75, 649),
  (7

In [111]:
def document_at_time_retrieval(query, index, n_results):
    inverted_lists = []
    results = []
    
    for term in query.split():
        inverted_lists.append(index[term])
    
    documents = []
    for key in index:
        il = index[key]
        for doc in il:
            documents.append(doc[0])
    documents = list(set(documents))
    
    for doc in documents:
        score = 0
        for il in inverted_lists:
            for d in il:
                if (d[0] == doc):
                    score += d[1]
        results.append((score, doc))
    
    results = sorted(results, reverse=True)
    r = []
    for i in range(n_results):
        r.append(results[i][1])
    return r
            
document_at_time_retrieval("jogo belo", build_index_with_frequency(data), 10)

[189, 175, 1, 113, 103, 24, 198, 187, 166, 14]

In [103]:
def term_at_time_retrieval(query, index, n_results):
    doc_scores = {}
    inverted_lists = []
    results = []
    
    for term in query.split():
        inverted_lists.append(index[term])
        for il in inverted_lists:
            for doc in il:
                doc_id = doc[0]
                doc_freq = doc[1]
                if (doc_id in doc_scores):
                    doc_scores[doc_id] += doc_freq
                else:
                    doc_scores[doc_id] = doc_freq
    
    for doc in doc_scores:
        results.append((doc_scores[doc], doc))
    
    results = sorted(results, reverse=True)
    r = []
    for i in range(n_results):
        r.append(results[i][1])
    return r  
#document_at_time_retrieval("belo jogo", build_index_with_frequency(data), 10)

[189, 175, 1, 113, 103, 24, 198, 187, 166, 14]

In [154]:
query_terms = ["jogo", "partida", "futebol", "campo", "universidade"]
document_at_time_results = []
term_at_time_results = []
index = build_index_with_frequency(data)
for term in query_terms:
    document_at_time_results.append(document_at_time_retrieval(term, index, 10))
    term_at_time_results.append(term_at_time_retrieval(term, index, 10))


print(document_at_time_results)
print(term_at_time_results)

[[189, 175, 1, 113, 103, 24, 198, 187, 166, 14], [189, 193, 127, 9, 4, 196, 191, 190, 183, 177], [195, 197, 176, 200, 187, 12, 7, 191, 189, 198], [144, 107, 67, 191, 180, 126, 84, 200, 198, 197], [157, 107, 153, 178, 141, 96, 87, 26, 23, 186]]
[[189, 175, 1, 113, 103, 24, 198, 187, 166, 14], [189, 193, 127, 9, 4, 196, 191, 190, 183, 177], [195, 197, 176, 200, 187, 12, 7, 191, 189, 198], [144, 107, 67, 191, 180, 126, 84, 200, 198, 197], [157, 107, 153, 178, 141, 96, 87, 26, 23, 186]]


Podemos ver nos resultados acima que obetemos os mesmos resultados oriundos de algoritmos distintos, isso nos dá uma boa confiança na corretude das implementações.

TODO: comparar tempo médio de execução e uso de memória dos algoritmos

## Questão 3:

In [138]:
def doc_at_time_conj_retrieval(query, index, n_results):
    inverted_lists = []
    results = []
    for term in query.split():
        inverted_lists.append(index[term])
        
    queries = [item for sublist in inverted_lists for item in sublist]
    queries.sort()
    print(queries)
    for i in range(len(queries)):
        score = 0
        d = queries.pop()
        repeat = 1
        for ind in queries:
            if ind[0] == d[0]:
                score += ind[1]
                repeat += 1
        if score != 0 and repeat == len(inverted_lists):
            doc_score += d[1]
            results.append((score, d[0]))
    
    results = sorted(results, reverse=True)
    r = []
    print(results)
    for i in range(n_results):
        r.append(results[i][0])
    return r 

In [165]:
and_query_terms = ["partida de futebol", "governo federal", "o presidente", "universidade federal", "as vias"]
and_results = []
for query in and_query_terms:
    and_results.append(conj_query(term, index, 10))
print(and_results)

[(1, 6), (2, 2), (3, 2), (4, 1), (6, 1), (9, 1), (10, 1), (11, 1), (12, 1), (14, 2), (24, 3), (39, 1), (49, 1), (66, 1), (74, 1), (76, 1), (88, 1), (91, 1), (103, 4), (105, 1), (107, 1), (110, 1), (113, 5), (152, 1), (166, 2), (171, 1), (175, 6), (180, 1), (187, 2), (189, 8), (190, 1), (191, 1), (192, 1), (197, 1), (198, 2), (200, 1)]
[]


IndexError: list index out of range