# Projet Search-Engine - Fondement de la Recherche d'Information-WEB

## Requirements installation
Run:
```
pip3 install -r requirements.txt
```

# Tâche 1 : Création d’un index inversé et d’un moteur de recherche booléen et vectoriel

## 2.1 Traitements linguistiques

### Tokenisation des collections

#### CACM

importing datasets

In [None]:
import file_reading
import tokenization

CACM (quick import and tokenization)

In [None]:
# Read the raw CACM collection, then tokenize it twice so the following
# cells can compare the two tokenizers on the same input.
CACM_documents = file_reading.read_cacm()  # raw dataset
CACM_tokens_NLTK = tokenization.tokenize_CACM(CACM_documents)  # tokenization with NLTK and a stop list
CACM_tokens = tokenization.tokenize_no_nltk_CACM(CACM_documents)  # manual tokenization (stop list)

#### CS276

Storing tokenized dataset into a pickle file that's reused

In [None]:
import pickle
import os

# Cache the CS276 corpus and its tokenization on disk so that re-running the
# notebook skips the read/tokenize steps when the pickle files already exist.
# NOTE: pickle.load can execute arbitrary code embedded in the file; only load
# cache files that were produced by this notebook.
os.makedirs("tokenized_data", exist_ok=True)  # the dumps below need the directory

if os.path.exists("tokenized_data/CS276_docs.pickle"):
    print("getting CS276 docs pickle")
    # `with` closes the file handle even if unpickling fails.
    with open("tokenized_data/CS276_docs.pickle", 'rb') as pickle_file:
        CS276_docs = pickle.load(pickle_file)
else:
    print("reading CS276")
    CS276_docs = file_reading.read_cs276()
    print("storing CS276 docs in a pickle")
    with open("tokenized_data/CS276_docs.pickle", 'wb') as pickle_file:
        pickle.dump(CS276_docs, pickle_file)

if os.path.exists("tokenized_data/CS276_tokens.pickle"):
    print("getting CS276 tokens pickle")
    with open("tokenized_data/CS276_tokens.pickle", 'rb') as pickle_file:
        CS276_tokens = pickle.load(pickle_file)
else:
    print("making CS276 tokens")
    CS276_tokens = tokenization.tokenize_CS276(CS276_docs)
    print("storing CS276 in a pickle")
    with open("tokenized_data/CS276_tokens.pickle", "wb") as pickle_file:
        pickle.dump(CS276_tokens, pickle_file)
print("done")

### Nombre de tokens

In [None]:
# For each tokenization, print the sum of len() over every element obtained
# by iterating the token collection.
# NOTE(review): the CS276 Heaps' law cell calls .values() on a copy of
# CS276_tokens, so these collections look like dicts; iterating a dict yields
# its keys, which would make these sums add up key lengths — confirm.
print("Nombre de tokens pour CACM (sans NLTK):")
T1 = sum(map(len, CACM_tokens))
print(T1)

print("Nombre de tokens pour CACM (avec NLTK):")
T1 = sum(map(len, CACM_tokens_NLTK))
print(T1)

print("Nombre de tokens pour CS276:")
T1 = sum(map(len, CS276_tokens))
print(T1)

### Taille du vocabulaire

In [None]:
# Report the vocabulary size of each tokenization as the number of entries
# in its token collection; the label and the value go on separate lines.
M1 = len(CACM_tokens)
print("Vocabulaire pour CACM: ", M1, sep="\n")

M1 = len(CACM_tokens_NLTK)
print("Vocabulaire pour CACM (avec NLTK): ", M1, sep="\n")

M1 = len(CS276_tokens)
print("Vocabulaire pour CS276_tokens: ", M1, sep="\n")

### Loi de Heaps et estimation de la taille du vocabulaire pour 1 million de tokens

CACM

In [None]:
# Tokenize the first half of the CACM documents with the manual tokenizer and
# hand both the full and the half tokenization to print_heap_law.
half_count = len(CACM_documents) // 2
d_half = tokenization.tokenize_no_nltk_CACM(CACM_documents[:half_count])
tokenization.print_heap_law(CACM_tokens, d_half, "CACM without NLTK")

In [None]:
# Tokenize the first half of the CACM documents with the NLTK tokenizer and
# hand both the full and the half tokenization to print_heap_law.
half_count = len(CACM_documents) // 2
d_half = tokenization.tokenize_CACM(CACM_documents[:half_count])
tokenization.print_heap_law(CACM_tokens_NLTK, d_half, "CACM with NLTK")

CS276

In [None]:
# Heaps' law on CS276: compare the full token table with the one restricted
# to the second half of the corpus.
# NOTE(review): assumes CS276_tokens maps each token to an iterable of integer
# document ids numbered from 0 — confirm against tokenization.tokenize_CS276.
ids_to_delete = set(range(len(CS276_docs) // 2))  # set: constant-time membership

# Build the half-corpus table explicitly. filter() is lazy and returns a new
# iterator, so calling it without keeping the result removes nothing.
CS276_tokens_heap = {}
for token, document_ids in CS276_tokens.items():
    kept_ids = [d_id for d_id in document_ids if d_id not in ids_to_delete]
    if kept_ids:  # a token absent from the kept half is not part of its vocabulary
        CS276_tokens_heap[token] = kept_ids

# Pass the CS276 half-corpus table, not the d_half left over from the CACM cells.
tokenization.print_heap_law(CS276_tokens, CS276_tokens_heap, "CS276")

### Graphes frequence / rang et log(frequence) / log(rang)

In [None]:
# Compute the word-frequency table of each tokenization, in the same order
# as the three collections are listed.
freq_CACM, freq_CACM_NLTK, freq_CS276 = map(
    tokenization.word_frequency,
    (CACM_tokens, CACM_tokens_NLTK, CS276_tokens),
)

#### CACM without NLTK

In [None]:
import matplotlib.pyplot as plt
# Rank/frequency curve for CACM (manual tokenizer).
# The rank of a word is its position after sorting the Token_sorter objects
# (NOTE(review): presumably most frequent first — confirm in Token_sorter).
list_word = sorted(
    tokenization.Token_sorter(word, freq_CACM[word]) for word in freq_CACM.keys()
)
X = range(1, len(list_word) + 1)
Y = [entry.f for entry in list_word]
plt.plot(X, Y)
plt.title('pour CACM sans NLTK')
plt.xlabel('rang')
plt.ylabel('frequence')
plt.show()

In [None]:
from numpy import log, array
# Log-log version of the CACM (manual tokenizer) curve; X and Y are the
# values left in the namespace by the previous plotting cell.
X2, Y2 = log(array(X)), log(array(Y))
plt.plot(X2, Y2)
plt.title('pour CACM sans NLTK')
plt.xlabel('log(rang)')
plt.ylabel('log(frequence)')
plt.show()

#### CACM with NLTK

In [None]:
# Rank/frequency curve for CACM (NLTK tokenizer).
# The rank of a word is its position after sorting the Token_sorter objects.
list_word = sorted(
    tokenization.Token_sorter(word, freq_CACM_NLTK[word])
    for word in freq_CACM_NLTK.keys()
)
X = range(1, len(list_word) + 1)
Y = [entry.f for entry in list_word]
plt.plot(X, Y)
plt.title('pour CACM avec NLTK')
plt.xlabel('rang')
plt.ylabel('frequence')
plt.show()

In [None]:
from numpy import log, array
# Log-log version of the CACM (NLTK tokenizer) curve; X and Y are the values
# left in the namespace by the previous plotting cell.
X2, Y2 = log(array(X)), log(array(Y))
plt.plot(X2, Y2)
plt.title('pour CACM avec NLTK')
plt.xlabel('log(rang)')
plt.ylabel('log(frequence)')
plt.show()

#### CS276

In [None]:
# Rank/frequency curve for CS276.
# The rank of a word is its position after sorting the Token_sorter objects.
list_word = sorted(
    tokenization.Token_sorter(word, freq_CS276[word]) for word in freq_CS276.keys()
)
X = range(1, len(list_word) + 1)
Y = [entry.f for entry in list_word]
plt.title('pour CS276')
plt.xlabel('rang')
plt.ylabel('frequence')
plt.plot(X, Y)
plt.show()

In [None]:
from numpy import log, array
# Log-log version of the CS276 curve; X and Y are the values left in the
# namespace by the previous plotting cell.
X2, Y2 = log(array(X)), log(array(Y))
plt.plot(X2, Y2)
plt.title('pour CS276')
plt.xlabel('log(rang)')
plt.ylabel('log(frequence)')
plt.show()

### Index inversé

On crée des index inversés pour les corpus CACM et CS276 dans des fichiers tels que `inversed_index/cacm/cacm.output`

Des fichiers intermédiaires `inversed_index/cacm/cacm.output_x` sont générés pour alléger la mémoire

Chaque ligne contient l'id du token en première position, puis la liste des ids des documents qui contiennent ce token

In [None]:
import inversed_index
from time import time

#### CACM without NLTK (better performance)

In [None]:
# Give every CACM token a consecutive integer id, in iteration order.
cacm_dict_term = {}
for token in CACM_tokens:
    # setdefault leaves a token that already has an id untouched.
    cacm_dict_term.setdefault(token, len(cacm_dict_term))
i = len(cacm_dict_term)  # next unused term id

# Build the CACM inverted index and report how long it took.
# NOTE(review): 1000 is presumably the block size behind the intermediate
# files mentioned in the text above — confirm in inversed_index.
start = time()
inversed_index.index_inverse_global(
    CACM_documents,
    1000,
    "inversed_index/cacm/cacm.output",
    cacm_dict_term,
    type="CACM",
)
elapsed = time() - start
print("l'indexation de CACM a pris", elapsed, "secondes")

#### CS276 (supprimez les fichiers inversed_index/cs276/cs276.output pour les recalculer)

In [None]:
# Give every CS276 token a consecutive integer id, in iteration order.
cs276_dict_term = {}
for token in CS276_tokens:
    # setdefault leaves a token that already has an id untouched.
    cs276_dict_term.setdefault(token, len(cs276_dict_term))
i = len(cs276_dict_term)  # next unused term id

# The index is only rebuilt when its output file is missing.
cs276_index_missing = not os.path.exists("inversed_index/cs276/cs276.output")
if cs276_index_missing:
    start = time()
    inversed_index.index_inverse_global(
        CS276_docs,
        1000,
        "inversed_index/cs276/cs276.output",
        cs276_dict_term,
        type="CS276",
    )
    elapsed = time() - start
    print("l'indexation de CS276 a pris", elapsed, "secondes")

# Index inversé fréquentiel

On crée des index inversés fréquentiels pour les corpus CACM et CS276 dans des fichiers tels que `frequential_index/cacm/cacm.output`

Des fichiers intermédiaires `frequential_index/cacm/cacm.output_x` sont générés pour alléger la mémoire

Chaque ligne contient l'id du token en première position, puis la liste des couples document_id/fréquence

In [None]:
import frequency_index

#### CACM without NLTK (better performance)

In [None]:
# Build the CACM frequency index and report how long it took.
start = time()
frequency_index.index_inverse_global_with_frequency(
    CACM_documents,
    1000,
    "frequential_index/cacm/cacm.output",
    cacm_dict_term,
    type="CACM",
)
elapsed = time() - start
print("l'indexation fréquentielle de CACM a pris", elapsed, "secondes")

#### CS276

In [None]:
# The CS276 frequency index is only rebuilt when its output file is missing.
cs276_freq_index_missing = not os.path.exists("frequential_index/cs276/cs276.output")
if cs276_freq_index_missing:
    start = time()
    frequency_index.index_inverse_global_with_frequency(
        CS276_docs,
        1000,
        "frequential_index/cs276/cs276.output",
        cs276_dict_term,
        type="CS276",
    )
    elapsed = time() - start
    print("l'indexation fréquentielle de CS276 a pris", elapsed, "secondes")

## Modele booleen

La recherche booléenne prend en entrée une expression booléenne du type "token&token|(!token&token)". L'expression booléenne est convertie en notation polonaise pour prendre en compte efficacement les parenthèses. Le programme renvoie l'ensemble (set) des ids des documents qui satisfont l'expression, ainsi que le temps de calcul.

In [None]:
import boolean_model

#### CACM

In [None]:
# Sample boolean queries for CACM: & is AND, | is OR, ! is NOT.
boolean_queries = [
    "Algebraic&Language|(Preliminary&printer)",
    "Algebraic&!Language",
    "The&Secant&Method&for&Simultaneous&Nonlinear&Equations",
    "Binary&Representation|(Algebraic&Language)",
]

# Run each query and print it with its elapsed time and its result.
for query in boolean_queries:
    print()
    started_at = time()
    result = boolean_model.boolean_model(
        query,
        CACM_documents,
        cacm_dict_term,
        type="CACM",
    )
    print("query:", query)
    print("request time:", time() - started_at)
    print("result:", result)

#### CS276

In [None]:
from time import time

# Sample boolean queries for CS276: & is AND, | is OR, ! is NOT.
boolean_queries = [
    "contact&us&about&us&food&allergies&stanford&medicine",
    "contact&us&about&us&!food",
    "(climate&change)|(agricultural&adaptation)&!politic&!university",
]

# Run each query and print it with its elapsed time and its result.
for query in boolean_queries:
    print()
    started_at = time()
    result = boolean_model.boolean_model(
        query,
        CS276_docs,
        cs276_dict_term,
        type="CS276",
    )
    print("query:", query)
    print("request time:", time() - started_at)
    print("result:", result)

## Modele vectoriel

In [None]:
import vectorial_model
from importlib import reload
# Re-import the module so that edits made to vectorial_model.py after the
# first import are picked up without restarting the kernel.
vectorial_model = reload(vectorial_model)

#### CACM

In [None]:
# ponderation tf-idf
# Single-word query against CACM with tf-idf weighting; print the elapsed
# time and the first five entries of the result.
start = time()
found_documents = vectorial_model.modele_vectoriel(
    "paper",
    CACM_documents,
    cacm_dict_term,
    ponderation="tfidf",
    type="CACM",
)
elapsed = time() - start
print("request time", elapsed)
print(found_documents[:5])

In [None]:
# ponderation tf-idf normalisé
# Full-sentence query against CACM with normalized tf-idf weighting; print
# the elapsed time and the first five entries of the result.
start = time()
found_documents = vectorial_model.modele_vectoriel(
    "This paper describes an alternate method for summing a set of floating-point numbers.  Comparison of the error bound for this method with that of the standard summation method shows that it is considerably less sensitive to propagation of round-off error.",
    CACM_documents,
    cacm_dict_term,
    ponderation="tfidf_norm",
    type="CACM",
)
elapsed = time() - start
print("request time", elapsed)
print(found_documents[:5])

In [None]:
# ponderation frequence normalisé
# Full-sentence query against CACM with normalized-frequency weighting; print
# the elapsed time and the first five entries of the result.
start = time()
found_documents = vectorial_model.modele_vectoriel(
    "This paper describes an alternate method for summing a set of floating-point numbers.  Comparison of the error bound for this method with that of the standard summation method shows that it is considerably less sensitive to propagation of round-off error.",
    CACM_documents,
    cacm_dict_term,
    ponderation="norm_freq",
    type="CACM",
)
elapsed = time() - start
print("request time", elapsed)
print(found_documents[:5])

#### CS276

In [None]:
# Repeated-word query against CS276 with tf-idf weighting; print the elapsed
# time and the first five entries of the result.
start = time()
found_documents = vectorial_model.modele_vectoriel(
    "hello hello",
    CS276_docs,
    cs276_dict_term,
    ponderation="tfidf",
    type="CS276",
)
elapsed = time() - start
print("request time", elapsed)
print(found_documents[:5])

## Evaluation

### Performances

Temps de calcul donné dans les cellules correspondantes

index inversé CACM: 463 KB

index inversé CS276: 81,3 MB

fréquence index CACM: 659 KB

fréquence index CS276: 93,5 MB

In [None]:
# Evaluation query against CACM with tf-idf weighting; print the elapsed time
# and the complete result (not only the first entries).
start = time()
found_documents = vectorial_model.modele_vectoriel(
    "Accelerating Convergence of Iterative Processes",
    CACM_documents,
    cacm_dict_term,
    ponderation="tfidf",
    type="CACM",
)
elapsed = time() - start
print("request time", elapsed)
print(found_documents)

In [None]:
# Echo entry 1143 of the raw CACM collection as the cell's output.
# NOTE(review): presumably one of the documents returned by the query in the
# previous cell — confirm.
CACM_documents[1143]