# Projet Search-Engine - Fondement de la Recherche d'Information-WEB

## Requirements installation
Run:
```
pip3 install -r requirements.txt
```

In [3]:
def read_cacm():
    file_name = "./Data/CACM/cacm.all"
    file = open(file_name)
    documents = []
    for line in file.readlines():
        curent_section = -1
        if line[0] == '.':
            if line[1] == 'I':
                documents.append(["", "", ""])
            elif line[1] == 'T':
                current_section = 0
            elif line[1] == 'W':
                current_section = 1
            elif line[1] == 'K':
                current_section = 2
            else:
                current_section = -1
        else:
            if current_section >= 0:
                if len(documents[-1][current_section]) == 0:
                    documents[-1][current_section] += line.strip('\n')
                else:
                    documents[-1][current_section] += " " + line.strip('\n')
    file.close()  
    return documents

In [35]:
documents = read_cacm()
print(documents[1204])

['An Undergraduate Program in Computer Science-Preliminary Recommendations', '', '']


In [5]:
def read_forbidden_words():
    file_name = "./Data/CACM/common_words"
    file = open(file_name)
    words = []
    for word in file.readlines():
        words.append(word.strip().lower())
    file.close()
    return words

In [74]:
import nltk
nltk.download('punkt')
def tokenize(documents):
    d = {}
    forbidden_words = read_forbidden_words()
    for i in range(len(documents)):
        document = documents[i]
        for text in document:
            if len(text) > 0:
                tokens = nltk.word_tokenize(text)
                for token in tokens:
                    if token.lower() not in forbidden_words and len(token) > 0:
                        if token.lower() in d:
                            d[token.lower()].append(i+1)
                        else:
                            d[token.lower()] = [i+1]
    return d

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\micka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [75]:
import re
def tokenize_no_nltk(documents):
    d = {}
    forbidden_words = read_forbidden_words()
    for i in range(len(documents)):
        document = documents[i]
        for text in document:
            if len(text) > 0:
                tokens = re.compile("[^0-9a-zA-Z]").split(text)
                for token in tokens:
                    if token.lower() not in forbidden_words and len(token) > 0:
                        if token.lower() in d:
                            d[token.lower()].append(i+1)
                        else:
                            d[token.lower()] = [i+1]
    return d

In [76]:
print(tokenize(documents))
print(tokenize_no_nltk(documents))

{'preliminary': [1, 254, 825, 894, 1235, 1726, 1771, 1946, 2050, 2065, 2163, 2181, 2389, 2398, 2556, 2718, 2929, 2970, 2972], 'report-international': [1], 'algebraic': [1, 21, 54, 55, 93, 99, 284, 393, 964, 964, 1029, 1214, 1214, 1216, 1223, 1253, 1253, 1258, 1334, 1365, 1394, 1394, 1397, 1397, 1397, 1397, 1453, 1453, 1471, 1543, 1589, 1803, 1806, 1824, 1975, 2054, 2090, 2090, 2090, 2164, 2165, 2165, 2165, 2166, 2166, 2167, 2167, 2167, 2167, 2167, 2167, 2170, 2323, 2323, 2323, 2323, 2323, 2547, 2645, 2719, 2802, 2809, 2931, 2932, 2958, 2958, 3031, 3031, 3031, 3071, 3077, 3077, 3078, 3189, 3189, 3189, 3199, 3199, 3199, 3202, 3203, 3203], 'language': [1, 54, 82, 82, 93, 99, 123, 144, 196, 205, 208, 224, 224, 254, 254, 265, 292, 300, 300, 321, 321, 321, 321, 407, 464, 464, 464, 584, 616, 616, 616, 616, 616, 637, 637, 644, 644, 644, 644, 644, 654, 655, 655, 655, 658, 679, 679, 679, 679, 681, 691, 691, 731, 731, 761, 763, 795, 800, 829, 830, 889, 892, 892, 949, 982, 982, 987, 990, 1012, 101

{'preliminary': [1, 254, 825, 894, 1205, 1235, 1726, 1771, 1946, 2050, 2065, 2163, 2181, 2389, 2398, 2556, 2718, 2929, 2970, 2972], 'report': [1, 65, 146, 147, 196, 254, 321, 321, 321, 329, 329, 462, 584, 599, 601, 616, 616, 616, 616, 616, 616, 637, 644, 644, 675, 689, 691, 721, 724, 946, 947, 984, 985, 1025, 1051, 1086, 1236, 1238, 1325, 1349, 1416, 1416, 1476, 1476, 1531, 1659, 1765, 1771, 1771, 1846, 1889, 1927, 1937, 2046, 2048, 2154, 2181, 2198, 2198, 2226, 2283, 2380, 2389, 2479, 2479, 2522, 2527, 2538, 2583, 2593, 2689, 2689, 2777, 2852, 2930, 2979, 3099, 3130, 3160, 3160, 3160, 3160, 3160, 3160, 3184, 3184], 'international': [1, 99, 414, 690, 975, 1362, 1476, 1476, 2875, 3184], 'algebraic': [1, 21, 44, 54, 55, 93, 99, 284, 393, 964, 964, 1029, 1214, 1214, 1216, 1223, 1253, 1253, 1258, 1334, 1365, 1394, 1394, 1397, 1397, 1397, 1397, 1453, 1453, 1471, 1543, 1589, 1803, 1806, 1824, 1975, 2054, 2090, 2090, 2090, 2164, 2165, 2165, 2165, 2166, 2166, 2167, 2167, 2167, 2167, 2167, 2167

In [77]:
d = tokenize_no_nltk(documents)
print("Nombre de tokens :")
T1 = sum([len(L) for L in d])
print(T1)
print("Vocabulaire : ")
M1 = len(d)
print(M1)
print(d.keys())

Nombre de tokens :
73126
Vocabulaire : 
9496
dict_keys(['preliminary', 'report', 'international', 'algebraic', 'language', 'extraction', 'roots', 'repeated', 'subtractions', 'digital', 'computers', 'techniques', 'department', 'matrix', 'program', 'schemes', 'glossary', 'computer', 'engineering', 'programming', 'terminology', 'square', 'root', 'approximations', 'inspection', 'procedures', 'equivalence', 'transformation', 'proposal', 'uncol', 'problem', 'communication', 'changing', 'machines', 'proposed', 'solution', 'part', '2', 'error', 'estimation', 'runge', 'kutta', '1', 'recursive', 'curve', 'fitting', 'technique', 'secant', 'modification', 'newton', 'method', 'arithmetic', 'operations', 'simple', 'automatic', 'coding', 'systems', 'accelerating', 'convergence', 'iterative', 'processes', 'discussed', 'applied', 'procedure', 'equation', 'accelerates', 'rate', 'iteration', 'converges', 'induces', 'diverges', 'illustrative', 'formulation', 'flow', 'diagrams', 'unusual', 'applications', 

In [78]:
def read_half_cacm():
    documents = read_cacm()
    return documents[:len(documents)//2]


In [79]:
d_half = tokenize_no_nltk(read_half_cacm())
print("Nombre de tokens :")
T2 = sum([len(L) for L in d_half])
print(T2)
print("Vocabulaire : ")
M2 = len(d_half)
print(M2)

Nombre de tokens :
40262
Vocabulaire : 
5294


In [80]:
from math import log, pow
b = log(M1/M2)/log(T1/T2)
k = M1/(pow(T1, b))
print("K = {}, b = {}".format(k, b))

K = 0.16412801642665256, b = 0.9790887476936934


In [81]:
print('Pour 1 million de tokens, vocabulaire :')
print(int(k * pow(1e6, b)))

Pour 1 million de tokens, vocabulaire :
122946


In [82]:
import re
def word_frequency(documents):
    freq = {}
    d = tokenize_no_nltk(documents)
    words = d.keys()
    for word in words:
        freq[word] = len(d[word])
    return freq
    

In [83]:
freq = word_frequency(documents)
print(freq)

{'preliminary': 20, 'report': 86, 'international': 10, 'algebraic': 83, 'language': 774, 'extraction': 14, 'roots': 55, 'repeated': 7, 'subtractions': 3, 'digital': 154, 'computers': 173, 'techniques': 259, 'department': 14, 'matrix': 272, 'program': 843, 'schemes': 24, 'glossary': 7, 'computer': 1216, 'engineering': 38, 'programming': 790, 'terminology': 19, 'square': 58, 'root': 45, 'approximations': 40, 'inspection': 5, 'procedures': 185, 'equivalence': 16, 'transformation': 80, 'proposal': 21, 'uncol': 2, 'problem': 524, 'communication': 101, 'changing': 14, 'machines': 62, 'proposed': 167, 'solution': 243, 'part': 105, '2': 166, 'error': 243, 'estimation': 21, 'runge': 14, 'kutta': 18, '1': 217, 'recursive': 71, 'curve': 69, 'fitting': 55, 'technique': 266, 'secant': 5, 'modification': 36, 'newton': 44, 'method': 722, 'arithmetic': 158, 'operations': 153, 'simple': 196, 'automatic': 196, 'coding': 77, 'systems': 706, 'accelerating': 4, 'convergence': 28, 'iterative': 41, 'processe

In [88]:
class Token_sorter():
    def __init__(self, word, frequency):
        self.word = word
        self.f = frequency
    def __lt__(self, other):
        return self.f > other.f
    def __repr__(self):
        print(self.word, self.f)
list_word = [Token_sorter(word, freq[word]) for word in freq.keys()]
list_word.sort()
print(list_word)

algorithm 1621


TypeError: __repr__ returned non-string (type NoneType)