# TP_PROJ - Moteur de recherche
#### IMBROGNO Tino - CADOT Firmin

In [65]:
from tqdm import tqdm

In [66]:
data = open("CISI.ALLnettoye","r").readlines()
data=[i[:len(i)-1] for i in data] #Enleve le '\n' de chaque ligne

La collection réunit l'ensemble des documents dans un dictionnaire avec pour clé: le titre du document | valeur: liste des lignes liées au contenu du document

Les document sont un dictionnaire ayant comme clé: le numéro de document | valeur: titre + contenu

Les titres sont contenus dans un dictionnaire ayant pour clé: le numéro du document | valeur: le titre du document associé  

Les contenus (clé:numéro document | valeur:contenu du doc)

In [67]:
collection={}
document={}
titre={}
contenu={}

for line in range(len(data)):
    if data[line][:3]==".I ":
        doc=[]
        content=[]
        numDoc = int(data[line][3:])
    elif data[line-1][:3]==".I ":
        titre[numDoc]=data[line]
        doc.append(data[line])
    else:
        content.append(data[line])
        doc.append(data[line])
        contenu[numDoc]=content
        document[numDoc]=doc
        collection[titre[numDoc]]=content

In [68]:
print(titre[1],'\n')
print(contenu[1],'\n')
print(document[1],'\n')
print(list(collection.items())[0],'\n')

18 Editions of the Dewey Decimal Classifications 

['   The present study is a history of the DEWEY Decimal', 'Classification.  The first edition of the DDC was published', 'in 1876, the eighteenth edition in 1971, and future editions', "will continue to appear as needed.  In spite of the DDC's", 'long and healthy life, however, its full story has never', 'been told.  There have been biographies of Dewey', 'that briefly describe his system, but this is the first', 'attempt to provide a detailed history of the work that', 'more than any other has spurred the growth of', 'librarianship in this country and abroad.'] 

['18 Editions of the Dewey Decimal Classifications', '   The present study is a history of the DEWEY Decimal', 'Classification.  The first edition of the DDC was published', 'in 1876, the eighteenth edition in 1971, and future editions', "will continue to appear as needed.  In spite of the DDC's", 'long and healthy life, however, its full story has never', 'been told.  There

In [69]:
vocabulary = []

In [70]:
def wordOccurrence(dc: dict):
    from nltk.tokenize import word_tokenize
    from nltk.corpus import stopwords

    print("Loading for Word Occurrence")
    stopwords = stopwords.words('english')
    stopwords.extend(['from', 'subject', 're', 'edu', 'use'])
    d=[[word_tokenize(line) for line in dc[i]] for i in dc.keys()] #Tokenization of all line of all documents in the collection
    wordOccurrenceDoc = {}
    for doc in tqdm(range(len(d))):
        voc = []
        wordOccurrenceDoc[doc+1]=dict()
        for line in d[doc]:
            for word in line:
                wordLow = word.lower()
                if len(wordLow)>1 and wordLow not in stopwords:
                    voc.append(wordLow)
                    wordOccurrenceDoc[doc+1][wordLow]=0
        vocabulary.append(voc)

    for doc in tqdm(range(len(d))):
        for line in d[doc]:
            for word in line:
                wordLow = word.lower()
                if wordLow in wordOccurrenceDoc[doc+1].keys():
                    wordOccurrenceDoc[doc+1][wordLow] += 1
    
    return wordOccurrenceDoc 

### d[i] : List of lines from the document "i"
### d[i][j] : List of word from the line "j" of the document "i"
### d[i][j][k] : Word in place "k" from the line "j" of the document "i"

In [71]:
from tqdm import tqdm

def TF(doc,maxi):
    print("Loading for TF")
    # maxi : Nb de docs analysés (maximum = N)
    tfD={}
    for i in tqdm(range(1,maxi+1)):
        tfDict = {}
        bagOfWordsCount = len(doc[i].keys())
        for word, count in doc[i].items():
            tfDict[word] = float("{:.15f}".format(count / float(bagOfWordsCount)))
        tfD[i] = tfDict
    return tfD

In [72]:
import math
from tqdm import tqdm
from nltk.tokenize import word_tokenize
inverseDoc = {}
def IDF(docu,wOccu,maxi):
    print("Loading for IDF")
    # maxi : Nb de docs analysés (maximum = N)
    d=[[word_tokenize(line) for line in docu[i]] for i in docu.keys()] #Tokenization of all line of all documents in the collection
    N = len(wOccu)
    sac_of_words={}
    words={}
    for i in tqdm(range(1,maxi+1)):
        wordDict={}
        for w in wOccu[i].keys():
            if w in sac_of_words.keys():
                wordDict[w] = sac_of_words[w]
            else:
                wordDict[w] = 0
                inverseDoc[w] = []
                for doc in d[:maxi]:
                    isPresent=0
                    for line in doc:
                        for word in line:
                            if w==word.lower():
                                isPresent=1
                                wordDict[w] += 1
                                inverseDoc[w].append(d.index(doc)+1)
                            if isPresent:
                                break
                        if isPresent:
                            break
                wordDict[w] = math.log(maxi/wordDict[w])
                sac_of_words[w] = wordDict[w]
        words[i]=wordDict
    return words

In [73]:
def TFIDF(tfD, idfD):
    print("Loading for TF-IDF")
    maxi = len(tfD)
    if maxi != len(idfD):
        print("Pas le même nombre de doc")
    else:
        maxi += 1
        tfidf = {}
        for i in range(1,maxi):
            tfidf_inter = {}
            for word, val in tfD[i].items():
                tfidf_inter[word] = val * idfD[i][word]
            tfidf[i] = tfidf_inter
        return tfidf

In [74]:
def sortDict(dict):
    dictionary={}
    for i in dict.keys():
        dico={}
        for k, v in sorted(dict[i].items(), key=lambda x: -x[1]):
            dico[k] = v
        dictionary[i] = dico
    return dictionary

In [76]:
wO = wordOccurrence(document)
tf = TF(wO,len(wO))

Loading for Word Occurrence


100%|██████████| 1461/1461 [00:00<00:00, 2188.30it/s]
100%|██████████| 1461/1461 [00:00<00:00, 13511.16it/s]


Loading for TF


100%|██████████| 1461/1461 [00:00<00:00, 12806.30it/s]


In [79]:
idf = IDF(document,wO,len(wO))

Loading for IDF


  1%|          | 11/1461 [00:18<39:46,  1.65s/it]


KeyboardInterrupt: 

In [78]:
tfidf = sortDict(TFIDF(tf,idf))
tfidf[1]['dewey']

Loading for TF-IDF
Pas le même nombre de doc


AttributeError: 'NoneType' object has no attribute 'keys'

In [77]:
for key in inverseDoc.keys():
    dico_intermediaire={}
    for doc in inverseDoc[key]:
        dico_intermediaire[doc]=tfidf[doc][key]
    inverseDoc[key]=dico_intermediaire
inverseDoc = sortDict(inverseDoc)
inverseDoc["large"]

KeyError: 'large'

In [82]:
vocabulary[1460]

['essai', 'test', 'essai', 'try', 'hard', 'bonjour', 'france']

In [None]:
from pprint import pprint
import gensim

num_topics = 10

commonDict = gensim.corpora.Dictionary(vocabulary)

corp = [commonDict.doc2bow(text) for text in vocabulary]

lda = gensim.models.LdaMulticore(corp, num_topics)

#lda_model = gensim.models.Doc2Vec(vocabulary)

In [30]:
vocabulary[0][corp[0][17][0]]

'edition'

In [84]:
def removeStopWord(self):
    from nltk.corpus import stopwords
    import string
    stopwords = stopwords.words('english')
    stopwords.extend(['from', 'subject', 're', 'edu', 'use'])
    return [w.translate(str.maketrans('','',string.punctuation)) for w in self if not w in stopwords]

In [85]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

documents = [TaggedDocument(doc, [i+1]) for i, doc in enumerate(vocabulary)]
model = Doc2Vec(documents, vector_size=1000, window=1, min_count=1, workers=4)

In [86]:
model.build_vocab(documents)

In [87]:
model.train(documents,epochs = 100,total_examples=model.corpus_count)

In [88]:
model.corpus_count

1461

In [114]:
recherche = "Computerized information systems in fields related to chemistry."
tokens = removeStopWord(recherche.lower().split())
print(tokens)
new_vector = model.infer_vector(tokens)

['librarians', 'decision', 'include', 'book', 'librarycollection', 'act', 'book', 'selection', 'censorship', 'there', 'fact', 'discernible', 'difference', 'two', 'terms', 'book', 'selection', 'andcensorship', 'topic', 'discussed', 'lucidly', 'ably', 'long', 'ago', 'bylester', 'asheim', 'become', 'classic', 'essay', 'literature', 'oflibrarianship', 'censorship', 'selection', 'raising', 'againmay', 'appear', 'exercise', 'redundancy']


In [115]:
model.dv.most_similar([new_vector])

[(238, 0.7939483523368835),
 (651, 0.5160901546478271),
 (306, 0.4982931911945343),
 (33, 0.48379889130592346),
 (865, 0.4549260139465332),
 (1440, 0.4510200321674347),
 (234, 0.4472435712814331),
 (939, 0.4373336136341095),
 (280, 0.4372183382511139),
 (370, 0.43611615896224976)]

In [None]:
import pandas as pd

df = pd.DataFrame(inverseDoc)
df