In [21]:
from collections import Counter
from porter import stem
import re
import numpy as np

doc1='the new home has been saled on top forecasts'
doc2='the home sales rise in july'
doc3 ='there is an increase in home sales in july'
doc4='july encounter a new home sales rise'
mots_vides=['the', 'a','an', 'on', 'behind', 'under', 'there', 'in', 'on']

docs=[doc1,doc2,doc3,doc4]


### Q 1.1

In [38]:
def process_doc(doc): 
    doc = doc.lower()
    l = [stem(word) for word in doc.split(" ") if word not in mots_vides]
    counts = Counter(l)
    return counts
    

In [39]:
process_doc(doc1)

Counter({'new': 1,
         'home': 1,
         'ha': 1,
         'been': 1,
         'sale': 1,
         'top': 1,
         'forecast': 1})

### Q 1.2

In [124]:
def build_index(docs):
    """
        Construit la table index
    """
    
    d = {}
    for i, doc in enumerate(docs): 
        doc = doc.lower()
        l = [stem(word) for word in doc.split(" ") if word not in mots_vides]
        counts = Counter(l)
    return d

def build_index_inv(docs): 
    """
        Retourne la tablea index inversée
        # TODO : écrire à partir de build_index
    """
    d = {}   
    for i, doc in enumerate(docs): 
        doc = doc.lower()
        for word in doc.split(" "): 
            if word not in mots_vides: 
                word = stem(word)
                if word not in d: 
                    d[word] = {}
                d[word][i] = 1

    return d

In [125]:
corpus = [doc1, doc2, doc3, doc4]
index_base = build_index(corpus)
index_inv = build_index_inv(corpus)
print(index)
print()
print(index_inv)

{0: Counter({'new': 1, 'home': 1, 'ha': 1, 'been': 1, 'sale': 1, 'top': 1, 'forecast': 1}), 1: Counter({'home': 1, 'sale': 1, 'rise': 1, 'juli': 1}), 2: Counter({'is': 1, 'increas': 1, 'home': 1, 'sale': 1, 'juli': 1}), 3: Counter({'juli': 1, 'encount': 1, 'new': 1, 'home': 1, 'sale': 1, 'rise': 1})}

{'new': {0: 1, 3: 1}, 'home': {0: 1, 1: 1, 2: 1, 3: 1}, 'ha': {0: 1}, 'been': {0: 1}, 'sale': {0: 1, 1: 1, 2: 1, 3: 1}, 'top': {0: 1}, 'forecast': {0: 1}, 'rise': {1: 1, 3: 1}, 'juli': {1: 1, 2: 1, 3: 1}, 'is': {2: 1}, 'increas': {2: 1}, 'encount': {3: 1}}


### Q 1.3 : 


#### Note perso : Créer une classe pour regrouper l'intégralité des méthodes 

In [115]:
def build_index_tfidf(docs, index_inv, index, norm=True): 
    """
        Retourne la table index avec un score tfidf 
        Argument: 
            docs : corpus 
            index_inv : index_inv avec les occurences 
            index : index avec les occurences 
            norm : ajout ou non de normalisation 
    """
    d = {}
    for i, doc in enumerate(docs): # boucle 1
        d[i] = {}
        doc = doc.lower()
        norm_cst = 0
        for word in index_inv: # boucle 2
            tfidf = (index[i][word]/len(index[i])) * np.log((len(docs)/len(index_inv[word])))
            d[i][word] = tfidf 
            norm_cst += tfidf 
                
        if norm: 
            for word in d[i]: 
                d[i][word] = d[i][word] / norm_cst
    return d


def build_index_inv_tfidf(docs, index_tfidf): 
    """
        Construction de l'index inverse avec tfidf
        Arg: 
            docs : corpus 
            index_tfidf : index tfidf
    """
    d = {}
    for word in index_tfidf[0]: 
        d[word] = {}
        for doc_idx in index_tfidf:
            d[word][doc_idx] = index_tfidf[doc_idx][word]
                        
    return d

In [123]:
index_tfidf = build_index_tfidf(corpus, index_inv, index_base)
inverse_tfidf = build_index_inv_tfidf(corpus, index_tfidf)
print(index_tfidf)
print()
print(index_tfidf)

{0: {'new': 0.1111111111111111, 'home': 0.0, 'ha': 0.2222222222222222, 'been': 0.2222222222222222, 'sale': 0.0, 'top': 0.2222222222222222, 'forecast': 0.2222222222222222, 'rise': 0.0, 'juli': 0.0, 'is': 0.0, 'increas': 0.0, 'encount': 0.0}, 1: {'new': 0.0, 'home': 0.0, 'ha': 0.0, 'been': 0.0, 'sale': 0.0, 'top': 0.0, 'forecast': 0.0, 'rise': 0.7066950526114237, 'juli': 0.2933049473885762, 'is': 0.0, 'increas': 0.0, 'encount': 0.0}, 2: {'new': 0.0, 'home': 0.0, 'ha': 0.0, 'been': 0.0, 'sale': 0.0, 'top': 0.0, 'forecast': 0.0, 'rise': 0.0, 'juli': 0.09400543015696614, 'is': 0.4529972849215169, 'increas': 0.4529972849215169, 'encount': 0.0}, 3: {'new': 0.2264986424607585, 'home': 0.0, 'ha': 0.0, 'been': 0.0, 'sale': 0.0, 'top': 0.0, 'forecast': 0.0, 'rise': 0.2264986424607585, 'juli': 0.09400543015696614, 'is': 0.0, 'increas': 0.0, 'encount': 0.452997284921517}}

{0: {'new': 0.1111111111111111, 'home': 0.0, 'ha': 0.2222222222222222, 'been': 0.2222222222222222, 'sale': 0.0, 'top': 0.222222

## Exercice 2

### Q 2.2

In [171]:
def bool_score(q, index_inv, nb_doc): 
    """
        retourne la liste des documents contenant les mots de le requête
    """
    d = {k:0 for k in range(nb_doc)}
    q = transfo_and_split(q)
    
    for word in q: 
        doc_word = index_inv[word]
        for doc in doc_word: 
            d[int(doc)] += 1
    
    for k in d: 
        if d[k] == len(q): 
            d[k] = 1
        else: 
            d[k] = 0
        
    return d
        
        
bool_score("home sales top", index_inv, 4)

{0: 1, 1: 0, 2: 0, 3: 0}

### Q 2.3

In [176]:
def vector_score(q, index_inv): 
    q = transfo_and_split(q)
    result = {}
    for word in q: 
        for doc, score in index_inv[word].items(): 
            print(f"Pour le mot : {word} | {(doc, score)}")
            if doc not in result:
                result[doc] = score
            else: 
                result[doc] += score
    
    result = dict(sorted(result.items(), key=lambda val : val[1], reverse=True))
    return result
        

In [177]:
vector_score("home sales top", inverse_tfidf)

Pour le mot : home | (0, 0.0)
Pour le mot : home | (1, 0.0)
Pour le mot : home | (2, 0.0)
Pour le mot : home | (3, 0.0)
Pour le mot : sale | (0, 0.0)
Pour le mot : sale | (1, 0.0)
Pour le mot : sale | (2, 0.0)
Pour le mot : sale | (3, 0.0)
Pour le mot : top | (0, 0.2222222222222222)
Pour le mot : top | (1, 0.0)
Pour le mot : top | (2, 0.0)
Pour le mot : top | (3, 0.0)


{0: 0.2222222222222222, 1: 0.0, 2: 0.0, 3: 0.0}