## Motore di ricerca

In [23]:
import pandas as pd
import numpy as np
import os
from tqdm.notebook import tqdm

In [5]:
folder = "/Users/Flint/Data/recipes/textit"
docids = []
docs = []
for filename in os.listdir(folder):
    if filename.endswith('.txt'):
        docid = filename.replace(".txt", "").split("_")[1]
        docids.append(int(docid))
        with open(os.path.join(folder, filename), 'r') as infile:
            text = infile.read()
            docs.append(text)

### 1. Tokenizzazione

#### Opzione 1: espressioni regolari

In [8]:
from nltk.tokenize import word_tokenize

In [11]:
wt_docs = []
for doc in docs:
    tokens = word_tokenize(doc.lower(), language='italian')
    wt_docs.append(tokens)

#### Opzione 2: tokenizzazione linguistica

In [12]:
import spacy

In [13]:
nlp = spacy.load('it_core_news_lg')

In [17]:
def spacy_tokenizer(text, skip=None):
    tokens = []
    for token in nlp(text.lower()):
        if skip is not None and token.pos_ in skip:
            pass 
        else:
            tokens.append(token.lemma_)
    return tokens

In [25]:
st_docs = []
for doc in tqdm(docs):
    tokens = spacy_tokenizer(doc.lower(), skip=['ADP', 'CONJ', 'SPACE', 'PUNCT', 'DET'])
    st_docs.append(tokens)


  0%|          | 0/5933 [00:00<?, ?it/s]

### Indicizzazione

In [27]:
from collections import Counter

In [34]:
def indexing(tokenized_docs, index_ids):
    l_index = []
    for tokens in tokenized_docs:
        bow = dict(Counter(tokens).most_common())
        l_index.append(bow)
    I = pd.DataFrame(l_index, index=index_ids).fillna(0).T
    return I 

In [35]:
W = indexing(wt_docs, docids)
S = indexing(st_docs, docids)

In [37]:
print(f"tokenizzazione non normalizzata: {W.shape[0]}")
print(f"tokenizzazione normalizzata: {S.shape[0]}")

tokenizzazione non normalizzata: 20753
tokenizzazione normalizzata: 18058


In [38]:
S.head()

Unnamed: 0,1994,2245,810,4634,4152,2523,4146,3629,2537,5258,...,5241,3630,1027,1033,5255,3624,3142,5533,809,1755
e,9.0,1.0,11.0,6.0,9.0,2.0,5.0,3.0,8.0,3.0,...,7.0,20.0,14.0,7.0,7.0,12.0,1.0,4.0,10.0,6.0
g,6.0,6.0,10.0,4.0,6.0,0.0,8.0,2.0,8.0,7.0,...,4.0,8.0,6.0,4.0,6.0,8.0,3.0,9.0,7.0,4.0
tarallo,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
dolce,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
farina,4.0,2.0,7.0,4.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,4.0,0.0,0.0,2.0,3.0,0.0,2.0,0.0,3.0


### Ricerca

In [44]:
q = "uova strapazzate con bacon e krls"
q_tokens = spacy_tokenizer(q, skip=['ADP', 'CONJ', 'SPACE', 'PUNCT', 'DET'])
q_bow = dict(Counter(q_tokens).most_common())
q_bow

{'uovo': 1, 'strapazzato': 1, 'bacon': 1, 'e': 1, 'Krls': 1}

In [45]:
q_vector = []
for token in S.index.values:
    if token in q_bow.keys():
        q_vector.append(q_bow[token])
    else:
        q_vector.append(0)
q_vector = np.array(q_vector)
example = pd.Series(q_vector, index=S.index)
example.sort_values(ascending=False)

e                    1
strapazzato          1
bacon                1
uovo                 1
schiacciatere li     0
                    ..
1250                 0
ridue cet ne         0
tagliare ne          0
dapprima             0
gnocchi.preparate    0
Length: 18058, dtype: int64

In [46]:
from sklearn.metrics.pairwise import cosine_similarity

In [50]:
sigma = cosine_similarity(np.array([q_vector]), S.T)

  ret = a @ b
  ret = a @ b
  ret = a @ b


In [55]:
ranking = pd.Series(sigma[0], index=S.columns)

In [57]:
ranking.sort_values(ascending=False)

1947    0.560576
3043    0.519615
1864    0.510527
4717    0.491539
5748    0.457116
          ...   
2760    0.000000
315     0.000000
5829    0.000000
2898    0.000000
276     0.000000
Length: 5933, dtype: float64

In [61]:
W[1994].sort_values(ascending=False)

di            19.0
e              9.0
,              8.0
per            6.0
un             6.0
              ... 
5mm            0.0
23cm           0.0
3cm            0.0
placchette     0.0
emergere       0.0
Name: 1994, Length: 20753, dtype: float64