In [1]:
text = 'Originally, vegetables were collected from the wild by hunter-gatherers. Vegetables are all plants. Vegetables are eaten either raw or cooked.'
question = 'What are vegetables?'

In [2]:
import nltk

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemma_me(sent):
    sent_tokens = nltk.word_tokenize(sent.lower())
    pos_tags = nltk.pos_tag(sent_tokens)

    sentence_lemmas = []

    for token, pos_tag in zip(sent_tokens, pos_tags):
        if pos_tag[1][0].lower() in ['n', 'v', 'a', 'r']:
            lemma = lemmatizer.lemmatize(token, pos_tag[1][0].lower())
            sentence_lemmas.append(lemma)

    return sentence_lemmas

In [3]:
nltk.download('punkt')
sentence_tokens = nltk.sent_tokenize(text)
sentence_tokens.append(question)
sentence_tokens


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Originally, vegetables were collected from the wild by hunter-gatherers.',
 'Vegetables are all plants.',
 'Vegetables are eaten either raw or cooked.',
 'What are vegetables?']

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [5]:
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [6]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
tv = TfidfVectorizer(tokenizer=lemma_me)

In [8]:
tf = tv.fit_transform(sentence_tokens)

In [9]:
tf

<4x8 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [10]:
tf.toarray()

array([[0.27717414, 0.53114624, 0.        , 0.        , 0.53114624,
        0.53114624, 0.        , 0.27717414],
       [0.41988018, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.8046125 , 0.41988018],
       [0.32713399, 0.        , 0.62688384, 0.62688384, 0.        ,
        0.        , 0.        , 0.32713399],
       [0.70710678, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.70710678]])

In [11]:
##import pandas

##df = pandas.DataFrame(tf.toarray(), columns=tv.get_feature_names_out())

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
values = cosine_similarity(tf[-1], tf)
values


array([[0.39198343, 0.59380024, 0.46263733, 1.        ]])

In [13]:
index = values.argsort()[0][-2]
index

1

In [14]:
values_flat = values.flatten()
values_flat.sort()
values_flat

array([0.39198343, 0.46263733, 0.59380024, 1.        ])

In [15]:
coeff = values_flat[-2]
coeff

0.593800244493221

In [16]:
if coeff > 0.3:
    print(sentence_tokens[index])

Vegetables are all plants.


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=18c33705-2c9c-422d-be07-56fe6239dac2' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>