https://towardsdatascience.com/lovecraft-with-natural-language-processing-part-3-tf-idf-vectors-8c2d4df98621

In [1]:
import spacy

In [12]:
# Load the "en_core_web_sm" spaCy pipeline once; the resulting `nlp` callable is
# reused by the cells below to parse text.
nlp = spacy.load("en_core_web_sm")

In [8]:
# Parse one example sentence so its tokens can be inspected in the following cells.
doc = nlp("At midnight the doorbell rang, startling him fearfully.")

In [9]:
# Show what kind of object calling the pipeline returns.
type(doc)

spacy.tokens.doc.Doc

In [10]:
# One line per token: surface text, lemma, and coarse part-of-speech tag.
for tok in doc:
    print(" ".join((tok.text, tok.lemma_, tok.pos_)))

At at ADP
midnight midnight NOUN
the the DET
doorbell doorbell PROPN
rang rang PROPN
, , PUNCT
startling startle VERB
him -PRON- PRON
fearfully fearfully ADV
. . PUNCT


In [13]:
# Render the parsed sentence with displaCy's "ent" style.
spacy.displacy.render(doc, style="ent")

In [2]:
# Three tiny documents that share some vocabulary, used to illustrate TF-IDF.
example_corpus = [
    "Monsters are bad.",
    "I saw a monster yesterday.",
    "Why are we talking about bad monsters?",
]

In [8]:
def spacy_tokenizer(document):
    """Split ``document`` into the lemmas that should become TF-IDF terms.

    The text is parsed with the notebook-level ``nlp`` pipeline. A token's
    lemma is kept only if the token is not a stop word, is not punctuation,
    and its lemma is not empty or whitespace-only.

    Args:
        document: Raw text of a single document.

    Returns:
        list[str]: The retained lemmas, in the order the tokens appear.
    """
    return [
        token.lemma_
        for token in nlp(document)
        if not token.is_stop
        and not token.is_punct
        and token.lemma_.strip() != ''
    ]

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [10]:
# The custom spaCy tokenizer replaces scikit-learn's regex tokenisation, so the
# default token_pattern is never used; setting it to None avoids the
# "token_pattern will not be used" UserWarning raised at fit time.
tfidf_vector = TfidfVectorizer(
    input='content',
    tokenizer=spacy_tokenizer,
    token_pattern=None,
)

In [13]:
# Fit the vectorizer on the example corpus and transform it in one step;
# `result` holds one row per document (it is densified further down).
result = tfidf_vector.fit_transform(example_corpus)

In [14]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
# list(...) keeps the displayed value a plain list either way.
list(
    tfidf_vector.get_feature_names_out()
    if hasattr(tfidf_vector, "get_feature_names_out")
    else tfidf_vector.get_feature_names()
)

['bad', 'monster', 'see', 'talk', 'yesterday']

In [17]:
# Densify the TF-IDF matrix so it can be shown as a table.
dense = result.todense()
denselist = dense.tolist()
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() when available and fall back on older releases.
feature_names = (
    tfidf_vector.get_feature_names_out()
    if hasattr(tfidf_vector, "get_feature_names_out")
    else tfidf_vector.get_feature_names()
)
# One row per document, one column per vocabulary term.
df = pd.DataFrame(denselist, columns=feature_names)

In [18]:
# Display the TF-IDF table: one row per document, one column per term.
df

Unnamed: 0,bad,monster,see,talk,yesterday
0,0.789807,0.613356,0.0,0.0,0.0
1,0.0,0.385372,0.652491,0.0,0.652491
2,0.547832,0.425441,0.0,0.720333,0.0


In [19]:
from sklearn.metrics.pairwise import linear_kernel

In [21]:
# A single linear_kernel call on the whole matrix returns every pairwise dot
# product, so the table works for any corpus size instead of a hard-coded 3.
# TfidfVectorizer L2-normalises each row by default, which makes these dot
# products equal to cosine similarities.
# Rows and columns are labelled 0..n-1 by document position.
cos_df = pd.DataFrame(linear_kernel(result, result))

In [22]:
# Display the pairwise similarity table; row and column labels are document positions.
cos_df

Unnamed: 0,0,1,2
0,1.0,0.23637,0.693628
1,0.23637,1.0,0.163953
2,0.693628,0.163953,1.0
