In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
text=[
 "NLP unlocks insights, NLP evolving.",
 "ML learns patterns; ML applications abound.",
 "NLP preprocessing crucial for NLP understanding."
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)
print(vectorizer.get_feature_names_out())
# TF-IDF-Vektordaten ausgeben
print(X.toarray())

['abound' 'applications' 'crucial' 'evolving' 'for' 'insights' 'learns'
 'ml' 'nlp' 'patterns' 'preprocessing' 'understanding' 'unlocks']
[[0.         0.         0.         0.43381609 0.         0.43381609
  0.         0.         0.65985664 0.         0.         0.
  0.43381609]
 [0.35355339 0.35355339 0.         0.         0.         0.
  0.35355339 0.70710678 0.         0.35355339 0.         0.
  0.        ]
 [0.         0.         0.39798027 0.         0.39798027 0.
  0.         0.         0.60534851 0.         0.39798027 0.39798027
  0.        ]]


In [3]:
import spacy
nlp = spacy.load("de_core_news_lg")

# Unicode String (u)
banana_vector = nlp(u'Banana').vector
print(banana_vector)

[-2.8751e+00 -3.0886e+00 -6.5668e-02  7.6071e-01 -1.0454e+00  1.4128e+00
  2.9616e+00 -1.0241e+00 -3.2125e+00  2.4650e+00  1.8139e+00 -1.3931e+00
 -1.0748e+00 -1.0771e+00 -4.1786e-01  2.2618e+00 -1.6344e+00 -6.6468e-01
 -2.2136e+00  1.2849e+00  1.4909e+00  3.6191e+00 -2.1748e+00  1.4585e+00
  2.1528e+00 -8.1199e-02 -1.2381e+00 -3.3737e+00 -4.0432e-01 -2.3723e-01
 -8.0084e-01  9.4969e-01 -4.6358e-02  5.2613e-01 -8.0498e-01 -1.2646e+00
 -7.0785e-01  5.4351e-01 -9.9471e-01 -2.1358e+00 -1.2586e+00 -2.6932e+00
 -1.8820e-01 -7.2355e-01 -1.1235e+00 -4.6935e-01 -2.2595e-01 -4.1580e-01
 -1.3651e+00 -5.4309e-02  1.3478e+00 -1.5488e+00  2.1997e-01 -6.5675e-01
  1.3244e-01 -1.8588e+00 -1.1909e+00  1.4809e+00  3.2557e+00 -7.4432e-01
 -1.0937e+00 -1.4814e+00 -9.6496e-01 -1.1938e+00 -1.7297e-01 -1.1948e+00
  4.7582e-01  1.3427e+00 -9.1829e-01 -1.5111e+00  5.8706e-01  1.3217e+00
  5.9117e-01  4.6654e-01  1.3922e+00  7.8353e-01  2.7861e+00 -3.2419e+00
 -1.9398e+00  2.6994e+00 -7.5187e-01  1.3133e+00  1

In [4]:
print(banana_vector.shape) # 300 mit lg, 96 mit sm 

(300,)


In [5]:
tokens = nlp("dog cat banana afskfsd")
for token in tokens:
    print(f"{token.text:{10}} {token.has_vector:{3}}", f"{token.vector_norm:{20}} {token.is_oov:{5}}") 

dog          1    39.64518356323242     0
cat          1   43.187129974365234     0
banana       1    21.89556884765625     0
afskfsd      0                  0.0     1


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Install first: python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Wortvektoren verrechnen
new_vector = king - man + woman

# Vokabular durchlaufen und ähnliche Wörter mit den ähnlichsten Vektoren finden. Ähnlichkeit wird durch Schwellenwert definiert.
similar_words = []

for token in nlp.vocab:
    # Stoppwörter überspringen
    if not token.has_vector:
        continue
    
    # Ähnlichkeit zwischen dem gegebenen Vektor und dem jeweiligen
    similarity = cosine_similarity(new_vector.reshape(1,-1), token.vector.reshape(1,-1))
    
    # > Schwellwert, dann hinzufügen
    if similarity > 0.:
        similar_words.append(token.text)

print("Ähnliche Wörter:", similar_words)

Ähnliche Wörter: ['c.', 'it', 'is', 'Might', 'where', 'must', 'had', 'Calif', 'does', 'nothin’', 'b', 'might', 'ought', 'these', 'somethin', "there's", 'When', 'i.e', 'i.e.', '(;', 'would', ':-/', ';', 'a.', 'Ought', 'king', 'need', 'cause', 'you', 'that’s', 'was', 'Where', '.', 'who', 'how', 'could', 'those', 'space', 'He', 'were', 'a', 'somethin’', '’cause', 'this', 'they', 'he', 'woman', 'there', 'b.', 'It', 'Those', 'Id.', '(=', 'or', 'when', 'she', 'may', 'all', 'what', 'and/or', "''", 'Sen', 'i.', 'nothin', 'This', 'There', "There's", 'These', 'e.g.', 'not', 'f.', 'e.', 'has', 'Ala', 'have', 'She', 'd.', 'can', 'that', 'Should', 'Nothin', 'They', 'r.', 'Cause', 's.', 'should', 'are', 'and', '—']


In [12]:
nlp("apple").similarity(nlp("mango"))

0.6305076508837961

In [14]:
cosine_similarity(nlp("apple").vector.reshape(1,-1), nlp("cherry").vector.reshape(1,-1))

array([[0.6875837]], dtype=float32)