## Vectorization of features

In [23]:
# Toy corpus: three sentences built from the same tongue twister, so they
# share most of their vocabulary.
text = ["Peter Piper picked a peck of pickled peppers.",
        "A peck of pickled peppers Peter Piper picked.",
        "If Peter Piper picked a peck of pickled peppers, where's the peck of pickled peppers Peter Piper picked?"]

### Count Vectorizer

In [24]:
from sklearn.feature_extraction.text import CountVectorizer

# Defaults: the text is lowercased and split into word tokens; punctuation is
# ignored, and so are single-character tokens (note that 'a' never appears in
# the vocabulary printed below).
c_vec = CountVectorizer()
# fit() learns the vocabulary from `text`; later cells call `.transform()` on `x`.
x = c_vec.fit(text)

In [25]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# `list(...)` keeps the printed form a plain Python list in both cases.
print(list(c_vec.get_feature_names_out()
           if hasattr(c_vec, "get_feature_names_out")
           else c_vec.get_feature_names()))

['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']


In [26]:
# One row per sentence, one column per vocabulary entry; `.toarray()` gives a
# dense array so the counts can be displayed.
x.transform(text).toarray()

array([[0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 0, 0],
       [1, 2, 2, 2, 2, 2, 2, 2, 1, 1]])

In [27]:
# Unseen sentence: only words already in the fitted vocabulary are counted
# ('if', 'peppers' and 'peter' here); every other word is ignored.
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[1, 0, 0, 1, 1, 0, 0, 0, 0, 0]])

#### What if I want lemmas? And no lowercasing?

In [28]:
import en_core_web_md
from spacy import displacy

# Load the spaCy pipeline shipped as the `en_core_web_md` package; `nlp` is
# called on raw strings in later cells.
nlp = en_core_web_md.load()

In [29]:
# Parse every sentence with spaCy and rebuild it as a single string of token
# lemmas separated by spaces.
data = [
    " ".join(token.lemma_ for token in nlp(sentence))
    for sentence in text
]

In [34]:
# Keep the original casing, drop English stop words, and tokenize by splitting
# on single spaces (the strings in `data` are space-joined lemmas).
c_vec2 = CountVectorizer(
    tokenizer=lambda lemma_string: lemma_string.split(' '),
    stop_words='english',
    lowercase=False,
)
# `x` is rebound: from here on it refers to the vectorizer fitted on `data`.
x = c_vec2.fit(data)

In [35]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# `list(...)` keeps the printed form a plain Python list in both cases.
print(list(c_vec2.get_feature_names_out()
           if hasattr(c_vec2, "get_feature_names_out")
           else c_vec2.get_feature_names()))

[',', '.', '?', 'Peter', 'Piper', 'peck', 'pepper', 'pick', 'pickle', 'pickled']


In [36]:
# `x` is now the vectorizer fitted on `data`: counts over the case-preserving
# lemma vocabulary, which also contains the punctuation tokens.
x.transform(data).toarray()

array([[0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
       [0, 1, 0, 1, 1, 1, 1, 1, 1, 0],
       [1, 0, 1, 2, 2, 2, 2, 2, 1, 1]])

In [37]:
# NOTE(review): this sentence is passed raw, while the vectorizer was fitted on
# lemmatized, space-joined text. With the whitespace tokenizer "peppers." stays
# a single token and does not match 'pepper' (see the zero in that column of
# the output); consider lemmatizing new input the same way as `data` -- confirm
# whether the mismatch is intentional.
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[0, 0, 0, 1, 0, 0, 0, 1, 0, 0]])

### TF-IDF Vectorizer

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Basically a CountVectorizer followed by a TfidfTransformer.
tfidf_vec = TfidfVectorizer()
# `x` is rebound again: later cells call `.transform()` on the TF-IDF vectorizer.
x = tfidf_vec.fit(text)

In [39]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# `list(...)` keeps the printed form a plain Python list in both cases.
print(list(tfidf_vec.get_feature_names_out()
           if hasattr(tfidf_vec, "get_feature_names_out")
           else tfidf_vec.get_feature_names()))

['if', 'of', 'peck', 'peppers', 'peter', 'picked', 'pickled', 'piper', 'the', 'where']


In [40]:
# Same columns as the plain count matrix, but the counts are replaced by
# TF-IDF weights; each row of the output has unit Euclidean length.
x.transform(text).toarray()

array([[0.        , 0.37796447, 0.37796447, 0.37796447, 0.37796447,
        0.37796447, 0.37796447, 0.37796447, 0.        , 0.        ],
       [0.        , 0.37796447, 0.37796447, 0.37796447, 0.37796447,
        0.37796447, 0.37796447, 0.37796447, 0.        , 0.        ],
       [0.27986767, 0.33058871, 0.33058871, 0.33058871, 0.33058871,
        0.33058871, 0.33058871, 0.33058871, 0.27986767, 0.27986767]])

In [41]:
# For the unseen sentence, 'if' (present in only one training sentence) gets a
# higher weight than 'peppers' and 'peter' (present in all three).
x.transform(["I will call Peter if I need to pick peppers."]).toarray()

array([[0.76749457, 0.        , 0.        , 0.45329466, 0.45329466,
        0.        , 0.        , 0.        , 0.        , 0.        ]])

### DictVectorizer

In [42]:
from sklearn.feature_extraction import DictVectorizer

In [43]:
# Parse the example sentence; `doc` is reused for the dependency plot and for
# feature extraction in the following cells.
doc = nlp("I will call Peter if I need to pick peppers.")

In [44]:
# Draw the dependency parse of `doc` inline in the notebook.
displacy.render(
    doc,
    style='dep',
    jupyter=True,
    options={
        "collapse_punct": False,
        "font": "Arial",
        "font_size": "40",
        "distance": 100,
    },
)

In [45]:
# Collect features for each word in the sentence

def extract_features(sentence):
    """Build one feature dictionary per token of a parsed sentence.

    Parameters
    ----------
    sentence : sequence of tokens
        Must support ``len()``, iteration and integer indexing. Every token
        must expose ``text``, ``lemma_``, ``dep_`` and ``head.lemma_``.

    Returns
    -------
    list of dict
        One dict per token, in sentence order, with the keys:

        * ``"lemma"`` -- the token's lemma;
        * ``"is_capitalized"`` -- ``True`` when the token text is title-cased;
        * ``"word-1"`` -- text of the previous token, ``"NONE"`` for the first;
        * ``"parent"`` -- dependency label and head lemma joined by ``"_"``;
        * ``"right-bigram"`` -- texts of the next two tokens joined by ``"_"``,
          ``"NONE"`` when fewer than two tokens follow.
    """
    n_tokens = len(sentence)
    rows = []
    for position, token in enumerate(sentence):
        # Left context: only defined from the second token onwards.
        if position > 0:
            previous_word = sentence[position - 1].text
        else:
            previous_word = "NONE"

        # Right context: requires two tokens after the current one.
        if position < n_tokens - 2:
            following_pair = (sentence[position + 1].text
                              + "_"
                              + sentence[position + 2].text)
        else:
            following_pair = "NONE"

        rows.append({
            "lemma": token.lemma_,
            "is_capitalized": token.text.istitle(),
            "word-1": previous_word,
            "parent": token.dep_ + "_" + token.head.lemma_,
            "right-bigram": following_pair,
        })
    return rows

# NOTE: this rebinds `data` -- earlier it held the lemmatized sentences, from
# here on it holds the per-token feature dicts extracted from `doc`.
data = extract_features(doc)

In [46]:
# DictVectorizer turns each feature dict into one row. String-valued features
# become one column per "name=value" pair, while the boolean `is_capitalized`
# stays a single column (see the feature names printed below).
dict_vec = DictVectorizer()
# `x` is rebound once more, this time to the fitted DictVectorizer.
x = dict_vec.fit(data)

In [47]:
# The full feature set.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
# The names are fetched once and reused for both the count and the listing.
dict_feature_names = list(dict_vec.get_feature_names_out()
                          if hasattr(dict_vec, "get_feature_names_out")
                          else dict_vec.get_feature_names())
print("{} features were generated\n".format(len(dict_feature_names)))
print(dict_feature_names)

42 features were generated

['is_capitalized', 'lemma=-PRON-', 'lemma=.', 'lemma=Peter', 'lemma=call', 'lemma=if', 'lemma=need', 'lemma=pepper', 'lemma=pick', 'lemma=to', 'lemma=will', 'parent=ROOT_call', 'parent=advcl_call', 'parent=aux_call', 'parent=aux_pick', 'parent=dobj_call', 'parent=dobj_pick', 'parent=mark_need', 'parent=nsubj_call', 'parent=nsubj_need', 'parent=punct_call', 'parent=xcomp_need', 'right-bigram=I_need', 'right-bigram=NONE', 'right-bigram=Peter_if', 'right-bigram=call_Peter', 'right-bigram=if_I', 'right-bigram=need_to', 'right-bigram=peppers_.', 'right-bigram=pick_peppers', 'right-bigram=to_pick', 'right-bigram=will_call', 'word-1=I', 'word-1=NONE', 'word-1=Peter', 'word-1=call', 'word-1=if', 'word-1=need', 'word-1=peppers', 'word-1=pick', 'word-1=to', 'word-1=will']


In [48]:
# Feature values never seen during fit (e.g. lemma=Mary, word-1=Mary) have no
# column and are dropped, so each output row only marks the features this
# sentence shares with the one the vectorizer was fitted on.
new_text = nlp("Mary called Peter.")
dict_vec.transform(extract_features(new_text)).toarray()

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 1., 0., 0., 0., 0., 0., 0., 0.]])