# **Hands-on Scikit-Learn**

# **Word Counts with CountVectorizer**

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# Corpus for the CountVectorizer demo: a list holding one document (a single sentence).
text = ["The quick brown fox jumped over the lazy dog."]

## **Create the transform**

In [3]:
# Instantiate the vectorizer with its default arguments; it is fitted on `text` in the next cell.
vectorizer = CountVectorizer()

## **Tokenize, build the vocabulary, and print it**

In [4]:
# fit() learns the vocabulary from `text` and returns the fitted vectorizer itself,
# so the learned token -> column-index mapping can be read straight off the result.
print(vectorizer.fit(text).vocabulary_)

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}


## **Encode the document and print details**

In [5]:
# Encode the corpus as a document-term count matrix (one row per document,
# one column per vocabulary entry). The result is a SciPy sparse matrix.
vector = vectorizer.transform(text)

# Report the shape, the concrete matrix type, and a dense copy of the counts, one per line.
print(vector.shape, type(vector), vector.toarray(), sep="\n")

(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


## **Encode another document and print the details**

In [6]:
# "puppy" is not in the fitted vocabulary, so it is ignored; only "the" is counted.
text2 = ["the puppy"]
vector = vectorizer.transform(text2)
dense_counts = vector.toarray()
print(dense_counts)

[[0 0 0 0 0 0 0 1]]


# **Word Frequencies with TfidfVectorizer**

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Three-document corpus: terms that occur in more documents receive a lower IDF weight.
text = [
    "The quick brown fox jumped over the lazy dog.",
    "The dog.",
    "The fox",
]

# fit() returns the fitted estimator, so construction and fitting are chained.
vectorizer = TfidfVectorizer().fit(text)

# Learned token -> column-index mapping, followed by one IDF weight per column.
print(vectorizer.vocabulary_, vectorizer.idf_, sep="\n")

{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
[1.69314718 1.28768207 1.28768207 1.69314718 1.69314718 1.69314718
 1.69314718 1.        ]


## **Transform and print details**

In [9]:
# Encode only the first document. transform() takes an iterable of documents,
# so a one-element slice of the corpus is passed rather than the bare string.
vector = vectorizer.transform(text[:1])

# Shape first, then the dense TF-IDF row (L2-normalised under the default settings).
print(vector.shape, vector.toarray(), sep="\n")

(1, 8)
[[0.36388646 0.27674503 0.27674503 0.36388646 0.36388646 0.36388646
  0.36388646 0.42983441]]


# **Hashing with HashingVectorizer**

## **Create hashing transform and print the details**

In [10]:
from sklearn.feature_extraction.text import HashingVectorizer
# Single-document corpus for the hashing demo.
text = ["The quick brown fox jumped over the lazy dog."]

# No fit() step is needed: tokens are hashed directly into one of the
# n_features columns, so there is no vocabulary to learn.
vectorizer = HashingVectorizer(n_features=20)
vector = vectorizer.transform(text)

# Shape first, then the dense row of hashed, normalised values.
print(vector.shape, vector.toarray(), sep="\n")

(1, 20)
[[ 0.          0.          0.          0.          0.          0.33333333
   0.         -0.33333333  0.33333333  0.          0.          0.33333333
   0.          0.          0.         -0.33333333  0.          0.
  -0.66666667  0.        ]]


# **Hands-on Keras**

## **Split words using text_to_word_sequence**

In [13]:
from keras.preprocessing.text import text_to_word_sequence
# Sample sentence; the leading/trailing spaces and the final period do not appear in the tokens.
text = ' The quick brown fox jumped over the lazy dog. '

# Split into a list of lowercase word tokens with punctuation removed (see printed output).
words = text_to_word_sequence(text)
print(words)

['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']


## **Find vocabulary size**

In [14]:
from keras.preprocessing.text import text_to_word_sequence
text = ' The quick brown fox jumped over the lazy dog. '

# Collapse the token list into a set: "the" occurs twice in the sentence
# but contributes a single entry to the vocabulary.
words = {*text_to_word_sequence(text)}
vocab_size = len(words)
print(vocab_size)

8


## **Hash-encoding words to integers with one_hot**

In [19]:
from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
text = ' The quick brown fox jumped over the lazy dog. '

# Vocabulary size = number of distinct tokens in the sentence.
words = set(text_to_word_sequence(text))
vocab_size = len(words)

# Size the hash space about 30% larger than the vocabulary to lower the chance
# that two different words collide on the same integer.
hash_space = round(vocab_size * 1.3)

# NOTE: one_hot() hashes words with Python's built-in hash(), which is salted per
# process for strings, so the integers printed here can change after a kernel restart.
result = one_hot(text, hash_space)
print(result)

[3, 7, 7, 3, 4, 5, 3, 4, 3]


## **Using hashing_trick to hash encode**

In [22]:
from keras.preprocessing.text import hashing_trick
from keras.preprocessing.text import text_to_word_sequence

text = ' The quick brown fox jumped over the lazy dog.'

# Vocabulary size = number of distinct tokens in the sentence.
words = set(text_to_word_sequence(text))
vocab_size = len(words)

# Hash space about 30% larger than the vocabulary to reduce collisions.
n_buckets = round(vocab_size*1.3)

# md5 is a stable hash function, so the same word maps to the same integer on every run.
result = hashing_trick(text, n_buckets, hash_function = 'md5')
print(result)

[6, 4, 1, 2, 7, 5, 6, 2, 6]


## **Using Tokenizer API and printing details**

In [24]:
from keras.preprocessing.text import Tokenizer

# Five short documents; "work" is the only word shared by two of them.
docs = [
    'Well done!',
    'Good work',
    'Great effort',
    'nice work',
    'Excellent!',
]

t = Tokenizer()
t.fit_on_texts(docs)

# Print the statistics gathered while fitting, one per line:
#   word_counts    - token -> total number of occurrences
#   document_count - number of documents the tokenizer was fitted on
#   word_index     - token -> integer id (ids start at 1)
#   word_docs      - token -> number of documents containing the token
for attribute in ('word_counts', 'document_count', 'word_index', 'word_docs'):
    print(getattr(t, attribute))

# One row per document holding per-word counts. The matrix has one more column
# than there are words because ids start at 1, leaving column 0 unused.
encode_docs = t.texts_to_matrix(docs, mode = 'count')
print(encode_docs)

OrderedDict([('well', 1), ('done', 1), ('good', 1), ('work', 2), ('great', 1), ('effort', 1), ('nice', 1), ('excellent', 1)])
5
{'work': 1, 'well': 2, 'done': 3, 'good': 4, 'great': 5, 'effort': 6, 'nice': 7, 'excellent': 8}
defaultdict(<class 'int'>, {'done': 1, 'well': 1, 'good': 1, 'work': 2, 'great': 1, 'effort': 1, 'nice': 1, 'excellent': 1})
[[0. 0. 1. 1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 1. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1.]]
