In [1]:
from sklearn.feature_extraction.text import HashingVectorizer
vectorizer = HashingVectorizer()
print(vectorizer.transform(['a very small document']).shape)
(1, 1048576)

(1, 1048576)


(1, 1048576)

## HashingVectorizer has a parameter n_features which is 1048576 by default.

In [2]:
small_vectorizer = HashingVectorizer(n_features=5)
print(small_vectorizer.transform(['a very small document']).shape)    

(1, 5)


#### TF IDF Vectorization

In [11]:

from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["Pavani has successfully completed the assessment on the nycopendata."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [12]:
print(vectorizer.vocabulary_)

{'pavani': 5, 'has': 2, 'successfully': 6, 'completed': 1, 'the': 7, 'assessment': 0, 'on': 4, 'nycopendata': 3}


In [13]:
# encode document
vector = vectorizer.transform(text)


##### Count of terms

In [15]:
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray()) ####Count of terms is reperensented

(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


### TF IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# list of text documents
text = ["Pavani has successfully completed the assessment on the nycopendata."
        "The nycopendata.",
        "Pavani"]

In [27]:
# create the transform
vectorizer = TfidfVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)

{'pavani': 5, 'has': 2, 'successfully': 6, 'completed': 1, 'the': 7, 'assessment': 0, 'on': 4, 'nycopendata': 3}


In [28]:
print(vectorizer.idf_)  ####log(N/no of docs term t appear)

[1.40546511 1.40546511 1.40546511 1.40546511 1.40546511 1.
 1.40546511 1.40546511]


In [29]:
[text[0]]

['Pavani has successfully completed the assessment on the nycopendata.The nycopendata.']

In [30]:
# encode document
vector = vectorizer.transform([text[0]])

In [31]:
# summarize encoded vector
print(vector.shape)
print(vector.toarray())

(1, 8)
[[0.23245605 0.23245605 0.23245605 0.4649121  0.23245605 0.1653944
  0.23245605 0.69736816]]


1. The CountVectorizer provides a simple way to both tokenize a collection of text documents 
2. Build a vocabulary of known words
3. Encode new documents using that vocabulary.

In [42]:
import pandas as pd
df = pd.DataFrame({'docId': [1,2,3], 
               'sent': ['This is the first sentence','This is the second sentence', 'This is the third sentence']})

In [43]:
df.head()

Unnamed: 0,docId,sent
0,1,This is the first sentence
1,2,This is the second sentence
2,3,This is the third sentence


In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
x = tfidf.fit_transform(df['sent'])

In [48]:
print(tfidf.vocabulary_)

{'this': 6, 'is': 1, 'the': 4, 'first': 0, 'sentence': 3, 'second': 2, 'third': 5}


In [50]:
print(tfidf.idf_)

[1.69314718 1.         1.69314718 1.         1.         1.69314718
 1.        ]


In [45]:
x

<3x7 sparse matrix of type '<class 'numpy.float64'>'
	with 15 stored elements in Compressed Sparse Row format>

In [47]:
x.toarray()

array([[0.64612892, 0.38161415, 0.        , 0.38161415, 0.38161415,
        0.        , 0.38161415],
       [0.        , 0.38161415, 0.64612892, 0.38161415, 0.38161415,
        0.        , 0.38161415],
       [0.        , 0.38161415, 0.        , 0.38161415, 0.38161415,
        0.64612892, 0.38161415]])