In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [0]:
# CountVectorizer converts text into word-count vectors.
# TfidfVectorizer converts text into TF-IDF-weighted word-frequency vectors.

In [0]:
# Toy corpus: three short documents that share most of their words and
# differ only in the animal and the place it sat.
text = [
    "I love my cat but the cat sat on my face.",
    "I love my dog but the dog sat on my bed",
    "I love my bird but the bird sat on my blanket",
]

In [134]:
# Build a bag-of-words model with default settings.
# The corpus is passed to fit_transform only: the constructor's first
# parameter is `input` (keyword-only in scikit-learn >= 1.0), not the documents.
cv = CountVectorizer()
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)
# Printing a sparse row lists only its non-zero (row, column) entries, so it
# shows fewer items than the count_vector.shape[1] columns of the matrix.
print(count_vector[1])

{'love': 7, 'my': 8, 'cat': 4, 'but': 3, 'the': 11, 'sat': 10, 'on': 9, 'face': 6, 'dog': 5, 'bed': 0, 'bird': 1, 'blanket': 2}
(3, 12)
set()
  (0, 7)	1
  (0, 8)	2
  (0, 3)	1
  (0, 11)	1
  (0, 10)	1
  (0, 9)	1
  (0, 5)	2
  (0, 0)	1


In [135]:
# Exclude a custom list of stop words that carry little meaning.
# Note: cv.stop_words_ only reports terms dropped by min_df / max_df /
# max_features, so it stays empty here even though stop words were supplied.
cv = CountVectorizer(stop_words=["the", "on", "and"])
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'love': 7, 'my': 8, 'cat': 4, 'but': 3, 'sat': 9, 'face': 6, 'dog': 5, 'bed': 0, 'bird': 1, 'blanket': 2}
set()
(3, 10)


In [136]:
# Ignore terms that appear in fewer than 2 documents (min_df as an absolute
# document count); the dropped terms are reported in cv.stop_words_.
cv = CountVectorizer(min_df=2)
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'love': 1, 'my': 2, 'but': 0, 'the': 5, 'sat': 4, 'on': 3}
{'cat', 'dog', 'face', 'bed', 'bird', 'blanket'}
(3, 6)


In [137]:
# Ignore terms that appear in MORE than 50% of the documents (a float max_df
# is a proportion; terms strictly above the threshold are dropped).
cv = CountVectorizer(max_df=0.50)
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'cat': 3, 'face': 5, 'dog': 4, 'bed': 0, 'bird': 1, 'blanket': 2}
{'sat', 'my', 'but', 'love', 'on', 'the'}
(3, 6)


In [0]:
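# TODO: add a custom tokenization example (CountVectorizer's `tokenizer` parameter).
# TODO: add a custom preprocessing example (CountVectorizer's `preprocessor` parameter).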

In [139]:
# Bigrams and trigrams capture context that single words (unigrams) miss,
# e.g. "good food" carries more meaning than "good" and "food" on their own.
# ngram_range=(2, 2): word-level bigrams only.
cv = CountVectorizer(ngram_range=(2, 2))
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'love my': 7, 'my cat': 11, 'cat but': 3, 'but the': 2, 'the cat': 17, 'cat sat': 4, 'sat on': 15, 'on my': 14, 'my face': 13, 'my dog': 12, 'dog but': 5, 'the dog': 18, 'dog sat': 6, 'my bed': 8, 'my bird': 9, 'bird but': 0, 'the bird': 16, 'bird sat': 1, 'my blanket': 10}
set()
(3, 19)


In [140]:
# ngram_range=(1, 2): word-level unigrams and bigrams together.
cv = CountVectorizer(ngram_range=(1, 2))
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'love': 14, 'my': 16, 'cat': 7, 'but': 5, 'the': 27, 'sat': 25, 'on': 23, 'face': 13, 'love my': 15, 'my cat': 20, 'cat but': 8, 'but the': 6, 'the cat': 29, 'cat sat': 9, 'sat on': 26, 'on my': 24, 'my face': 22, 'dog': 10, 'bed': 0, 'my dog': 21, 'dog but': 11, 'the dog': 30, 'dog sat': 12, 'my bed': 17, 'bird': 1, 'blanket': 4, 'my bird': 18, 'bird but': 2, 'the bird': 28, 'bird sat': 3, 'my blanket': 19}
set()
(3, 31)


In [141]:
# Limit the vocabulary to the 5 terms with the highest term frequency across
# the corpus; every other term ends up in cv.stop_words_.
cv = CountVectorizer(max_features=5)
count_vector = cv.fit_transform(text)
print(cv.vocabulary_)
print(cv.stop_words_)
print(count_vector.shape)

{'love': 1, 'my': 2, 'but': 0, 'sat': 4, 'on': 3}
{'cat', 'dog', 'face', 'bed', 'the', 'bird', 'blanket'}
(3, 5)


In [142]:
# TF-IDF: weight term counts by how rare each term is across the documents
# (with the defaults, each document vector is also L2-normalised).
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(text)

# get_feature_names() was removed in scikit-learn 1.2; its replacement
# get_feature_names_out() returns a NumPy array, so convert it to a list to
# keep the printed form unchanged.
feature_names = vectorizer.get_feature_names_out().tolist()
print(feature_names)

# Use the documents themselves as column labels.
text_index = list(text)
print(text_index)

# Terms as rows, documents as columns. toarray() yields a plain ndarray
# rather than the legacy np.matrix returned by todense().
df = pd.DataFrame(X.T.toarray(), index=feature_names, columns=text_index)
df.style


['bed', 'bird', 'blanket', 'but', 'cat', 'dog', 'face', 'love', 'my', 'on', 'sat', 'the']
['I love my cat but the cat sat on my face.', 'I love my dog but the dog sat on my bed', 'I love my bird but the bird sat on my blanket']


Unnamed: 0,I love my cat but the cat sat on my face.,I love my dog but the dog sat on my bed,I love my bird but the bird sat on my blanket
bed,0.0,0.350512,0.0
bird,0.0,0.0,0.701023
blanket,0.0,0.0,0.350512
but,0.207018,0.207018,0.207018
cat,0.701023,0.0,0.0
dog,0.0,0.701023,0.0
face,0.350512,0.0,0.0
love,0.207018,0.207018,0.207018
my,0.414036,0.414036,0.414036
on,0.207018,0.207018,0.207018
