In [80]:
from sklearn.feature_extraction.text import CountVectorizer

In [81]:
# Toy corpus used by the cells below: three short documents that mix capital letters,
# punctuation, digits and common stop words.
# CountVectorizer can lowercase text and drop punctuation and stop words,
# but on its own it does not lemmatize or stem.
txt = [
    "He is ::having a great 12 Time @, at the park time?",
    "She, unlike most women, 12 is a big player on the park's grass.",
    "she can't be going",
]

In [82]:
# Build a bag-of-words vectorizer that drops English stop words. The remaining
# arguments are written out explicitly so that later cells can vary one at a time.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=None)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is simply a second name for
# count_vec; it is kept because the next cell reads count_train.vocabulary_.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when the installed version has it and fall back to the old method otherwise;
# list() makes the printed result look the same in both cases.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
feature_names = list(names_fn())

# Print the whole vocabulary, then every third term of it.
print("Every feature:\n{}".format(feature_names))
print("\nEvery 3rd feature:\n{}".format(feature_names[::3]))

Every feature:
['12', 'big', 'going', 'grass', 'great', 'having', 'park', 'player', 'time', 'unlike', 'women']

Every 3rd feature:
['12', 'grass', 'park', 'unlike']


In [83]:
# Report how many terms were learned and the term -> column-index mapping itself.
vocabulary = count_train.vocabulary_
report_lines = (
    "Vocabulary size: {}".format(len(vocabulary)),
    "Vocabulary content:\n {}".format(vocabulary),
)
print("\n".join(report_lines))

Vocabulary size: 11
Vocabulary content:
 {'having': 5, 'great': 4, '12': 0, 'time': 8, 'park': 6, 'unlike': 9, 'women': 10, 'big': 1, 'player': 7, 'grass': 3, 'going': 2}


In [84]:
# Summarize the fitted vectorizer and the matrix it produced, one item per line.
print(
    count_vec.vocabulary_,    # term -> column index
    bag_of_words.shape,       # (number of documents, number of vocabulary terms)
    type(bag_of_words),       # class of the sparse matrix
    bag_of_words.toarray(),   # dense counts, one row per document
    sep="\n",
)

{'having': 5, 'great': 4, '12': 0, 'time': 8, 'park': 6, 'unlike': 9, 'women': 10, 'big': 1, 'player': 7, 'grass': 3, 'going': 2}
(3, 11)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 0 0 0 1 1 1 0 2 0 0]
 [1 1 0 1 0 0 1 1 0 1 1]
 [0 0 1 0 0 0 0 0 0 0 0]]


### N-grams (sequences of consecutive words)
N=2 (`ngram_range=(1, 2)`: single words and pairs of consecutive words)

In [85]:
# ngram_range=(1, 2) keeps single words (unigrams) and pairs of words that are
# adjacent once stop words have been removed (bigrams).
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 2), max_df=1.0, min_df=1, max_features=None)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is a second name for count_vec.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back otherwise; list() keeps the printed form unchanged.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
print(list(names_fn()))

['12', '12 big', '12 time', 'big', 'big player', 'going', 'grass', 'great', 'great 12', 'having', 'having great', 'park', 'park grass', 'park time', 'player', 'player park', 'time', 'time park', 'unlike', 'unlike women', 'women', 'women 12']


N=3 (`ngram_range=(1, 3)`: single words, pairs and triples of consecutive words)

In [86]:
# ngram_range=(1, 3) keeps unigrams, bigrams and trigrams; the multi-word terms are
# built from words that are adjacent once stop words have been removed.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 3), max_df=1.0, min_df=1, max_features=None)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is a second name for count_vec.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back otherwise; list() keeps the printed form unchanged.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
print(list(names_fn()))

['12', '12 big', '12 big player', '12 time', '12 time park', 'big', 'big player', 'big player park', 'going', 'grass', 'great', 'great 12', 'great 12 time', 'having', 'having great', 'having great 12', 'park', 'park grass', 'park time', 'player', 'player park', 'player park grass', 'time', 'time park', 'time park time', 'unlike', 'unlike women', 'unlike women 12', 'women', 'women 12', 'women 12 big']


### Min_df
#### Min_df ignores terms that have a document frequency (the share of documents in which the term appears) strictly lower than the given threshold. For example, Min_df=0.66 requires that a term appear in at least 66% of the documents for it to be considered part of the vocabulary.

Sometimes min_df is used to limit the vocabulary size, so it learns only those terms that appear in at least 10%, 20%, etc. of the documents.

In [87]:
# min_df=0.6 is a proportion: a term is kept only if it occurs in at least 60% of
# the documents, which for this 3-document corpus means at least 2 of them.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 1), max_df=1.0, min_df=0.6, max_features=None)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is a second name for count_vec.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back otherwise; list() keeps the printed form unchanged.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
print(list(names_fn()))

# The explanation names both surviving terms ('12' as well as 'park') and states the
# document frequencies as percentages (2/3 is about 67%, not 0.66%).
print("\nOnly '12' and 'park' make it into the vocabulary of the document term matrix (dtm) "
      "because each appears in 2 out of 3 documents, meaning about 67% of the time."
      "\nThe rest of the words, such as 'big', appear in only 1 out of 3 documents, "
      "meaning about 33%, which is why they don't appear.")



['12', 'park']

Only 'park' becomes the vocabulary of the document term matrix (dtm) because it appears in 2 out of 3 documents, meaning 0.66% of the time.      
The rest of the words such as 'big' appear only in 1 out of 3 documents, meaning 0.33%. which is why they don't appear


### Max_df
#### When building the vocabulary, Max_df ignores terms that have a document frequency strictly higher than the given threshold. This can be used to exclude terms that are too frequent and are therefore unlikely to help predict the label.
For example, when analyzing reviews of the movie The Lion King, the term 'Lion' might appear in 90% of the reviews (documents), in which case we could consider setting Max_df=0.89.

In [88]:
# max_df=0.50 is a proportion: a term is dropped if it occurs in more than 50% of
# the documents, which for this 3-document corpus means 2 or more of them.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 1), max_df=0.50, min_df=1, max_features=None)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is a second name for count_vec.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back otherwise; list() keeps the printed form unchanged.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
print(list(names_fn()))

# The explanation names both dropped terms ('12' as well as 'park') and states the
# document frequency as a percentage (2/3 is about 67%, not 0.66%).
print("\nOnly '12' and 'park' are ignored because each appears in 2 out of 3 documents, "
      "meaning about 67% of the time.")

['big', 'going', 'grass', 'great', 'having', 'player', 'time', 'unlike', 'women']

Only 'park' is ignored because it appears in 2 out of 3 documents, meaning 0.66% of the time.


### Max_features
#### Max_features limits the number of features (vocabulary terms) that the vectorizer will learn, keeping only the most frequent terms across the corpus.

In [89]:
# max_features=4 caps the learned vocabulary at four terms.
count_vec = CountVectorizer(stop_words="english", analyzer='word',
                            ngram_range=(1, 1), max_df=1.0, min_df=1, max_features=4)

# Learn the vocabulary and build the document-term matrix in a single pass.
bag_of_words = count_vec.fit_transform(txt)
# fit() returns the vectorizer itself, so count_train is a second name for count_vec.
count_train = count_vec

# get_feature_names() was removed in scikit-learn 1.2. Use get_feature_names_out()
# when it exists and fall back otherwise; list() keeps the printed form unchanged.
names_fn = getattr(count_vec, "get_feature_names_out", None) or count_vec.get_feature_names
print(list(names_fn()))

['12', 'big', 'park', 'time']


In [90]:
# Instantiate a vectorizer with no arguments; as the last expression of the cell its
# repr is displayed, which in the recorded output lists every parameter's default value.
# NOTE(review): newer scikit-learn releases may abbreviate this repr to non-default
# parameters only -- CountVectorizer().get_params() would list them all; confirm
# against the installed version.
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)