In [2]:
import sklearn

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Small corpus of six quotations that the vectorizer is fitted on.
train_text = [
    "The Pessimist Sees Difficulty In Every Opportunity.",
    "The Optimist Sees Opportunity In Every Difficulty.",
    "Don’t Let Yesterday Take Up Too Much Of Today. ",
    "You Learn More From Failure Than From Success.",
    "We May Encounter Many Defeats But We Must Not Be Defeated.",
    "Life Is Either A Daring Adventure Or Nothing.",
]

# Bag-of-words vectorizer built with all-default settings.
count_vectorizer = CountVectorizer()

In [4]:
# Learn the vocabulary from the training sentences. fit() hands back the
# vectorizer itself, which is why its repr is echoed as the cell output.
count_vectorizer.fit(train_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [5]:
# Feature names ordered by column index. Derived from vocabulary_ instead of
# get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in
# 1.2; this form yields the same list of strings on old and new releases.
sorted(count_vectorizer.vocabulary_, key=count_vectorizer.vocabulary_.get)

['adventure',
 'be',
 'but',
 'daring',
 'defeated',
 'defeats',
 'difficulty',
 'don',
 'either',
 'encounter',
 'every',
 'failure',
 'from',
 'in',
 'is',
 'learn',
 'let',
 'life',
 'many',
 'may',
 'more',
 'much',
 'must',
 'not',
 'nothing',
 'of',
 'opportunity',
 'optimist',
 'or',
 'pessimist',
 'sees',
 'success',
 'take',
 'than',
 'the',
 'today',
 'too',
 'up',
 'we',
 'yesterday',
 'you']

In [6]:
# Mapping from each learned token to its column index in the count matrix.
count_vectorizer.vocabulary_

{'adventure': 0,
 'be': 1,
 'but': 2,
 'daring': 3,
 'defeated': 4,
 'defeats': 5,
 'difficulty': 6,
 'don': 7,
 'either': 8,
 'encounter': 9,
 'every': 10,
 'failure': 11,
 'from': 12,
 'in': 13,
 'is': 14,
 'learn': 15,
 'let': 16,
 'life': 17,
 'many': 18,
 'may': 19,
 'more': 20,
 'much': 21,
 'must': 22,
 'not': 23,
 'nothing': 24,
 'of': 25,
 'opportunity': 26,
 'optimist': 27,
 'or': 28,
 'pessimist': 29,
 'sees': 30,
 'success': 31,
 'take': 32,
 'than': 33,
 'the': 34,
 'today': 35,
 'too': 36,
 'up': 37,
 'we': 38,
 'yesterday': 39,
 'you': 40}

In [7]:
# Column index assigned to the token 'life' (plain dict lookup; None if absent).
count_vectorizer.vocabulary_.get('life')

17

In [8]:
# Column index assigned to the token 'optimist' (plain dict lookup; None if absent).
count_vectorizer.vocabulary_.get('optimist')

27

In [9]:
# Show the raw training sentences again for comparison with their encoded rows.
train_text

['The Pessimist Sees Difficulty In Every Opportunity.',
 'The Optimist Sees Opportunity In Every Difficulty.',
 'Don’t Let Yesterday Take Up Too Much Of Today. ',
 'You Learn More From Failure Than From Success.',
 'We May Encounter Many Defeats But We Must Not Be Defeated.',
 'Life Is Either A Daring Adventure Or Nothing.']

In [10]:
# Encode every training sentence as one row of per-token counts.
transformed_vector = count_vectorizer.transform(train_text)

In [11]:
# (number of sentences, vocabulary size). Left as the cell's last expression,
# matching the later shape check, instead of being wrapped in print().
transformed_vector.shape

(6, 41)


In [12]:
# Densify only for display; the matrix is tiny (6 x 41), so this is cheap here.
print(transformed_vector.toarray())

[[0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0
  0 0 0 0 0]
 [0 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 1 0
  0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 0 1
  1 1 0 1 0]
 [0 0 0 0 0 0 0 0 0 0 0 1 2 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0
  0 0 0 0 1]
 [0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 2 0 0]
 [1 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
  0 0 0 0 0]]


In [13]:
# Map each row back to the tokens that have a non-zero count. Word order and
# repeat counts are not recoverable: 'from' occurs twice in the fourth
# sentence but is listed only once.
count_vectorizer.inverse_transform(transformed_vector)

[array(['difficulty', 'every', 'in', 'opportunity', 'pessimist', 'sees',
        'the'], dtype='<U11'),
 array(['difficulty', 'every', 'in', 'opportunity', 'optimist', 'sees',
        'the'], dtype='<U11'),
 array(['don', 'let', 'much', 'of', 'take', 'today', 'too', 'up',
        'yesterday'], dtype='<U11'),
 array(['failure', 'from', 'learn', 'more', 'success', 'than', 'you'],
       dtype='<U11'),
 array(['be', 'but', 'defeated', 'defeats', 'encounter', 'many', 'may',
        'must', 'not', 'we'], dtype='<U11'),
 array(['adventure', 'daring', 'either', 'is', 'life', 'nothing', 'or'],
       dtype='<U11')]

In [14]:
# A sentence the vectorizer was not fitted on.
test_text = ["A room without books is like a body without a soul."]

# Tokens outside the fitted vocabulary are ignored rather than raising, so
# only 'is' (column 14) is counted here.
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [15]:
# Refit the SAME vectorizer on train + test text so the test words get columns.
# This replaces the learned vocabulary in place (41 -> 47 tokens), so matrices
# produced before this cell no longer line up with the new column indices.
# NOTE(review): fitting on test text looks like a demonstration of vocabulary
# growth only — confirm it is not intended as a modelling pattern.
count_vectorizer.fit(train_text + test_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [16]:
# Vocabulary after the refit: the test-sentence tokens are now present and
# the indices of existing tokens have shifted (e.g. 'but' moved from 2 to 4).
count_vectorizer.vocabulary_

{'adventure': 0,
 'be': 1,
 'body': 2,
 'books': 3,
 'but': 4,
 'daring': 5,
 'defeated': 6,
 'defeats': 7,
 'difficulty': 8,
 'don': 9,
 'either': 10,
 'encounter': 11,
 'every': 12,
 'failure': 13,
 'from': 14,
 'in': 15,
 'is': 16,
 'learn': 17,
 'let': 18,
 'life': 19,
 'like': 20,
 'many': 21,
 'may': 22,
 'more': 23,
 'much': 24,
 'must': 25,
 'not': 26,
 'nothing': 27,
 'of': 28,
 'opportunity': 29,
 'optimist': 30,
 'or': 31,
 'pessimist': 32,
 'room': 33,
 'sees': 34,
 'soul': 35,
 'success': 36,
 'take': 37,
 'than': 38,
 'the': 39,
 'today': 40,
 'too': 41,
 'up': 42,
 'we': 43,
 'without': 44,
 'yesterday': 45,
 'you': 46}

In [17]:
# Same test sentence after the refit: its tokens are now counted, and
# 'without' (column 44) appears twice.
count_vectorizer.transform(test_text).toarray()

array([[0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 0, 0]])

In [18]:
# Two unseen sentences; tokens missing from the fitted vocabulary are simply
# left out of the encoding.
text = [
    "I'm a pessimist because of intelligence, but an optimist because of will.",
    "Success is not final, failure is not fatal",
]

# Rebinds transformed_vector, replacing the training matrix computed earlier.
transformed_vector = count_vectorizer.transform(text)

# Echo the sparse-matrix summary (shape, dtype, stored-element count).
transformed_vector

<2x47 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [19]:
# Printing a sparse matrix lists one "(row, column)  count" line per stored entry.
print(transformed_vector)

  (0, 4)	1
  (0, 28)	2
  (0, 30)	1
  (0, 32)	1
  (1, 13)	1
  (1, 16)	2
  (1, 26)	2
  (1, 36)	1


In [20]:
# One row per input sentence, one column per vocabulary token.
transformed_vector.shape

(2, 47)

In [21]:
# Dense view of the same two rows, zeros included.
print(transformed_vector.toarray())

[[0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 1 0 1 0 0 0
  0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 2 0 0 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
  1 0 0 0 0 0 0 0 0 0 0]]
