## Vectorize text as a bag-of-n-grams

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Toy corpus: six short quotes, each string is treated as one document.
train_text = [
    "The Pessimist Sees Difficulty In Every Opportunity.",
    "The Optimist Sees Opportunity In Every Difficulty.",
    "Don’t Let Yesterday Take Up Too Much Of Today. ",
    "You Learn More From Failure Than From Success.",
    "We May Encounter Many Defeats But We Must Not Be Defeated.",
    "Life Is Either A Daring Adventure Or Nothing.",
]

#### Set the ngram_range attribute
The argument is a tuple with a minimum and a maximum value for the size of the n-grams. We start off with only 2-word n-grams (bigrams).

In [21]:
# ngram_range=(2, 2): minimum and maximum n-gram size are both 2,
# so only 2-word n-grams end up in the vocabulary.
n_gram_vectorizer = CountVectorizer(ngram_range=(2, 2))

In [22]:
# Fit the vectorizer on the corpus and get the per-document n-gram counts
# (one row per sentence in train_text).
transformed_vector = n_gram_vectorizer.fit_transform(train_text)

In [23]:
# Mapping from each learned n-gram to its column index in the count matrix.
n_gram_vectorizer.vocabulary_

{'adventure or': 0,
 'be defeated': 1,
 'but we': 2,
 'daring adventure': 3,
 'defeats but': 4,
 'difficulty in': 5,
 'don let': 6,
 'either daring': 7,
 'encounter many': 8,
 'every difficulty': 9,
 'every opportunity': 10,
 'failure than': 11,
 'from failure': 12,
 'from success': 13,
 'in every': 14,
 'is either': 15,
 'learn more': 16,
 'let yesterday': 17,
 'life is': 18,
 'many defeats': 19,
 'may encounter': 20,
 'more from': 21,
 'much of': 22,
 'must not': 23,
 'not be': 24,
 'of today': 25,
 'opportunity in': 26,
 'optimist sees': 27,
 'or nothing': 28,
 'pessimist sees': 29,
 'sees difficulty': 30,
 'sees opportunity': 31,
 'take up': 32,
 'than from': 33,
 'the optimist': 34,
 'the pessimist': 35,
 'too much': 36,
 'up too': 37,
 'we may': 38,
 'we must': 39,
 'yesterday take': 40,
 'you learn': 41}

In [24]:
# Convert to a dense array for display: one row per document, one column per n-gram.
transformed_vector.toarray()

array([[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0],
       [1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
      dtype=int64)

In [25]:
# (n_documents, n_features). The matrix returned by fit_transform exposes
# .shape directly, so there is no need to build a dense copy just to read it.
transformed_vector.shape

(6, 42)

In [26]:
# For each document, list the n-grams that have a non-zero count in its row.
n_gram_vectorizer.inverse_transform(transformed_vector)

[array(['every opportunity', 'in every', 'difficulty in',
        'sees difficulty', 'pessimist sees', 'the pessimist'], dtype='<U17'),
 array(['every difficulty', 'opportunity in', 'sees opportunity',
        'optimist sees', 'the optimist', 'in every'], dtype='<U17'),
 array(['of today', 'much of', 'too much', 'up too', 'take up',
        'yesterday take', 'let yesterday', 'don let'], dtype='<U17'),
 array(['from success', 'than from', 'failure than', 'from failure',
        'more from', 'learn more', 'you learn'], dtype='<U17'),
 array(['be defeated', 'not be', 'must not', 'we must', 'but we',
        'defeats but', 'many defeats', 'encounter many', 'may encounter',
        'we may'], dtype='<U17'),
 array(['or nothing', 'adventure or', 'daring adventure', 'either daring',
        'is either', 'life is'], dtype='<U17')]

#### 1- and 2-word n-grams (unigrams and bigrams)

In [27]:
# ngram_range=(1, 2): keep single words as well as 2-word n-grams.
# NOTE: this rebinds n_gram_vectorizer and transformed_vector, so the cells
# below operate on the new vectorizer, not the bigram-only one.
n_gram_vectorizer = CountVectorizer(ngram_range=(1, 2))

# Refit on the same corpus with the wider n-gram range.
transformed_vector = n_gram_vectorizer.fit_transform(train_text)

# The vocabulary now contains both single words and word pairs.
n_gram_vectorizer.vocabulary_

{'adventure': 0,
 'adventure or': 1,
 'be': 2,
 'be defeated': 3,
 'but': 4,
 'but we': 5,
 'daring': 6,
 'daring adventure': 7,
 'defeated': 8,
 'defeats': 9,
 'defeats but': 10,
 'difficulty': 11,
 'difficulty in': 12,
 'don': 13,
 'don let': 14,
 'either': 15,
 'either daring': 16,
 'encounter': 17,
 'encounter many': 18,
 'every': 19,
 'every difficulty': 20,
 'every opportunity': 21,
 'failure': 22,
 'failure than': 23,
 'from': 24,
 'from failure': 25,
 'from success': 26,
 'in': 27,
 'in every': 28,
 'is': 29,
 'is either': 30,
 'learn': 31,
 'learn more': 32,
 'let': 33,
 'let yesterday': 34,
 'life': 35,
 'life is': 36,
 'many': 37,
 'many defeats': 38,
 'may': 39,
 'may encounter': 40,
 'more': 41,
 'more from': 42,
 'much': 43,
 'much of': 44,
 'must': 45,
 'must not': 46,
 'not': 47,
 'not be': 48,
 'nothing': 49,
 'of': 50,
 'of today': 51,
 'opportunity': 52,
 'opportunity in': 53,
 'optimist': 54,
 'optimist sees': 55,
 'or': 56,
 'or nothing': 57,
 'pessimist': 58,
 'pe

In [30]:
# Dense view of the counts; entries are occurrence counts, not just 0/1
# (e.g. 'from' occurs twice in the fourth sentence, giving a 2).
transformed_vector.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
        0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1,
        1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
        0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        1, 1, 2, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
      

In [29]:
# (n_documents, n_features). Read the shape straight from the matrix instead
# of building a dense copy first.
transformed_vector.shape

(6, 83)

In [31]:
# For each document, list the single words and 2-word n-grams with a non-zero count.
n_gram_vectorizer.inverse_transform(transformed_vector)

[array(['every opportunity', 'in every', 'difficulty in',
        'sees difficulty', 'pessimist sees', 'the pessimist',
        'opportunity', 'every', 'in', 'difficulty', 'sees', 'pessimist',
        'the'], dtype='<U17'),
 array(['every difficulty', 'opportunity in', 'sees opportunity',
        'optimist sees', 'the optimist', 'optimist', 'in every',
        'opportunity', 'every', 'in', 'difficulty', 'sees', 'the'],
       dtype='<U17'),
 array(['of today', 'much of', 'too much', 'up too', 'take up',
        'yesterday take', 'let yesterday', 'don let', 'today', 'of',
        'much', 'too', 'up', 'take', 'yesterday', 'let', 'don'],
       dtype='<U17'),
 array(['from success', 'than from', 'failure than', 'from failure',
        'more from', 'learn more', 'you learn', 'success', 'than',
        'failure', 'from', 'more', 'learn', 'you'], dtype='<U17'),
 array(['be defeated', 'not be', 'must not', 'we must', 'but we',
        'defeats but', 'many defeats', 'encounter many', 'may enco