In [66]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [67]:
# Show the installed scikit-learn version so readers know which release produced the outputs below.
sklearn.__version__

'0.22.2.post1'

In [68]:
# Training corpus: six short book titles, each terminated by ". ".
train_text = [
    "The Curse of the Cheese Pyramid. ",
    "The Hunt for the Golden Book. ",
    "The Temple of the Ruby of Fire. ",
    "Harry Potter and the Prisoner of Azkaban. ",
    "Harry Potter and the Goblet of Fire. ",
    "Harry Potter and the Order of the Phoenix. ",
]

### Represent text using the bag of words model

In [69]:
# Create a bag-of-words vectorizer with its default settings.
count_vectorizer = CountVectorizer()

# Learn the vocabulary from the training titles. fit() returns the estimator itself,
# which is why this cell displays the vectorizer's parameter repr.
count_vectorizer.fit(train_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

### Here are the words in the vocabulary processed from the text

In [70]:
# List the learned vocabulary words in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the cell runs
# on both old and new releases. tolist() turns the returned array into a plain list of str.
if hasattr(count_vectorizer, "get_feature_names_out"):
    feature_names = count_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = count_vectorizer.get_feature_names()

feature_names

['and',
 'azkaban',
 'book',
 'cheese',
 'curse',
 'fire',
 'for',
 'goblet',
 'golden',
 'harry',
 'hunt',
 'of',
 'order',
 'phoenix',
 'potter',
 'prisoner',
 'pyramid',
 'ruby',
 'temple',
 'the']

### The unique identifiers for each word in the vocabulary

Notice that the words are all lowercase; lowercasing is part of the vectorizer's default preprocessing (`lowercase=True`).

In [71]:
# Mapping from each vocabulary word to its integer column index in the count matrix.
count_vectorizer.vocabulary_

{'the': 19,
 'curse': 4,
 'of': 11,
 'cheese': 3,
 'pyramid': 16,
 'hunt': 10,
 'for': 6,
 'golden': 8,
 'book': 2,
 'temple': 18,
 'ruby': 17,
 'fire': 5,
 'harry': 9,
 'potter': 14,
 'and': 0,
 'prisoner': 15,
 'azkaban': 1,
 'goblet': 7,
 'order': 12,
 'phoenix': 13}

In [72]:
# Look up the index assigned to a single (lowercased) word.
count_vectorizer.vocabulary_.get("azkaban")

1

In [73]:
# Display the training titles again for reference.
train_text

['The Curse of the Cheese Pyramid. ',
 'The Hunt for the Golden Book. ',
 'The Temple of the Ruby of Fire. ',
 'Harry Potter and the Prisoner of Azkaban. ',
 'Harry Potter and the Goblet of Fire. ',
 'Harry Potter and the Order of the Phoenix. ']

### Shape of feature vectors

- 6 sentences
- 20 words in the vocabulary
- counts of each word in the vocabulary

In [74]:
# Convert each training title into a row of word counts using the fitted vocabulary.
transformed_vector = count_vectorizer.transform(train_text)

# (number of titles, number of vocabulary words)
transformed_vector.shape

(6, 20)

In [75]:
# Expand to a dense array so the individual counts can be read directly.
transformed_vector.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2],
       [0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 2],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2]])

In [76]:
# A single unseen title used to probe the fitted vectorizer.
test_text = [
    "Harry Potter and the Chamber of Secrets. ",
]

### Not all words in the test text are present in our vocabulary

Words that were not seen during fitting (`chamber`, `secrets`) are simply ignored by `transform`.

In [77]:
# Transform with the vocabulary learned from train_text only: the row still has one
# column per known word, and words outside that vocabulary contribute nothing.
count_vectorizer.transform(test_text).toarray()

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1]])

In [78]:
# Re-fit the same vectorizer on the training and test titles together.
# This replaces the previously learned vocabulary, so the word-to-index mapping changes.
count_vectorizer.fit(train_text + test_text)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [79]:
# Inspect the vocabulary after re-fitting on train_text + test_text.
count_vectorizer.vocabulary_

{'the': 21,
 'curse': 5,
 'of': 12,
 'cheese': 4,
 'pyramid': 17,
 'hunt': 11,
 'for': 7,
 'golden': 9,
 'book': 2,
 'temple': 20,
 'ruby': 18,
 'fire': 6,
 'harry': 10,
 'potter': 15,
 'and': 0,
 'prisoner': 16,
 'azkaban': 1,
 'goblet': 8,
 'order': 13,
 'phoenix': 14,
 'chamber': 3,
 'secrets': 19}

In [80]:
# The same test title, now transformed with the re-fitted vocabulary.
count_vectorizer.transform(test_text).toarray()

array([[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1]])

In [81]:
# Four further titles to run through the already-fitted vectorizer.
text = [
    "A Fabulous School Adventure",
    "Diary of an Awesomely Friendly Kid",
    "The Hunt for the Hundredth Key",
    "Harry Potter and the Half-Blood Prince",
]

### Sparse matrix

The transformed matrix is actually stored as a sparse matrix.

In [82]:
# Transform the new titles with the current vocabulary (this rebinds transformed_vector).
transformed_vector = count_vectorizer.transform(text)

# Displaying the object itself, rather than calling toarray(), shows its sparse-matrix repr.
transformed_vector

<4x22 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [83]:
# print() on the sparse matrix lists one "(row, column)  count" line per stored entry.
print(transformed_vector)

  (1, 12)	1
  (2, 7)	1
  (2, 11)	1
  (2, 21)	2
  (3, 0)	1
  (3, 10)	1
  (3, 15)	1
  (3, 21)	1


In [84]:
# (number of titles, number of vocabulary words)
transformed_vector.shape

(4, 22)

In [85]:
# Dense view of the same matrix, for comparison with the sparse listing.
transformed_vector.toarray()

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1]])

In [86]:
# Read the whole excerpt into memory as one string.
# The encoding is given explicitly: without it, open() decodes using the platform's
# locale encoding, so the same file could load differently (or fail) on another machine.
with open("datasets/harrypotter.txt", "r", encoding="utf-8") as f:
    file_contents = f.read()

In [87]:
# Show the raw text that was read from the file.
print(file_contents)

Nearly ten years had passed since the Dursleys had woken up to find their nephew on the front step, but Privet Drive had hardly changed at all. The sun rose on the same tidy front gardens and lit up the brass number four on the Dursleys' front door; it crept into their living room, which was almost exactly the same as it had been on the night when Mr. Dursley had seen that fateful news report about the owls. Only the photographs on the mantelpiece really showed how much time had passed. Ten years ago, there had been lots of pictures of what looked like a large pink beach ball wearing different-colored bonnets - but Dudley Dursley was no longer a baby, and now the photographs showed a large blond boy riding his first bicycle, on a carousel at the fair, playing a computer game with his father, being hugged and kissed by his mother. The room held no sign at all that another boy lived in the house, too.


In [88]:
# Naive sentence split: cut the text at every period.
# NOTE(review): as the printed list shows, this also splits after the abbreviation "Mr",
# keeps each piece's leading space, and leaves a trailing empty string as the last element.
sentences = file_contents.split(".")

print(sentences)

['Nearly ten years had passed since the Dursleys had woken up to find their nephew on the front step, but Privet Drive had hardly changed at all', " The sun rose on the same tidy front gardens and lit up the brass number four on the Dursleys' front door; it crept into their living room, which was almost exactly the same as it had been on the night when Mr", ' Dursley had seen that fateful news report about the owls', ' Only the photographs on the mantelpiece really showed how much time had passed', ' Ten years ago, there had been lots of pictures of what looked like a large pink beach ball wearing different-colored bonnets - but Dudley Dursley was no longer a baby, and now the photographs showed a large blond boy riding his first bicycle, on a carousel at the fair, playing a computer game with his father, being hugged and kissed by his mother', ' The room held no sign at all that another boy lived in the house, too', '']


In [89]:
# fit_transform() learns a new vocabulary from the sentences and returns their count
# matrix in one call; the vocabulary previously learned from the titles is replaced.
transformed_vector = count_vectorizer.fit_transform(sentences)

# One row per element of `sentences` (including the trailing empty string), one column per word.
transformed_vector.shape

(7, 111)

In [90]:
# List the stored "(row, column)  count" entries of the sentence matrix.
print(transformed_vector)

  (0, 64)	1
  (0, 93)	1
  (0, 110)	1
  (0, 41)	3
  (0, 75)	1
  (0, 90)	1
  (0, 95)	2
  (0, 30)	1
  (0, 109)	1
  (0, 102)	1
  (0, 100)	1
  (0, 35)	1
  (0, 96)	1
  (0, 65)	1
  (0, 72)	1
  (0, 38)	1
  (0, 91)	1
  (0, 18)	1
  (0, 80)	1
  (0, 27)	1
  (0, 42)	1
  (0, 21)	1
  (0, 7)	1
  (0, 2)	1
  (1, 41)	1
  :	:
  (4, 32)	1
  (4, 79)	1
  (4, 23)	1
  (4, 39)	1
  (4, 108)	1
  (4, 34)	1
  (4, 12)	1
  (4, 47)	1
  (4, 51)	1
  (4, 19)	1
  (4, 61)	1
  (5, 95)	2
  (5, 7)	1
  (5, 2)	1
  (5, 84)	1
  (5, 94)	1
  (5, 68)	1
  (5, 16)	1
  (5, 43)	1
  (5, 89)	1
  (5, 5)	1
  (5, 55)	1
  (5, 48)	1
  (5, 45)	1
  (5, 101)	1


In [91]:
# Word-to-column-index mapping learned from the sentences.
print(count_vectorizer.vocabulary_)

{'nearly': 64, 'ten': 93, 'years': 110, 'had': 41, 'passed': 75, 'since': 90, 'the': 95, 'dursleys': 30, 'woken': 109, 'up': 102, 'to': 100, 'find': 35, 'their': 96, 'nephew': 65, 'on': 72, 'front': 38, 'step': 91, 'but': 18, 'privet': 80, 'drive': 27, 'hardly': 42, 'changed': 21, 'at': 7, 'all': 2, 'sun': 92, 'rose': 85, 'same': 86, 'tidy': 98, 'gardens': 40, 'and': 4, 'lit': 54, 'brass': 17, 'number': 70, 'four': 37, 'door': 26, 'it': 50, 'crept': 24, 'into': 49, 'living': 56, 'room': 84, 'which': 107, 'was': 103, 'almost': 3, 'exactly': 31, 'as': 6, 'been': 11, 'night': 67, 'when': 106, 'mr': 62, 'dursley': 29, 'seen': 87, 'that': 94, 'fateful': 33, 'news': 66, 'report': 82, 'about': 0, 'owls': 74, 'only': 73, 'photographs': 76, 'mantelpiece': 60, 'really': 81, 'showed': 88, 'how': 46, 'much': 63, 'time': 99, 'ago': 1, 'there': 97, 'lots': 59, 'of': 71, 'pictures': 77, 'what': 105, 'looked': 58, 'like': 53, 'large': 52, 'pink': 78, 'beach': 10, 'ball': 9, 'wearing': 104, 'different'

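### Recover the words present in each sentence

`inverse_transform` returns only the distinct vocabulary words found in each sentence. We have lost the ordering of the words (and how often each one occurred), so the original sentences cannot be rebuilt and the results are not well-formed sentences.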

In [92]:
# For each row of the matrix, return the vocabulary words that have a non-zero count.
count_vectorizer.inverse_transform(transformed_vector)

[array(['nearly', 'ten', 'years', 'had', 'passed', 'since', 'the',
        'dursleys', 'woken', 'up', 'to', 'find', 'their', 'nephew', 'on',
        'front', 'step', 'but', 'privet', 'drive', 'hardly', 'changed',
        'at', 'all'], dtype='<U11'),
 array(['had', 'the', 'dursleys', 'up', 'their', 'on', 'front', 'sun',
        'rose', 'same', 'tidy', 'gardens', 'and', 'lit', 'brass', 'number',
        'four', 'door', 'it', 'crept', 'into', 'living', 'room', 'which',
        'was', 'almost', 'exactly', 'as', 'been', 'night', 'when', 'mr'],
       dtype='<U11'),
 array(['had', 'the', 'dursley', 'seen', 'that', 'fateful', 'news',
        'report', 'about', 'owls'], dtype='<U11'),
 array(['had', 'passed', 'the', 'on', 'only', 'photographs', 'mantelpiece',
        'really', 'showed', 'how', 'much', 'time'], dtype='<U11'),
 array(['ten', 'years', 'had', 'the', 'on', 'but', 'at', 'and', 'was',
        'been', 'dursley', 'photographs', 'showed', 'ago', 'there', 'lots',
        'of', 'pictures'