In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Toy corpus: six book titles, each treated as a separate document.
train_text = [
    "The Curse of the Cheese Pyramid. ",
    "The Hunt for the Golden Book. ",
    "The Temple of the Ruby of Fire. ",
    "Harry Potter and the Prisoner of Azkaban. ",
    "Harry Potter and the Goblet of Fire. ",
    "Harry Potter and the Order of the Phoenix. ",
]

### Generate the term frequency matrix

In [17]:
# Learn the vocabulary from train_text and count each token per document.
count_vectorizer = CountVectorizer()

# One row per document, one column per vocabulary term.
frequency_term_matrix = count_vectorizer.fit_transform(train_text)

In [24]:
# Mapping from each learned token to its column index in the term matrix.
count_vectorizer.vocabulary_

{'the': 19,
 'curse': 4,
 'of': 11,
 'cheese': 3,
 'pyramid': 16,
 'hunt': 10,
 'for': 6,
 'golden': 8,
 'book': 2,
 'temple': 18,
 'ruby': 17,
 'fire': 5,
 'harry': 9,
 'potter': 14,
 'and': 0,
 'prisoner': 15,
 'azkaban': 1,
 'goblet': 7,
 'order': 12,
 'phoenix': 13}

In [25]:
# (number of documents, vocabulary size)
frequency_term_matrix.shape

(6, 20)

In [26]:
# Dense view of the counts: entry [i, j] is how often term j occurs in document i.
frequency_term_matrix.toarray()

array([[0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 2],
       [0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 1, 1, 2],
       [1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 2]])

### The TfidfTransformer

Converts a term-frequency matrix into its TF-IDF representation.

In [36]:
# Re-weight the raw counts produced by CountVectorizer into TF-IDF scores.
tfidf_transformer = TfidfTransformer()

tfidf_vector1 = tfidf_transformer.fit_transform(frequency_term_matrix)

In [37]:
# Same shape as the count matrix: only the values change, not the layout.
tfidf_vector1.shape

(6, 20)

### Term Frequency, Inverse Document Frequency

- Upweight words that occur more often within a single document
- Downweight words that are common across the whole document corpus

In [38]:
# Dense view of the TF-IDF scores (documents x vocabulary terms).
tfidf_vector1.toarray()

array([[0.        , 0.        , 0.        , 0.49686319, 0.49686319,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.25455629, 0.        , 0.        , 0.        ,
        0.        , 0.49686319, 0.        , 0.        , 0.44111449],
       [0.        , 0.        , 0.45699818, 0.        , 0.        ,
        0.        , 0.45699818, 0.        , 0.45699818, 0.        ,
        0.45699818, 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.40572238],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.3861072 , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.48246241, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.47085422, 0.47085422, 0.41802376],
       [0.35068227, 0.5065376 , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.35068227,
        0.        , 0.25951275, 0.        , 0

In [39]:
# TfidfVectorizer goes from raw text straight to TF-IDF scores in one step.
tfidf_vectorizer = TfidfVectorizer()

tfidf_vector2 = tfidf_vectorizer.fit_transform(train_text)

# Token -> column index mapping learned from train_text.
tfidf_vectorizer.vocabulary_

{'the': 19,
 'curse': 4,
 'of': 11,
 'cheese': 3,
 'pyramid': 16,
 'hunt': 10,
 'for': 6,
 'golden': 8,
 'book': 2,
 'temple': 18,
 'ruby': 17,
 'fire': 5,
 'harry': 9,
 'potter': 14,
 'and': 0,
 'prisoner': 15,
 'azkaban': 1,
 'goblet': 7,
 'order': 12,
 'phoenix': 13}

In [31]:
# (number of documents, vocabulary size)
tfidf_vector2.shape

(6, 20)

In [32]:
# Learned IDF weight for each vocabulary column, in column-index order.
tfidf_vectorizer.idf_

array([1.55961579, 2.25276297, 2.25276297, 2.25276297, 2.25276297,
       1.84729786, 2.25276297, 2.25276297, 2.25276297, 1.55961579,
       2.25276297, 1.15415068, 2.25276297, 2.25276297, 1.55961579,
       2.25276297, 2.25276297, 2.25276297, 2.25276297, 1.        ])

### Mapping words and IDF scores

The IDF scores of more common words are lower than those of less common words.

In [40]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() returns the same terms ordered by column index,
# so they pair one-to-one with the entries of idf_.
dict(zip(tfidf_vectorizer.get_feature_names_out(), tfidf_vectorizer.idf_))

{'and': 1.5596157879354227,
 'azkaban': 2.252762968495368,
 'book': 2.252762968495368,
 'cheese': 2.252762968495368,
 'curse': 2.252762968495368,
 'fire': 1.8472978603872037,
 'for': 2.252762968495368,
 'goblet': 2.252762968495368,
 'golden': 2.252762968495368,
 'harry': 1.5596157879354227,
 'hunt': 2.252762968495368,
 'of': 1.1541506798272583,
 'order': 2.252762968495368,
 'phoenix': 2.252762968495368,
 'potter': 1.5596157879354227,
 'prisoner': 2.252762968495368,
 'pyramid': 2.252762968495368,
 'ruby': 2.252762968495368,
 'temple': 2.252762968495368,
 'the': 1.0}

In [41]:
# print() on the matrix lists each stored entry as "(row, column)  value".
print(tfidf_vector2)

  (0, 16)	0.4968631937710093
  (0, 3)	0.4968631937710093
  (0, 11)	0.2545562941559567
  (0, 4)	0.4968631937710093
  (0, 19)	0.44111448982390433
  (1, 2)	0.4569981804131008
  (1, 8)	0.4569981804131008
  (1, 6)	0.4569981804131008
  (1, 10)	0.4569981804131008
  (1, 19)	0.40572238340577144
  (2, 5)	0.3861071975850641
  (2, 17)	0.4708542218561549
  (2, 18)	0.4708542218561549
  (2, 11)	0.48246240537039725
  (2, 19)	0.41802375877178133
  (3, 1)	0.5065375978836059
  (3, 15)	0.5065375978836059
  (3, 0)	0.3506822714552184
  (3, 14)	0.3506822714552184
  (3, 9)	0.3506822714552184
  (3, 11)	0.2595127499569568
  (3, 19)	0.2248517065343652
  (4, 7)	0.5292681154550154
  (4, 0)	0.3664188911387316
  (4, 14)	0.3664188911387316
  (4, 9)	0.3664188911387316
  (4, 5)	0.43400742595846153
  (4, 11)	0.27115820164217774
  (4, 19)	0.23494176833371702
  (5, 13)	0.47200515447423746
  (5, 12)	0.47200515447423746
  (5, 0)	0.32677503190519636
  (5, 14)	0.32677503190519636
  (5, 9)	0.32677503190519636
  (5, 11)	0.24182

In [42]:
# Same listing for the CountVectorizer + TfidfTransformer result, for comparison
# with tfidf_vector2.
print(tfidf_vector1)

  (0, 19)	0.44111448982390433
  (0, 16)	0.4968631937710093
  (0, 11)	0.2545562941559567
  (0, 4)	0.4968631937710093
  (0, 3)	0.4968631937710093
  (1, 19)	0.40572238340577144
  (1, 10)	0.4569981804131008
  (1, 8)	0.4569981804131008
  (1, 6)	0.4569981804131008
  (1, 2)	0.4569981804131008
  (2, 19)	0.41802375877178133
  (2, 18)	0.4708542218561549
  (2, 17)	0.4708542218561549
  (2, 11)	0.48246240537039725
  (2, 5)	0.3861071975850641
  (3, 19)	0.2248517065343652
  (3, 15)	0.5065375978836059
  (3, 14)	0.3506822714552184
  (3, 11)	0.2595127499569568
  (3, 9)	0.3506822714552184
  (3, 1)	0.5065375978836059
  (3, 0)	0.3506822714552184
  (4, 19)	0.23494176833371708
  (4, 14)	0.36641889113873166
  (4, 11)	0.2711582016421778
  (4, 9)	0.36641889113873166
  (4, 7)	0.5292681154550155
  (4, 5)	0.43400742595846165
  (4, 0)	0.36641889113873166
  (5, 19)	0.4190455552361038
  (5, 14)	0.32677503190519636
  (5, 13)	0.47200515447423746
  (5, 12)	0.47200515447423746
  (5, 11)	0.24182085622717006
  (5, 9)	0.326