In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer 
from sklearn.feature_extraction.text import CountVectorizer 
 
# Toy corpus used throughout the notebook: five short sentences, one per document.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

# TfidfTransformer

In [2]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

# Learn the vocabulary from the corpus and count each term per document
# (the result is a sparse document-term matrix).
word_count_vector = cv.fit_transform(docs)


In [3]:
# (number of documents, number of distinct vocabulary terms)
word_count_vector.shape

(5, 16)

In [6]:
# Iterating over the sparse count matrix yields one single-row sparse matrix per document.
# Each printed entry reads "(row, column)  count"; because every iteration is its own
# one-row matrix, the printed row index is always 0 regardless of which document it is.
for i in word_count_vector:
    print(i)

  (0, 14)	1
  (0, 7)	1
  (0, 6)	1
  (0, 15)	1
  (0, 8)	1
  (0, 9)	1
  (0, 14)	2
  (0, 9)	1
  (0, 2)	1
  (0, 12)	1
  (0, 14)	2
  (0, 7)	1
  (0, 9)	1
  (0, 11)	1
  (0, 1)	1
  (0, 5)	1
  (0, 14)	2
  (0, 9)	1
  (0, 2)	1
  (0, 4)	1
  (0, 0)	1
  (0, 14)	2
  (0, 9)	1
  (0, 3)	1
  (0, 10)	1
  (0, 13)	1


In [7]:
# Vocabulary terms, in the column order of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() when it exists and only fall back on older releases.
if hasattr(cv, "get_feature_names_out"):
    # .tolist() converts the returned array to plain Python strings, so the printed
    # list looks the same as it did with the old API.
    feature_names = cv.get_feature_names_out().tolist()
else:
    feature_names = cv.get_feature_names()
print(feature_names)

# One label per document (DOC1, DOC2, ...), derived from the matrix so the labels
# always match the number of rows.
row_name = [f"DOC{doc_number}" for doc_number in range(1, word_count_vector.shape[0] + 1)]

# Wrap the sparse counts in a labelled DataFrame: rows are documents, columns are terms.
output = pd.DataFrame.sparse.from_spmatrix(
    word_count_vector, index=row_name, columns=feature_names
)
output

['ate', 'away', 'cat', 'end', 'finally', 'from', 'had', 'house', 'little', 'mouse', 'of', 'ran', 'saw', 'story', 'the', 'tiny']


Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
DOC1,0,0,0,0,0,0,1,1,1,1,0,0,0,0,1,1
DOC2,0,0,1,0,0,0,0,0,0,1,0,0,1,0,2,0
DOC3,0,1,0,0,0,1,0,1,0,1,0,1,0,0,2,0
DOC4,1,0,1,0,1,0,0,0,0,1,0,0,0,0,2,0
DOC5,0,0,0,1,0,0,0,0,0,1,1,0,0,1,2,0


### Compute the IDF values

In [8]:
# Learn the IDF weights from the raw term counts. fit() returns the estimator itself,
# so construction and fitting can be chained; the estimator is then displayed.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True).fit(word_count_vector)
tfidf_transformer

TfidfTransformer()

In [13]:
# One learned IDF weight per vocabulary term, in the column order of the count matrix.
# With smooth_idf=True the weight is ln((1 + n_docs) / (1 + doc_freq)) + 1, so a term
# that occurs in every document gets exactly 1.0 and rarer terms get larger weights.
tfidf_transformer.idf_

array([2.09861229, 2.09861229, 1.69314718, 2.09861229, 2.09861229,
       2.09861229, 2.09861229, 1.69314718, 2.09861229, 1.        ,
       2.09861229, 2.09861229, 2.09861229, 2.09861229, 1.        ,
       2.09861229])

### Convert the IDF values into a DataFrame

In [14]:
# Pair every vocabulary term with its learned IDF weight.
# feature_names (built from cv in the earlier count-matrix cell) is reused here rather
# than calling cv.get_feature_names() again, which was removed in scikit-learn 1.2.
df_idf = pd.DataFrame(
    tfidf_transformer.idf_, index=feature_names, columns=["idf_weights"]
)

# Sort ascending: the most common (least informative) terms come first.
df_idf.sort_values(by=["idf_weights"])


Unnamed: 0,idf_weights
mouse,1.0
the,1.0
cat,1.693147
house,1.693147
ate,2.098612
away,2.098612
end,2.098612
finally,2.098612
from,2.098612
had,2.098612


In [15]:
# Re-weight the raw counts with the fitted IDF values to obtain the tf-idf scores.
tf_idf_vector = tfidf_transformer.transform(word_count_vector)

# Same labelled layout as the count table: one row per document, one column per term.
output = pd.DataFrame.sparse.from_spmatrix(
    tf_idf_vector,
    index=row_name,
    columns=feature_names,
)
output

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
DOC1,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
DOC2,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
DOC3,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
DOC4,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
DOC5,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


# Using TfidfVectorizer

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer combines CountVectorizer and TfidfTransformer in a single estimator;
# any settings you would give to CountVectorizer are passed here instead.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)

# Fit on the raw documents and compute their tf-idf vectors in one call.
tfidf_vectorizer_vectors = tfidf_vectorizer.fit_transform(docs)

# Column labels must come from the vectorizer that produced this matrix, not from the
# separate CountVectorizer `cv`: the two only agree while both use identical settings.
# get_feature_names() was removed in scikit-learn 1.2, so prefer get_feature_names_out()
# and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    feature_names = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    feature_names = tfidf_vectorizer.get_feature_names()

# row_name (the DOC1..DOC5 labels) is reused from the earlier count-matrix cell.
output = pd.DataFrame.sparse.from_spmatrix(
    tfidf_vectorizer_vectors,
    index=row_name,
    columns=feature_names,
)
output

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
DOC1,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
DOC2,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
DOC3,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
DOC4,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
DOC5,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0
