In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Two unfitted estimators used throughout the notebook:
#   vectorizer - goes from raw text to tf-idf weights in one step
#   cv         - produces raw term counts (later fed to TfidfTransformer)
vectorizer = TfidfVectorizer()
cv = CountVectorizer()

In [2]:
# Toy corpus: three short documents, one string per document.
# (The spelling 'algortithms' is kept as-is; the outputs below depend on it.)
texts = [
    'NLP is an interesting area of NLP work and NLP is getting popular every day for classification',
    'New algortithms are being build day by day',
    'I am working on accuracy of classification',
]

In [3]:
# Tokenize the corpus and learn the vocabulary and the per-term IDF weights.
# fit() only stores what it learned on `vectorizer`; no document is encoded yet.
vectorizer.fit(texts)


TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [4]:
# Show the learned vocabulary: a dict mapping each (lower-cased) token to its
# column index in the encoded vectors.
print(vectorizer.vocabulary_)


{'nlp': 18, 'is': 16, 'an': 3, 'interesting': 15, 'area': 6, 'of': 19, 'work': 22, 'and': 4, 'getting': 14, 'popular': 21, 'every': 12, 'day': 11, 'for': 13, 'classification': 10, 'new': 17, 'algortithms': 1, 'are': 5, 'being': 7, 'build': 8, 'by': 9, 'am': 2, 'working': 23, 'on': 20, 'accuracy': 0}


In [5]:
# Vocabulary size, i.e. the number of distinct tokens found in `texts`.
len(vectorizer.vocabulary_)

24

In [6]:
# Inverse document frequency of every vocabulary term, ordered by column index.
# Most terms occur in a single document and share the highest weight (~1.69);
# 'classification', 'day' and 'of' (columns 10, 11 and 19) each occur in two
# documents and therefore get a lower weight (~1.29).
print(vectorizer.idf_)


[1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.28768207 1.28768207
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718
 1.69314718 1.28768207 1.69314718 1.69314718 1.69314718 1.69314718]


In [7]:
# Encode the first document with the fitted vectorizer. transform() takes an
# iterable of documents, hence the one-element list; the result is a sparse matrix.
vector = vectorizer.transform([texts[0]])
vector

<1x24 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [8]:
# Shape of the encoded result: one row (the document) by one column per vocabulary term.
print(vector.shape)


(1, 24)


In [9]:
# Densify the sparse row to inspect the weights. With the default norm='l2'
# each row is scaled to unit length, so every tf-idf value lies between 0 and 1.
print(vector.toarray())

[[0.         0.         0.         0.20525964 0.20525964 0.
  0.20525964 0.         0.         0.         0.15610525 0.15610525
  0.20525964 0.20525964 0.20525964 0.20525964 0.41051928 0.
  0.61577893 0.15610525 0.         0.20525964 0.20525964 0.        ]]


In [10]:
# Word counting, fit and transform can also be combined into a single step that
# goes from raw text straight to tf-idf weights. Note that TfidfTransformer does
# not allow that: it has to be given word counts (e.g. from CountVectorizer).
vector2 = vectorizer.fit_transform(texts)

In [11]:
# First row of the combined result; it matches the `vector` obtained from
# transform() on texts[0].
print(vector2[0].toarray())

[[0.         0.         0.         0.20525964 0.20525964 0.
  0.20525964 0.         0.         0.         0.15610525 0.15610525
  0.20525964 0.20525964 0.20525964 0.20525964 0.41051928 0.
  0.61577893 0.15610525 0.         0.20525964 0.20525964 0.        ]]


In [12]:
from sklearn.feature_extraction.text import TfidfTransformer

# Step 1: build the document-term matrix of raw word counts.
word_count_vector = cv.fit_transform(texts)

# Step 2: learn the IDF weights from those counts. Unlike TfidfVectorizer,
# TfidfTransformer works on a count matrix rather than on raw text.
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(word_count_vector)

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# fall back to the old name so the cell still runs on older releases.
if hasattr(cv, "get_feature_names_out"):
    idf_terms = cv.get_feature_names_out()
else:
    idf_terms = cv.get_feature_names()

# One row per term holding its IDF weight.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=idf_terms, columns=["idf_weights"])

# Display sorted ascending: terms that occur in more documents come first.
df_idf.sort_values(by=['idf_weights'])

Unnamed: 0,idf_weights
day,1.287682
of,1.287682
classification,1.287682
popular,1.693147
on,1.693147
nlp,1.693147
new,1.693147
is,1.693147
interesting,1.693147
getting,1.693147


In [13]:
# Compute the tf-idf scores of a document with the CountVectorizer +
# TfidfTransformer pair fitted above.

# Raw term counts for every document.
count_vector = cv.transform(texts)

# Re-weight the counts with the learned IDF values.
tf_idf_vector = tfidf_transformer.transform(count_vector)

# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# fall back to the old name so the cell still runs on older releases.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# tf-idf row of the first document (still sparse).
first_document_vector = tf_idf_vector[0]

# One row per term, highest score first. toarray() yields a plain ndarray
# column instead of the np.matrix that todense() returns.
df = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])
df.sort_values(by=["tfidf"], ascending=False)


Unnamed: 0,tfidf
nlp,0.615779
is,0.410519
every,0.20526
work,0.20526
an,0.20526
and,0.20526
popular,0.20526
area,0.20526
for,0.20526
getting,0.20526


References: 
https://kavita-ganesan.com/tfidftransformer-tfidfvectorizer-usage-differences/#.XzmFsehKiUk

https://machinelearningmastery.com/prepare-text-data-machine-learning-scikit-learn/

https://en.wikipedia.org/wiki/Tf%E2%80%93idf
        