# Parameters:

input: It specifies how the documents passed in are interpreted: 'filename', 'file', or 'content' (the raw text itself, which is the default).
Attributes:

vocabulary_: It returns a dictionary of terms as keys and values as feature indices.
idf_: It holds the inverse document frequency vector learned from the documents the vectorizer was fitted on.
Returns:

fit_transform(): It learns the vocabulary and idf from the documents and returns the document-term matrix of tf-idf values (a sparse matrix).
get_feature_names_out(): It returns the feature names (this replaces get_feature_names(), which was deprecated in scikit-learn 1.0 and removed in 1.2).

In [15]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer


In [16]:
# Three tiny documents: one with three words, two with a single word each.
d0, d1, d2 = 'IMS Pro School', 'TF', 'IDF'

# The corpus handed to the vectorizer is just the list of those documents.
string = [d0, d1, d2]


In [17]:
# create the vectorizer object with its default settings
tfidf = TfidfVectorizer()

# fit the vectorizer on the corpus and get the tf-idf values;
# `tfidf` and `result` are both read by the cells that follow
result = tfidf.fit_transform(string)


In [18]:
# get idf values: pair every vocabulary term with its idf weight.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2 (calling it there raises
# AttributeError).
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    # ele1 is the term, ele2 is its inverse document frequency
    print(ele1, ':', ele2)



idf values:
idf : 1.6931471805599454
ims : 1.6931471805599454
pro : 1.6931471805599454
school : 1.6931471805599454
tf : 1.6931471805599454


In [19]:
# Each heading and its value are emitted by a single print call; sep='\n'
# puts the value on the line after the heading, exactly as two prints would.

# term -> feature index mapping
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# tf-idf values as stored in the fitted result
print('\ntf-idf value:', result, sep='\n')

# the same values expanded into a dense matrix
print('\ntf-idf values in matrix form:', result.toarray(), sep='\n')



Word indexes:
{'ims': 1, 'pro': 2, 'school': 3, 'tf': 4, 'idf': 0}

tf-idf value:
  (0, 3)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (0, 1)	0.5773502691896257
  (1, 4)	1.0
  (2, 0)	1.0

tf-idf values in matrix form:
[[0.         0.57735027 0.57735027 0.57735027 0.        ]
 [0.         0.         0.         0.         1.        ]
 [1.         0.         0.         0.         0.        ]]


In [20]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# assign documents: one three-word document and two single-word documents
d0 = 'IMS Pro School'
d1 = 'Andheri'
d2 = 'East'

# merge documents into a single corpus
string = [d0, d1, d2]

# create the vectorizer object with its default settings
tfidf = TfidfVectorizer()

# fit the vectorizer on the corpus and get the tf-idf values
result = tfidf.fit_transform(string)

# get idf values: pair every vocabulary term with its idf weight.
# get_feature_names_out() replaces get_feature_names(), which was deprecated
# in scikit-learn 1.0 and removed in 1.2 (calling it there raises
# AttributeError).
print('\nidf values:')
for ele1, ele2 in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    # ele1 is the term, ele2 is its inverse document frequency
    print(ele1, ':', ele2)

# get indexing (term -> feature index)
print('\nWord indexes:')
print(tfidf.vocabulary_)

# display tf-idf values as stored in the fitted result
print('\ntf-idf value:')
print(result)

# in matrix form (dense array, one row per document)
print('\ntf-idf values in matrix form:')
print(result.toarray())



idf values:
andheri : 1.6931471805599454
east : 1.6931471805599454
ims : 1.6931471805599454
pro : 1.6931471805599454
school : 1.6931471805599454

Word indexes:
{'ims': 2, 'pro': 3, 'school': 4, 'andheri': 0, 'east': 1}

tf-idf value:
  (0, 4)	0.5773502691896257
  (0, 3)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (1, 0)	1.0
  (2, 1)	1.0

tf-idf values in matrix form:
[[0.         0.         0.57735027 0.57735027 0.57735027]
 [1.         0.         0.         0.         0.        ]
 [0.         1.         0.         0.         0.        ]]


# Here, tf-idf values are computed from a corpus in which every document consists of a single, unique term.

In [21]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# Four one-word documents; no term is shared between any two of them.
d0, d1, d2, d3 = 'Student1', 'Student2', 'Student3', 'Student4'

# The corpus is the list of those documents, in order.
string = [d0, d1, d2, d3]

# Build a default vectorizer, fit it on the corpus and keep the
# resulting tf-idf values.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> feature index mapping (heading and value in one print call;
# sep='\n' keeps them on separate lines).
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# tf-idf values as stored in the fitted result.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'student1': 0, 'student2': 1, 'student3': 2, 'student4': 3}

tf-idf values:
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0


# In this program, tf-idf values are computed from a corpus of identical documents.

In [22]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# Two documents with exactly the same text.
d0 = d1 = 'Ganpati Bappa Morya!'

# The corpus is the list of those documents.
string = [d0, d1]

# Build a default vectorizer, fit it on the corpus and keep the
# resulting tf-idf values.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> feature index mapping (heading and value in one print call;
# sep='\n' keeps them on separate lines).
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# tf-idf values as stored in the fitted result.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'ganpati': 1, 'bappa': 0, 'morya': 2}

tf-idf values:
  (0, 2)	0.5773502691896258
  (0, 0)	0.5773502691896258
  (0, 1)	0.5773502691896258
  (1, 2)	0.5773502691896258
  (1, 0)	0.5773502691896258
  (1, 1)	0.5773502691896258


#  Below is a program in which we calculate the tf-idf values for a corpus where the same sentence is repeated across multiple documents.

In [23]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# The corpus is the same sentence repeated five times.
string = 5 * ['Bhai Thoda Padh le.']

# Build a default vectorizer, fit it on the corpus and keep the
# resulting tf-idf values.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> feature index mapping (heading and value in one print call;
# sep='\n' keeps them on separate lines).
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# tf-idf values as stored in the fitted result.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'bhai': 0, 'thoda': 3, 'padh': 2, 'le': 1}

tf-idf values:
  (0, 1)	0.5
  (0, 2)	0.5
  (0, 3)	0.5
  (0, 0)	0.5
  (1, 1)	0.5
  (1, 2)	0.5
  (1, 3)	0.5
  (1, 0)	0.5
  (2, 1)	0.5
  (2, 2)	0.5
  (2, 3)	0.5
  (2, 0)	0.5
  (3, 1)	0.5
  (3, 2)	0.5
  (3, 3)	0.5
  (3, 0)	0.5
  (4, 1)	0.5
  (4, 2)	0.5
  (4, 3)	0.5
  (4, 0)	0.5
