# Parameters:

input: It specifies what is passed as each document: a filename, a file object, or the text content itself.
Attributes:

vocabulary_: It returns a dictionary of terms as keys and values as feature indices.
idf_: It returns the inverse document frequency (IDF) vector, with one value per vocabulary term, learned from the corpus passed to fit.
Returns:

fit_transform(): It learns the vocabulary and IDF from the corpus and returns the document-term matrix of tf-idf values as a sparse matrix.
get_feature_names_out(): It returns an array of feature names (it replaces get_feature_names(), which was removed in scikit-learn 1.2).

In [1]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
# Three toy documents: one multi-word phrase and two single-term documents.
d0, d1, d2 = (
    'Maharishi University of Information Technology',
    'TF',
    'IDF',
)

# The corpus is the ordered list of the documents above.
string = [d0, d1, d2]


In [3]:
# create the vectorizer with its default settings
tfidf = TfidfVectorizer()

# learn the vocabulary from the corpus and compute the tf-idf values
result = tfidf.fit_transform(string)
# a bare last expression makes the notebook display the matrix summary
result

<3x7 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [4]:
# Term -> column-index mapping learned during fitting.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Printing the sparse matrix lists its stored tf-idf entries.
print('\ntf-idf value:', result, sep='\n')

# Dense view of the same matrix.
print('\ntf-idf values in matrix form:', result.toarray(), sep='\n')



Word indexes:
{'maharishi': 2, 'university': 6, 'of': 3, 'information': 1, 'technology': 4, 'tf': 5, 'idf': 0}

tf-idf value:
  (0, 2)	0.4472135954999579
  (0, 6)	0.4472135954999579
  (0, 3)	0.4472135954999579
  (0, 1)	0.4472135954999579
  (0, 4)	0.4472135954999579
  (1, 5)	1.0
  (2, 0)	1.0

tf-idf values in matrix form:
[[0.        0.4472136 0.4472136 0.4472136 0.4472136 0.        0.4472136]
 [0.        0.        0.        0.        0.        1.        0.       ]
 [1.        0.        0.        0.        0.        0.        0.       ]]


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Small corpus: three short sentences with partially overlapping vocabulary.
documents = [
    "I love data science",
    "Data science is amazing",
    "I love coding in Python",
]

# Fit the vectorizer and build the document-term matrix in one pass.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)

# Column labels first, then the dense tf-idf weights.
print(vectorizer.get_feature_names_out(), tfidf_matrix.toarray(), sep='\n')


['amazing' 'coding' 'data' 'in' 'is' 'love' 'python' 'science']
[[0.         0.         0.57735027 0.         0.         0.57735027
  0.         0.57735027]
 [0.5628291  0.         0.42804604 0.         0.5628291  0.
  0.         0.42804604]
 [0.         0.52863461 0.         0.52863461 0.         0.40204024
  0.52863461 0.        ]]


In [7]:
import pandas as pd

# Label each tf-idf column with its term so the matrix reads as a table.
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
df_tfidf


Unnamed: 0,amazing,coding,data,in,is,love,python,science
0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.0,0.57735
1,0.562829,0.0,0.428046,0.0,0.562829,0.0,0.0,0.428046
2,0.0,0.528635,0.0,0.528635,0.0,0.40204,0.528635,0.0


In [16]:
from sklearn.preprocessing import normalize

# Row-wise L2 normalisation of the tf-idf matrix. The values printed here are
# the same as in the dense matrix shown earlier, i.e. the rows were already
# unit-length -- presumably because TfidfVectorizer applies norm='l2' by
# default (confirm against the scikit-learn documentation).
tfidf_norm = normalize(tfidf_matrix, norm='l2', axis=1)
print(tfidf_norm)


  (0, 5)	0.5773502691896257
  (0, 2)	0.5773502691896257
  (0, 7)	0.5773502691896257
  (1, 2)	0.4280460350631185
  (1, 7)	0.4280460350631185
  (1, 4)	0.5628290964997665
  (1, 0)	0.5628290964997665
  (2, 5)	0.4020402441612698
  (2, 1)	0.5286346066596935
  (2, 3)	0.5286346066596935
  (2, 6)	0.5286346066596935


# Spam Filtering using Regex

In [17]:
import re

# Lowercase keywords whose presence marks a message as spam.
spam_keywords = ["win", "free", "click", "subscribe", "offer"]

def is_spam(message):
    """Return True if `message` contains any spam keyword as a whole word.

    Matching is case-insensitive: the message is lowercased and compared
    against the (lowercase) entries of `spam_keywords`. Each keyword is
    anchored with regex word boundaries so it is not triggered by a longer
    word that merely contains it (e.g. "win" in "window", "free" in
    "freedom"), which plain substring matching would flag.

    Parameters:
        message (str): The text to inspect.

    Returns:
        bool: True when at least one keyword occurs in the message,
        False otherwise (including when `spam_keywords` is empty).
    """
    text = message.lower()
    return any(
        # re.escape keeps keywords containing regex metacharacters literal.
        re.search(r"\b" + re.escape(word) + r"\b", text)
        for word in spam_keywords
    )

print(is_spam("Win a free iPhone!"))  # Output: True
print(is_spam("Meeting at 10 AM"))  # Output: False


True
False


# TF-IDF + Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy training set; labels are aligned with messages (1 = spam, 0 = ham).
messages = [
    "Win a free iPhone now",
    "Limited time offer!",
    "Meeting at 10 AM",
    "Let's catch up soon",
]
labels = [1, 1, 0, 0]

# Learn the vocabulary and turn the training messages into tf-idf features.
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(messages)

# fit() returns the estimator itself, so construction and training chain.
clf = LogisticRegression().fit(X_tfidf, labels)

# Classify an unseen message; it goes through the already-fitted vectorizer.
print(clf.predict(vectorizer.transform(["Limited offer now!"])))


[1]


# Here, tf-idf values are computed from a corpus in which every document is a single, unique term.

In [8]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# Four documents that share no terms: each consists of one distinct token.
d0, d1, d2, d3 = 'Student1', 'Student2', 'Student3', 'Student4'

# Corpus in document order.
string = [d0, d1, d2, d3]

# Fit a default vectorizer and compute the tf-idf matrix for the corpus.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> column-index mapping.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Stored tf-idf entries of the sparse matrix.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'student1': 0, 'student2': 1, 'student3': 2, 'student4': 3}

tf-idf values:
  (0, 0)	1.0
  (1, 1)	1.0
  (2, 2)	1.0
  (3, 3)	1.0


# In this program, tf-idf values are computed from a corpus of identical documents.

In [7]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# Two documents with exactly the same text.
d0 = d1 = 'Ganpati Bappa Morya!'

# Corpus in document order.
string = [d0, d1]

# Fit a default vectorizer and compute the tf-idf matrix for the corpus.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> column-index mapping.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Stored tf-idf entries of the sparse matrix.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'ganpati': 1, 'bappa': 0, 'morya': 2}

tf-idf values:
  (0, 1)	0.5773502691896258
  (0, 0)	0.5773502691896258
  (0, 2)	0.5773502691896258
  (1, 1)	0.5773502691896258
  (1, 0)	0.5773502691896258
  (1, 2)	0.5773502691896258


# Below is a program that computes tf-idf values when the same sentence is repeated in every document of the corpus.

In [6]:
# import required module
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus: the same sentence repeated five times.
string = 5 * ['Bhai Thoda Padh le.']

# Fit a default vectorizer and compute the tf-idf matrix for the corpus.
tfidf = TfidfVectorizer()
result = tfidf.fit_transform(string)

# Term -> column-index mapping.
print('\nWord indexes:', tfidf.vocabulary_, sep='\n')

# Stored tf-idf entries of the sparse matrix.
print('\ntf-idf values:', result, sep='\n')



Word indexes:
{'bhai': 0, 'thoda': 3, 'padh': 2, 'le': 1}

tf-idf values:
  (0, 0)	0.5
  (0, 3)	0.5
  (0, 2)	0.5
  (0, 1)	0.5
  (1, 0)	0.5
  (1, 3)	0.5
  (1, 2)	0.5
  (1, 1)	0.5
  (2, 0)	0.5
  (2, 3)	0.5
  (2, 2)	0.5
  (2, 1)	0.5
  (3, 0)	0.5
  (3, 3)	0.5
  (3, 2)	0.5
  (3, 1)	0.5
  (4, 0)	0.5
  (4, 3)	0.5
  (4, 2)	0.5
  (4, 1)	0.5
