In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# Example 1: Basic Bag of Words Representation
# Creating a simple corpus (list of text documents)
corpus1 = ["This is a sample sentence", "This is another example"]

# Initializing the CountVectorizer, which converts text into a BoW representation
vectorizer1 = CountVectorizer()

# Applying the vectorizer to the corpus and transforming the text into numerical form
X1 = vectorizer1.fit_transform(corpus1)


In [2]:
X1

<2x6 sparse matrix of type '<class 'numpy.int64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [3]:
# Printing the vocabulary (unique words in the corpus)
print("Example 1 - Vocabulary:", vectorizer1.get_feature_names_out())

# Printing the BoW matrix, where each row represents a sentence and each column a word count
print("Example 1 - BOW Representation:\n", X1.toarray())

Example 1 - Vocabulary: ['another' 'example' 'is' 'sample' 'sentence' 'this']
Example 1 - BOW Representation:
 [[0 0 1 1 1 1]
 [1 1 1 0 0 1]]


In [4]:
# Example 2: Handling Repeated Words
corpus2 = ["Machine is is learning is fun", "machine learning is a part a of is machine learning"]

# Initializing CountVectorizer for this corpus
vectorizer2 = CountVectorizer()

# Transforming the corpus into BoW format
X2 = vectorizer2.fit_transform(corpus2)

# Displaying the unique words from the corpus
print("\nExample 2 - Vocabulary:", vectorizer2.get_feature_names_out())

# Showing the word frequency matrix
print("Example 2 - BOW Representation:\n", X2.toarray())



Example 2 - Vocabulary: ['fun' 'is' 'learning' 'machine' 'of' 'part']
Example 2 - BOW Representation:
 [[1 3 1 1 0 0]
 [0 2 2 2 1 1]]


In [8]:
a=23
a1=33
a+a1

56

In [9]:
"23" + "33"

'2333'

In [4]:

# Example 3: Ignoring Stopwords
# Initializing CountVectorizer while removing common stopwords
vectorizer3 = CountVectorizer(stop_words='english')

# Transforming the corpus into BoW while ignoring stopwords
X3 = vectorizer3.fit_transform(corpus2)

# Displaying the vocabulary after removing stopwords
print("\nExample 3 - Vocabulary (Ignoring Stopwords):", vectorizer3.get_feature_names_out())

# Showing the word frequency matrix
print("Example 3 - BOW Representation:\n", X3.toarray())


Example 3 - Vocabulary (Ignoring Stopwords): ['deep' 'fun' 'learning' 'machine']
Example 3 - BOW Representation:
 [[0 1 1 1]
 [1 0 2 1]]


In [5]:
# Example 4: Applying N-grams (Bigrams)
# Using n-gram range of (2,2) to capture word pairs instead of individual words
vectorizer4 = CountVectorizer(ngram_range=(2,2))

# Transforming the corpus into BoW with bigrams
X4 = vectorizer4.fit_transform(corpus2)

# Displaying the vocabulary with bigrams
print("\nExample 4 - Vocabulary with Bigrams:", vectorizer4.get_feature_names_out())

# Showing the BoW matrix where columns now represent word pairs
print("Example 4 - BOW Representation:\n", X4.toarray())


Example 4 - Vocabulary with Bigrams: ['deep learning' 'is fun' 'is part' 'learning is' 'machine learning'
 'of machine' 'part of']
Example 4 - BOW Representation:
 [[0 1 0 1 1 0 0]
 [1 0 1 1 1 1 1]]


In [6]:
# Example 5: Custom Tokenization and Preprocessing
# Defining a simple preprocessing function to convert text to lowercase
def custom_preprocessor(text):
    return text.lower()

# Initializing CountVectorizer with a custom preprocessing function
vectorizer5 = CountVectorizer(preprocessor=custom_preprocessor)

# Transforming the corpus with custom preprocessing applied
X5 = vectorizer5.fit_transform(corpus2)

# Displaying the vocabulary after preprocessing
print("\nExample 5 - Vocabulary with Custom Preprocessing:", vectorizer5.get_feature_names_out())

# Showing the final BoW matrix
print("Example 5 - BOW Representation:\n", X5.toarray())



Example 5 - Vocabulary with Custom Preprocessing: ['deep' 'fun' 'is' 'learning' 'machine' 'of' 'part']
Example 5 - BOW Representation:
 [[0 1 1 1 1 0 0]
 [1 0 1 2 1 1 1]]


In [7]:
from sklearn.feature_extraction.text import CountVectorizer
import re
from nltk.corpus import stopwords
import nltk


In [8]:
# Ensure stopwords are downloaded
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A1\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Defining a preprocessing function for NLP expertise

def advanced_preprocessor(text):
    # Convert to lowercase
    text = text.lower()
    # Remove special characters, numbers, and punctuations
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize words
    words = text.split()
    # Remove stopwords
    words = [word for word in words if word not in stopwords.words('english')]
    return ' '.join(words)


In [10]:
# Creating a complex corpus
corpus6 = [
    "Natural Language Processing (NLP) is an exciting field of AI!",
    "Text preprocessing is a crucial step in any NLP pipeline.",
    "Tokenization, Stopword Removal, and Lemmatization improve model performance."
]

In [11]:
# Applying preprocessing to the corpus
preprocessed_corpus = [advanced_preprocessor(doc) for doc in corpus6]


In [12]:
# Initializing CountVectorizer with preprocessed text
vectorizer6 = CountVectorizer()

# Transforming text into Bag of Words format
X6 = vectorizer6.fit_transform(preprocessed_corpus)


In [13]:
# Displaying the vocabulary after advanced preprocessing
print("\nExample 6 - Vocabulary with Advanced Preprocessing:", vectorizer6.get_feature_names_out())



Example 6 - Vocabulary with Advanced Preprocessing: ['ai' 'crucial' 'exciting' 'field' 'improve' 'language' 'lemmatization'
 'model' 'natural' 'nlp' 'performance' 'pipeline' 'preprocessing'
 'processing' 'removal' 'step' 'stopword' 'text' 'tokenization']


In [14]:
# Showing the BoW representation after applying text cleaning and preprocessing
print("Example 6 - BOW Representation:\n", X6.toarray())


Example 6 - BOW Representation:
 [[1 0 1 1 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 1]]


In [3]:
import numpy  as np

In [5]:
a1 = np.array([[1,0,0],[0,0,1],[0,1,0]])
a1

array([[1, 0, 0],
       [0, 0, 1],
       [0, 1, 0]])

# Standard TF-IDF Representation

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus2)

print(vectorizer.get_feature_names_out())  
print(X.toarray())  


['deep' 'fun' 'is' 'learning' 'machine' 'of' 'part']
[[0.         0.46977774 0.66850146 0.33425073 0.46977774 0.
  0.        ]
 [0.684738   0.         0.24359836 0.48719673 0.         0.342369
  0.342369  ]]


# TF-IDF with Stopword Removal

In [12]:
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(corpus2)

print(vectorizer.get_feature_names_out())  
print(X.toarray())  


['deep' 'fun' 'learning' 'machine']
[[0.         0.6316672  0.44943642 0.6316672 ]
 [0.81480247 0.         0.57973867 0.        ]]


# 3. TF-IDF with N-Grams

In [3]:
vectorizer = TfidfVectorizer(ngram_range=(2, 3))
X = vectorizer.fit_transform(sentences)

print(vectorizer.get_feature_names_out())  
print(X.toarray())  


NameError: name 'sentences' is not defined

# Sublinear TF Scaling

In [4]:
vectorizer = TfidfVectorizer(sublinear_tf=True)
X = vectorizer.fit_transform(sentences)

print(vectorizer.get_feature_names_out())  
print(X.toarray())  


NameError: name 'sentences' is not defined

![image.png](attachment:2a8edae0-3163-4f32-b39f-582c008e9719.png)

In [7]:
import pandas as pd
import numpy as np
import re
import string
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [8]:
df = pd.read_csv("spam.csv", encoding='latin-1')  # Replace with actual dataset
df = df[['v1', 'v2']]  # Keeping only necessary columns
df.columns = ['label', 'message']

# Convert labels to numerical values
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# Display dataset information
print(df.head())
print(df['label'].value_counts())  # Check the distribution of spam and ham messages


FileNotFoundError: [Errno 2] No such file or directory: 'spam.csv'

In [9]:
def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['message_clean'] = df['message'].apply(preprocess_text)


NameError: name 'df' is not defined

In [10]:
vectorizer = TfidfVectorizer(max_features=5000)  # Use 5000 most important words
X = vectorizer.fit_transform(df['message_clean'])

y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Shape of Training Data: {X_train.shape}")
print(f"Shape of Test Data: {X_test.shape}")


NameError: name 'df' is not defined

In [11]:
model = MultinomialNB()
model.fit(X_train, y_train)


NameError: name 'X_train' is not defined

In [12]:
y_pred = model.predict(X_test)

# Print accuracy and classification report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Display Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix")
plt.show()


NameError: name 'X_test' is not defined

In [13]:
def predict_spam(message):
    message = preprocess_text(message)
    vectorized_message = vectorizer.transform([message])
    prediction = model.predict(vectorized_message)
    return "Spam" if prediction[0] == 1 else "Ham"

print(predict_spam("Congratulations! You have won a free iPhone. Claim now!"))
print(predict_spam("Hey, how are you? Let's meet up for coffee."))


NotFittedError: The TF-IDF vectorizer is not fitted