In [16]:
# TASK-2: Text classification model using Naive Bayes

In [17]:
#Setup and Libraries
!pip install pandas scikit-learn nltk



In [4]:
import pandas as pd
import zipfile
import requests
import os

# Downloading the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
zip_path = 'smsspamcollection.zip'
extract_dir = 'smsspamcollection'

# Only hit the network when no valid archive is on disk yet; is_zipfile() is
# False both for a missing file and for a corrupt/partial earlier download.
if not zipfile.is_zipfile(zip_path):
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as the ZIP
    response.raise_for_status()

    # Saving the ZIP file
    with open(zip_path, 'wb') as f:
        f.write(response.content)

# Extracting the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

# Loading the dataset (tab-separated, no header row: label <TAB> message)
df = pd.read_csv(os.path.join(extract_dir, 'SMSSpamCollection'), sep='\t', names=['label', 'text'])

# Display the first few rows from the dataset
print(df.head())


  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [15]:
#Preprocess the Text Data
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Downloading NLTK resources: the stopword list plus the 'punkt' tokenizer
# models that word_tokenize() needs when preprocess() is applied in this cell
for resource in ('stopwords', 'punkt'):
    nltk.download(resource)
# NOTE(review): recent NLTK releases load the tokenizer from 'punkt_tab'
# instead; add it to the tuple above if word_tokenize raises LookupError.
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

# Preprocessing function
from functools import lru_cache


@lru_cache(maxsize=None)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def preprocess(text):
    """Normalise one message for bag-of-words vectorisation.

    The message is lower-cased and tokenised with NLTK's ``word_tokenize``.
    Tokens that are not purely alphabetic (``str.isalpha``) or that appear in
    the English stopword list are dropped.

    Args:
        text: Raw message text (a ``str``).

    Returns:
        The surviving tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Look tokens up in a cached set: calling stopwords.words() inside the
    # filter would reload the list and scan it linearly for every token.
    stop_words = _english_stopwords()
    tokens = word_tokenize(text.lower())
    return ' '.join(word for word in tokens if word.isalpha() and word not in stop_words)

# Apply preprocessing to the text column
df['text'] = df['text'].apply(preprocess)
print(df['text'].head())



[nltk_data] Downloading package stopwords to C:\Users\Vaishnavi
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts may...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: text, dtype: object


In [8]:
#Train-Test Split
#Split the dataset into training and testing sets.
import nltk

# Fetch the NLTK resources (tokenizer models, then the stopword list)
for resource_name in ('punkt', 'stopwords'):
    nltk.download(resource_name)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Preprocessing is deliberately not repeated here: the "Preprocess the Text
# Data" cell above already defines preprocess() and has cleaned df['text'].
# A second definition would shadow the first, and a second apply() would only
# re-tokenise text that is already cleaned.
print(df['text'].head())

# Splitting the dataset: 80% train / 20% test, fixed seed so the split is
# reproducible (train_test_split is already imported at the top of this cell)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)



[nltk_data] Downloading package punkt to C:\Users\Vaishnavi
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\Vaishnavi
[nltk_data]     Yadav\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


0    go jurong point crazy available bugis n great ...
1                              ok lar joking wif u oni
2    free entry wkly comp win fa cup final tkts may...
3                  u dun say early hor u c already say
4               nah think goes usf lives around though
Name: text, dtype: object


In [10]:
#Vectorization
#Converting the text data into a numerical format using Count Vectorization.
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training messages only, then encode both
# splits with that same fitted vocabulary.
vectorizer = CountVectorizer().fit(X_train)
X_train_vec = vectorizer.transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [12]:
#Training the Naive Bayes Model
#Fit a Multinomial Naive Bayes classifier on the vectorized training set.

model = MultinomialNB()
model.fit(X=X_train_vec, y=y_train)

In [14]:
#Predictions and Evaluation
#Make predictions on the test set and evaluate the model.
# Predictions
y_pred = model.predict(X_test_vec)

# Evaluation Metrics
# The labels are the strings 'ham'/'spam', so the positive class has to be
# named explicitly: the default pos_label=1 is not one of the labels and makes
# precision_score / recall_score raise ValueError.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, pos_label='spam')
recall = recall_score(y_test, y_pred, pos_label='spam')

print(f'Accuracy: {accuracy:.2f}')
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

Accuracy: 0.98
Precision: 0.93
Recall: 0.93


In [None]:
#Brief Analysis Report for SMS Spam Detection Model

#Dataset Overview:
#Source: UCI Machine Learning Repository
#Size: 5,572 messages in these counts (4,825 ham + 747 spam); the UCI description lists 5,574 in total.

#Data Preprocessing:

#Text Normalization: The text was converted to lowercase to ensure uniformity.
#Tokenization: Messages were split into individual words (tokens) using NLTK's word tokenizer.
#Stopword Removal: Common English stopwords (e.g., "and", "the") and tokens that are not purely alphabetic (numbers, punctuation) were removed to focus on more meaningful words.
#Vectorization: The preprocessed text was converted into numerical format using Count Vectorization, creating a feature matrix for model training.

#Model Training:

#Algorithm: Multinomial Naive Bayes.
#Multinomial Naive Bayes was chosen for its effectiveness in text classification tasks, particularly for handling discrete data such as word counts.
#Training/Test Split: 80% training, 20% testing.

#Performance Evaluation:

#Accuracy: 0.98 (98%), the proportion of correctly classified messages out of all test messages.
#Precision: 0.93 (93%), which reflects the model's ability to correctly identify spam messages out of all messages classified as spam.
#Recall: 0.93 (93%), which measures the model's ability to identify all actual spam messages. High recall is essential to ensure that as many spam messages as possible are caught.