## **SMS_SPAM_TASK**

## **Import the required libraries**

In [None]:
import pandas as pd
import numpy as np
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns


In [None]:
data = pd.read_csv('/spam.csv', encoding='iso-8859-1')
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


# Data preprocessing

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
columns_to_drop = ["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"]
data.drop(columns=columns_to_drop, inplace=True)

In [None]:
data

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [None]:
new_column_names = {"v1":"Category","v2":"Message"}
data.rename(columns = new_column_names,inplace = True)

In [None]:
data[data.duplicated()]

Unnamed: 0,Category,Message,Unnamed: 2,Unnamed: 3,Unnamed: 4
102,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
153,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
206,ham,"As I entered my cabin my PA said, '' Happy B'd...",,,
222,ham,"Sorry, I'll call later",,,
325,ham,No calls..messages..missed calls,,,
...,...,...,...,...,...
5524,spam,You are awarded a SiPix Digital Camera! call 0...,,,
5535,ham,"I know you are thinkin malaria. But relax, chi...",,,
5539,ham,Just sleeping..and surfing,,,
5553,ham,Hahaha..use your brain dear,,,


In [None]:
print(data.columns)


Index(['Category', 'Message', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
data['Category'].value_counts()


ham     4825
spam     747
Name: Category, dtype: int64

In [None]:

# Split the dataset into features (X) and labels (y)
X = data['Category']
y = data['Message']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
X

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: Category, Length: 5572, dtype: object

In [None]:
y

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                Will Ì_ b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: Message, Length: 5572, dtype: object

In [None]:
print(X.shape)
print(X_train.shape)
print(X_test.shape)

(5572,)
(4457,)
(1115,)


## Model Training:

In [None]:
feature_extraction = TfidfVectorizer(min_df=1, stop_words="english", lowercase=True)

In [None]:
# Initialize and fit the TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)


In [None]:
lr_classifier = LogisticRegression()
lr_classifier.fit(X_train_tfidf, y_train)


In [None]:
svm_classifier = SVC(kernel='linear')
svm_classifier.fit(X_train_tfidf, y_train)


## Model Evaluation:

In [None]:
# Function to evaluate and print the classifier's performance
def evaluate_classifier(classifier, X_test, y_test):
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Evaluate Naive Bayes Classifier
print("Naive Bayes Classifier:")
evaluate_classifier(nb_classifier, X_test_tfidf, y_test)

# Evaluate Logistic Regression Classifier
print("Logistic Regression Classifier:")
evaluate_classifier(lr_classifier, X_test_tfidf, y_test)

# Evaluate Support Vector Machine Classifier
print("Support Vector Machine Classifier:")
evaluate_classifier(svm_classifier, X_test_tfidf, y_test)


Naive Bayes Classifier:
Accuracy: 0.007174887892376682


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 precision    recall  f1-score   support

                      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))



Classification Report:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 precision    recall  f1-score   support

                      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Prediction:

In [None]:
new_sms = ["Congratulations! You've won a free vacation to an exotic island. Just click on the link below to claim your prize."]
new_sms_tfidf = tfidf_vectorizer.transform(new_sms)

# Predict with Naive Bayes
nb_predictions = nb_classifier.predict(new_sms_tfidf)

# Predict with Logistic Regression
lr_predictions = lr_classifier.predict(new_sms_tfidf)

# Predict with SVM
svm_predictions = svm_classifier.predict(new_sms_tfidf)

print("Naive Bayes Predictions:", nb_predictions)
print("Logistic Regression Predictions:", lr_predictions)
print("Support Vector Machine Predictions:", svm_predictions)


Naive Bayes Predictions: ["Sorry, I'll call later"]
Logistic Regression Predictions: ["Sorry, I'll call later"]
Support Vector Machine Predictions: ['Your opinion about me? 1. Over 2. Jada 3. Kusruthi 4. Lovable 5. Silent 6. Spl character 7. Not matured 8. Stylish 9. Simple Pls reply..']
