In [None]:
import csv
import re

import matplotlib.pyplot as plt
import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.pipeline import make_pipeline
from wordcloud import WordCloud
# NLTK data needed later in the notebook: tokenizer models for
# nltk.word_tokenize and the English stop-word list.
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of the
# pickled 'punkt' package, so both are requested; on an older release the
# unknown package is expected to be reported and skipped.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
#Import Dataset
# The file is tab-separated with no header row, so column names are supplied.
# Quoting is disabled because messages contain literal double-quote
# characters; under the default quoting rules pandas treats them as field
# quotes and can merge neighbouring messages into a single row.
df = pd.read_csv('SMSSpamCollection', sep='\t', names=['label', 'text'], quoting=csv.QUOTE_NONE)
df.head()

In [None]:
#Plot for Ham and Spam Count
# One bar per label. The height is the number of *distinct* message texts for
# that label, so duplicated messages are counted once.
plot_data = df.groupby('label')['text'].nunique()
# Colours are applied to the bars in the order the grouped labels appear.
colors = ['#FF0000', '#90EE90']
ax = plot_data.plot(kind='bar', color=colors, figsize=(4, 3))
ax.set_title('Count of Ham and Spam')
ax.set_xlabel('Label')
ax.set_ylabel('Number of Unique Texts')
plt.show()


In [None]:
#Spam Words WordCloud
# Join every message labelled 'spam' into one string for the word cloud.
spam_text = " ".join(df.loc[df['label'] == 'spam', 'text'])
# random_state fixes the otherwise random word placement, so a fresh
# Restart & Run All reproduces the same figure (and the same saved file).
wordcloud = WordCloud(
    width=800,
    height=800,
    background_color='white',
    min_font_size=10,
    random_state=42,
).generate(spam_text)
fig, ax = plt.subplots(figsize=(5, 5), facecolor=None)
ax.imshow(wordcloud)
ax.set_title('WordCloud of Top Spam Words')
ax.axis('off')
fig.tight_layout(pad=0)
# The figure is also written to disk next to the notebook.
fig.savefig('2.jpg')
plt.show()

In [None]:
#Stop Words
# English stop-word list from the NLTK corpus, held as a set so the
# membership test in the preprocessing step is a hash lookup.
stop_words = {word for word in stopwords.words('english')}
print(stop_words)

In [None]:
#Preprocessing Text from Dataset
# Matches 'http...' links and 'www.' hosts regardless of case. The word
# boundary and literal dot on the 'www' branch keep ordinary words that merely
# contain 'www' (e.g. 'awwww') from being truncated.
_URL_PATTERN = re.compile(r'http\S+|\bwww\.\S+', flags=re.IGNORECASE)
# Created once and reused for every message instead of once per call.
_STEMMER = PorterStemmer()


def preprocess_text(text):
    """Normalise one message for vectorisation.

    Steps, in order: strip URLs, tokenize with ``nltk.word_tokenize``,
    lowercase each token, drop tokens found in the module-level
    ``stop_words`` set, and Porter-stem what remains.

    Args:
        text: The raw message string.

    Returns:
        The processed tokens joined by single spaces.
    """
    # URLs are removed case-insensitively because lowercasing only happens
    # after tokenization.
    text = _URL_PATTERN.sub('', text)
    tokens = (token.lower() for token in nltk.word_tokenize(text))
    stems = [_STEMMER.stem(token) for token in tokens if token not in stop_words]
    return ' '.join(stems)

In [None]:
#Applying Preprocessing to Data
# Keep the untouched messages in 'raw_text' and always preprocess from that
# column. Re-running this cell then yields the same 'text' column instead of
# preprocessing text that has already been stemmed.
if 'raw_text' not in df.columns:
    df['raw_text'] = df['text']
df['text'] = df['raw_text'].apply(preprocess_text)
df.head()

In [None]:
#Function for using CountVectorizer with Multinomial Naive Bayes
def count_vectorizer_with_MNB():
    """Train and evaluate MultinomialNB on bag-of-words counts.

    Reads the module-level ``df`` ('text' and 'label' columns), holds out 20%
    of the rows for testing, and prints accuracy, precision, recall, F1
    (with 'spam' as the positive class), the confusion matrix and 10-fold
    cross-validation scores. Returns nothing.

    The vectorizer and classifier are combined in a pipeline so the
    vocabulary is learned from training rows only, both for the hold-out
    split and inside every cross-validation fold. Fitting the vectorizer on
    the full corpus first would let test messages influence the features.
    """
    # Split the raw text, not a pre-computed matrix, so vectorisation happens
    # after the split.
    text_train, text_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42
    )
    model = make_pipeline(CountVectorizer(), MultinomialNB())
    model.fit(text_train, y_train)
    y_pred = model.predict(text_test)
    print('MultinomialNB using CountVectorizer:')
    print('Accuracy Score:', accuracy_score(y_test, y_pred))
    print('Precision Score:', precision_score(y_test, y_pred, pos_label='spam'))
    print('Recall Score:', recall_score(y_test, y_pred, pos_label='spam'))
    print('F1 Score:', f1_score(y_test, y_pred, pos_label='spam'))
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    # cross_val_score refits a fresh copy of the whole pipeline on each fold.
    print('Cross-validation Scores:', cross_val_score(model, df['text'], df['label'], cv=10))

In [None]:
#Function for using TfidfVectorizer with Multinomial Naive Bayes
def tfidf_vectorizer_with_MNB():
    """Train and evaluate MultinomialNB on TF-IDF features.

    Reads the module-level ``df`` ('text' and 'label' columns), holds out 20%
    of the rows for testing, and prints accuracy, precision, recall, F1
    (with 'spam' as the positive class), the confusion matrix and 10-fold
    cross-validation scores. Returns nothing.

    The vectorizer and classifier are combined in a pipeline so the
    vocabulary and IDF weights are learned from training rows only, both for
    the hold-out split and inside every cross-validation fold. Fitting the
    vectorizer on the full corpus first would let test messages influence
    the features.
    """
    # Split the raw text, not a pre-computed matrix, so vectorisation happens
    # after the split.
    text_train, text_test, y_train, y_test = train_test_split(
        df['text'], df['label'], test_size=0.2, random_state=42
    )
    model = make_pipeline(TfidfVectorizer(), MultinomialNB())
    model.fit(text_train, y_train)
    y_pred = model.predict(text_test)
    print('MultinomialNB using TfidfVectorizer:')
    print('Accuracy Score:', accuracy_score(y_test, y_pred))
    print('Precision Score:', precision_score(y_test, y_pred, pos_label='spam'))
    print('Recall Score:', recall_score(y_test, y_pred, pos_label='spam'))
    print('F1 Score:', f1_score(y_test, y_pred, pos_label='spam'))
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    # cross_val_score refits a fresh copy of the whole pipeline on each fold.
    print('Cross-validation Scores:', cross_val_score(model, df['text'], df['label'], cv=10))

In [None]:
#Result Using CountVectorizer with Multinomial Naive Bayes
# Prints the hold-out metrics, confusion matrix and cross-validation scores.
count_vectorizer_with_MNB()

In [None]:
#Result Using TfidfVectorizer with Multinomial Naive Bayes
# Prints the hold-out metrics, confusion matrix and cross-validation scores.
tfidf_vectorizer_with_MNB()