In [1]:
import pandas as pd

# Load the dataset
data = pd.read_csv('/kaggle/input/sms-spam-collection-dataset/spam.csv', encoding='latin-1')
data = data[['v1', 'v2']]  # Selecting only the relevant columns

In [2]:
data #printing

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Convert 'ham' to 0 and 'spam' to 1 directly in the 'v1' column
data['v1'] = data['v1'].apply(lambda x: 1 if x == 'spam' else 0)

# removing duplicates
data = data.drop_duplicates()

In [4]:
data

Unnamed: 0,v1,v2
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


To prevent the "SettingWithCopyWarning" that can occur when making changes to a slice of a DataFrame.

In [5]:
import pandas as pd
pd.options.mode.chained_assignment = None  # Disable the warning

In [6]:
import re

def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z]', ' ', text)
    return cleaned_text

data['v2'] = data['v2'].apply(lambda x: clean_text(x))


In [7]:
data['v2'] = data['v2'].str.lower()

In [8]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
from nltk.tokenize import word_tokenize

data['v2'] = data['v2'].apply(word_tokenize)


In [10]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

def stem_words(words):
    return [stemmer.stem(word) for word in words]

data['v2'] = data['v2'].apply(stem_words)


In [11]:
data

Unnamed: 0,v1,v2
0,0,"[go, until, jurong, point, crazi, avail, onli,..."
1,0,"[ok, lar, joke, wif, u, oni]"
2,1,"[free, entri, in, a, wkli, comp, to, win, fa, ..."
3,0,"[u, dun, say, so, earli, hor, u, c, alreadi, t..."
4,0,"[nah, i, don, t, think, he, goe, to, usf, he, ..."
...,...,...
5567,1,"[thi, is, the, nd, time, we, have, tri, contac..."
5568,0,"[will, b, go, to, esplanad, fr, home]"
5569,0,"[piti, wa, in, mood, for, that, so, ani, other..."
5570,0,"[the, guy, did, some, bitch, but, i, act, like..."


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Convert tokenized words back to text
data['v2'] = data['v2'].apply(lambda x: ' '.join(x))

# Initialize the Count Vectorizer
count_vectorizer = CountVectorizer(max_features=5000)  # You can adjust max_features as needed

# Apply the vectorizer to the 'v2' column
features = count_vectorizer.fit_transform(data['v2'])

# Convert the result to a dense array (if needed)
features = features.toarray()


In [13]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, data['v1'], test_size=0.2, random_state=42)


In [14]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()


In [15]:
clf.fit(X_train, y_train)


In [16]:
from sklearn.metrics import classification_report

y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.99      0.98      0.99       889
           1       0.90      0.94      0.92       145

    accuracy                           0.98      1034
   macro avg       0.94      0.96      0.95      1034
weighted avg       0.98      0.98      0.98      1034



In [17]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Assuming 'features' and 'data['v1']' are your features and labels
X = features
y = data['v1']

# Initialize a Naive Bayes classifier
clf = MultinomialNB()

# Perform 5-fold cross-validation (you can adjust 'cv' as needed)
cv_scores = cross_val_score(clf, X, y, cv=5)

# Print the cross-validation scores
print(f'Cross-Validation Scores: {cv_scores}')
print(f'Mean CV Score: {cv_scores.mean()}')


Cross-Validation Scores: [0.97969052 0.97582205 0.97582205 0.9787234  0.9767667 ]
Mean CV Score: 0.9773649452028887
