In [26]:
import pandas as pd
import re 
import nltk 
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/erdaulet/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stop_words] 
    return ' '.join(tokens)

In [29]:
df = pd.read_csv('spam mail.csv')

In [30]:
df.columns

Index(['Category', 'Masseges'], dtype='object')

In [31]:
df['cleaned_messages'] = df['Masseges'].apply(preprocess)

In [32]:
df

Unnamed: 0,Category,Masseges,cleaned_messages
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goes usf lives around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tried 2 contact u u å750 pound prize ...
5568,ham,Will Ì_ b going to esplanade fr home?,ì_ b going esplanade fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",pity mood soany suggestions
5570,ham,The guy did some bitching but I acted like i'd...,guy bitching acted like id interested buying s...


In [33]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df['cleaned_messages'])
y = df['Category']

In [34]:
X = X.toarray()
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=.2)

In [35]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train,y_train)
y_pred = model.predict(X_test)

In [36]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99       972
        spam       1.00      0.80      0.89       143

    accuracy                           0.97      1115
   macro avg       0.99      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115



In [50]:
my_text = ["click the link and get free 100 dollar bonus"]
my_vector = vectorizer.transform(my_text)
prediction = model.predict(my_vector)
print(f"result:{prediction[0]}")

result:spam
