In [288]:
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
import pandas as pd 

In [258]:
# Load only the label column (v1) and the message-text column (v2).
df = pd.read_csv(
    'spam.csv',
    usecols=['v1', 'v2'],
)

In [260]:
# Rename by explicit mapping rather than by position, so the two names cannot
# end up swapped if the file's column order ever differs from v1, v2.
# NOTE(review): 'lable' is a misspelling of 'label', kept because later cells
# look the column up under this exact name.
df = df.rename(columns={'v1': 'lable', 'v2': 'message'})

In [261]:
# Preview the first rows to check the renamed label/message columns.
df.head()

Unnamed: 0,lable,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data cleaning and preprocessing 

In [265]:

# Normalise every message into a space-joined string of stems:
# tokenize -> keep alphabetic, non-stop-word tokens -> lowercase ->
# lemmatize -> stem.
# Assumes the NLTK data used here (tokenizer models, stopwords, wordnet) is
# already installed in the environment.
lemmatizer = WordNetLemmatizer()
stemmer = SnowballStemmer('english')

# Build the stop-word lookup once. Calling stopwords.words() inside the
# comprehension produced a fresh list for every token and scanned it linearly;
# a set gives constant-time membership tests.
stop_words = set(stopwords.words('english'))

corpus = []
# Iterate over the values directly: df['message'][i] is a label lookup and
# would fail if the index were not exactly 0..n-1.
for message in df['message']:
    cleaned = [
        stemmer.stem(lemmatizer.lemmatize(token.lower()))
        for token in word_tokenize(message)
        if token.isalpha() and token.lower() not in stop_words
    ]
    corpus.append(" ".join(cleaned))
    
                  

## Vectorization of data

In [270]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words term counts over the cleaned corpus, with the vocabulary capped
# at the 5000 most frequent terms.
vectorizer = CountVectorizer(max_features=5000)
vectors = vectorizer.fit_transform(corpus)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old name only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense document-term matrix with one column per vocabulary term.
X = pd.DataFrame(vectors.toarray(), columns=feature_names)

X.head(10)

Unnamed: 0,aa,aah,aaniy,aaooooright,abbey,abdomen,abeg,abel,aberdeen,abi,...,zaher,zealand,zebra,zed,zero,zhong,zindgi,zoe,zogtorius,zoom
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [273]:
# Binary target: 1 for 'spam', 0 for everything else ('ham').
# Comparing against the label directly avoids relying on the position of the
# 'spam' column in the get_dummies output, and keeps integer 0/1 labels on
# pandas >= 2.0, where get_dummies returns booleans by default.
y = (df['lable'] == 'spam').astype(int).values

In [276]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [278]:
# Fit a multinomial Naive Bayes classifier on the training term counts.
spam_detection = MultinomialNB().fit(X=X_train, y=y_train)

In [279]:
# Predicted labels for the held-out messages.
prediction = spam_detection.predict(X=X_test)

In [283]:
# Per-class precision / recall / F1 on the held-out set.
report_text = classification_report(y_true=y_test, y_pred=prediction)
print(report_text)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       965
           1       0.85      0.90      0.87       150

    accuracy                           0.97      1115
   macro avg       0.92      0.94      0.93      1115
weighted avg       0.97      0.97      0.97      1115



In [286]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_test, y_pred=prediction)

array([[941,  24],
       [ 15, 135]], dtype=int64)

In [289]:
# Overall fraction of held-out messages classified correctly.
accuracy_score(y_true=y_test, y_pred=prediction)

0.9650224215246637