## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np

import re
import nltk
import seaborn as sns
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

from nltk.tokenize import word_tokenize,sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix,classification_report

# Shared text-processing helpers used by the cells below.
ps = PorterStemmer()        # stemmer applied to every token
ls = WordNetLemmatizer()    # lemmatizer instance (not used by the visible cells)
cv = CountVectorizer()      # bag-of-words vectorizer, fitted later on the corpus

## 2. Read Data

In [None]:
# Load the raw dataset. The file is decoded as latin-1
# (presumably because it is not valid UTF-8 -- confirm against the file).
data = pd.read_csv(
    'spam.csv',
    encoding='latin-1',
)
data.head()

In [None]:
# Remove unnecessary columns (unnamed:2, unnamed:3 and unnamed:4)
# by keeping only the first two columns of the frame.
data=data.iloc[:,:2]
data.head()

In [None]:
# Check for null value in the columns
data.isnull().sum()

### Rename the columns to `tag` and `message`

In [None]:
# Rename the raw 'v1'/'v2' columns to descriptive names, modifying `data` in place.
column_names = {'v1': 'tag', 'v2': 'message'}
data.rename(columns=column_names, inplace=True)

In [None]:
data.head()

In [None]:
data['tag'].value_counts()

In [None]:
data['tag'].value_counts().plot(kind='bar')

### Preprocess the message column

In [None]:
# Preprocessed data: every character that is not a letter or digit is replaced
# by a space, then runs of whitespace are collapsed to single spaces.
non_alnum = re.compile('[^a-zA-Z0-9]')
message_new = [
    ' '.join(non_alnum.sub(' ', raw).split())
    for raw in data['message']
]
    

In [None]:
message_new

In [None]:
data['messagenew']=message_new

In [None]:
data.head()

In [None]:
# Keep only the label column and the cleaned message column.
data=data[['tag','messagenew']]
data.head()

## Now the data is cleaned 

## Spam detection

In [None]:
data.head()

In [None]:
data['tag'].value_counts()

In [None]:
# Encode the labels as strings: 'ham' -> '0', 'spam' -> '1'.
tag_codes = {'ham': '0', 'spam': '1'}
data['tag'] = data['tag'].map(tag_codes)

In [None]:
data['tag'].value_counts()

## Insights:
- The tags are now encoded: spam messages are labelled `1` and ham messages are labelled `0`.

In [None]:
# Normalise every cleaned message: lower-case it, tokenise it, drop English
# stopwords, Porter-stem the remaining tokens and re-join them into one string.

# Build the stopword set once; rebuilding it for every token is very slow.
english_stopwords = set(stopwords.words('english'))

MessagePreProcessed = []
for text in data['messagenew']:
    text = text.lower()
    # Use a distinct local name so the imported `word_tokenize` function
    # is not overwritten by a list of tokens.
    tokens = nltk.word_tokenize(text)
    stopwordremoval = [token for token in tokens if token not in english_stopwords]

    # stemming
    stemming = [ps.stem(token) for token in stopwordremoval]

    MessagePreProcessed.append(' '.join(stemming))
    # Echo the lower-cased message, as the original cell did.
    print(text)

In [None]:
data.head()

In [None]:
data['MessagePreProcessed']=MessagePreProcessed

In [None]:
data.head()

In [None]:
# Fit the vectorizer on the preprocessed messages, then convert the resulting
# term-count matrix to a dense array for the steps below.
term_counts = cv.fit_transform(data['MessagePreProcessed'])
x = term_counts.toarray()
x

In [None]:
y=data['tag']
y

## Train-Test-Split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Print the shape of each split: training features/labels, then test features/labels.
for split in (x_train, y_train, x_test, y_test):
    print(split.shape)

In [None]:
# Train a multinomial Naive Bayes model on the training split.
# fit() returns the estimator itself, so it can be bound directly;
# the bare name on the last line keeps the cell's displayed output.
classifier = MultinomialNB().fit(x_train, y_train)
classifier

In [None]:
y_pred=classifier.predict(x_test)
y_pred

In [None]:
print(accuracy_score(y_test, y_pred))

In [None]:
print(confusion_matrix(y_test,y_pred))

In [None]:
print(classification_report(y_test, y_pred))

## Insights:
- The model reaches about 98 percent accuracy on the test set.

## Predict the newly given message

In [None]:
# Reusable method for predicting the newly added message
def predict_spam(message):
    """Classify a single raw text message as spam or ham.

    The message goes through the same preprocessing used to build the
    training corpus: non-alphanumeric characters are replaced by spaces, the
    text is lower-cased, tokenised with ``nltk.word_tokenize``, stripped of
    English stopwords and Porter-stemmed. The result is vectorised with the
    already-fitted ``cv`` and passed to the trained ``classifier``.

    Parameters
    ----------
    message : str
        Raw message text.

    Returns
    -------
    str
        ``"Spam"`` when the classifier predicts the label ``'1'``,
        otherwise ``"Not spam/Ham"``.
    """
    # re.sub(pattern, replacement, string): replace everything except letters and digits
    cleaned = re.sub('[^a-zA-Z0-9]', ' ', string=message).lower()

    # Tokenise exactly as the training loop does. A plain str.split() keeps
    # words whole that nltk.word_tokenize splits, which would produce
    # features the vectorizer never saw during fitting.
    tokens = nltk.word_tokenize(cleaned)

    # Build the stopword set once per call rather than once per token.
    english_stopwords = set(stopwords.words('english'))
    stemmed = [ps.stem(token) for token in tokens if token not in english_stopwords]

    features = cv.transform([' '.join(stemmed)]).toarray()

    # predict() returns one label per input row; exactly one row was passed,
    # so compare that single label instead of the whole array.
    predicted = classifier.predict(features)
    if predicted[0] == '1':
        return "Spam"
    return "Not spam/Ham"

### Prediction

In [None]:
# Store a message in the newmessage variable and pass it to the function above, which predicts whether it is ham or spam
newmessage='be entitled up to £3,160 in compensation from mis-sold PPI on a credit card or loan. Please reply PPI for info or STOP to opt out'
newmessage

In [None]:
predict_spam(newmessage)

### The above message was correctly classified as `spam`, and it does look suspicious

In [None]:
newmessage2="we will meet this sunday for dinner at taj hotel"
newmessage2

In [None]:
predict_spam(newmessage2)

### The above message was correctly classified as `Not spam/Ham`, and the sentence looks casual.