# Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer     # It is used to convert textdata into numerical data
from sklearn.metrics import accuracy_score

# Reading Dataset and data analysis

In [2]:
# Load the labelled messages from spam.csv (resolved relative to the working directory).
data = pd.read_csv('spam.csv')

In [3]:
# Preview the raw frame; pandas abbreviates the display to the first and last rows.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# List the column labels of the loaded frame.
data.columns

Index(['Category', 'Message'], dtype='object')

In [5]:
# (rows, columns) of the frame before any cleaning.
data.shape

(5572, 2)

In [6]:
# Count the missing values in each column.
data.isna().sum()

Category    0
Message     0
dtype: int64

In [7]:
# Number of rows that repeat an earlier row in every column.
data.duplicated().sum()

415

In [8]:
# Remove the repeated rows, keeping the first occurrence of each.
# This mutates `data` in place, and the surviving rows keep their original index labels.
data.drop_duplicates(inplace=True)

In [9]:
# Row/column count after the duplicates were removed.
data.shape

(5157, 2)

In [10]:
# Preview after deduplication; the index labels are no longer contiguous.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [11]:
# Renumber the rows from 0 in place. Because drop=True is not passed, the previous
# labels are kept as a new 'index' column.
data.reset_index(inplace=True)

In [12]:
# Preview: the previous row labels now appear in the 'index' column.
data

Unnamed: 0,index,Category,Message
0,0,ham,"Go until jurong point, crazy.. Available only ..."
1,1,ham,Ok lar... Joking wif u oni...
2,2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,3,ham,U dun say so early hor... U c already then say...
4,4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...,...
5152,5567,spam,This is the 2nd time we have tried 2 contact u...
5153,5568,ham,Will ü b going to esplanade fr home?
5154,5569,ham,"Pity, * was in mood for that. So...any other s..."
5155,5570,ham,The guy did some bitching but I acted like i'd...


In [13]:
# Discard the 'index' column that holds the pre-reset row labels; only
# Category and Message are needed from here on.
data.drop(columns='index', inplace=True)

In [14]:
# Preview the cleaned frame: contiguous row labels, two columns.
data

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5152,spam,This is the 2nd time we have tried 2 contact u...
5153,ham,Will ü b going to esplanade fr home?
5154,ham,"Pity, * was in mood for that. So...any other s..."
5155,ham,The guy did some bitching but I acted like i'd...


In [15]:
# Confirm the cleaned frame's (rows, columns).
data.shape

(5157, 2)

In [16]:
# Distinct labels present in the target column.
data['Category'].unique()

array(['ham', 'spam'], dtype=object)

In [17]:
# Relabel 'ham' as 'not spam' ('spam' stays as it is).
# The result is assigned back to the column instead of calling an inplace method on
# `data.Category`: that chained form acts on an intermediate Series, raises a
# FutureWarning in pandas 2.2, and leaves `data` unchanged under Copy-on-Write.
data['Category'] = data['Category'].replace({'ham': 'not spam', 'spam': 'spam'})

In [18]:
# Check the relabelled Category values on the first rows.
data.head()

Unnamed: 0,Category,Message
0,not spam,"Go until jurong point, crazy.. Available only ..."
1,not spam,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,not spam,U dun say so early hor... U c already then say...
4,not spam,"Nah I don't think he goes to usf, he lives aro..."


In [19]:
# Features are the raw message text; the target is the spam / not spam label.
input_data, output_data = data['Message'], data['Category']

In [20]:
# First few message texts used as model input.
input_data.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: Message, dtype: object

In [21]:
# First few target labels, aligned row-for-row with the messages.
output_data.head()

0    not spam
1    not spam
2        spam
3    not spam
4    not spam
Name: Category, dtype: object

# Model Creation

In [22]:
# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    input_data,
    output_data,
    test_size=0.2,
    random_state=42,
)
# Show the size of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((4125,), (1032,), (4125,), (1032,))

In [23]:
# Turn the message text into token counts, ignoring English stop words.
# The vocabulary is learned from the training messages only and then reused for the
# held-out messages.
# NOTE: x_train / x_test are rebound from text to count matrices here, so this cell
# must be run exactly once after the split cell.
cv = CountVectorizer(stop_words='english')
x_train, x_test = cv.fit_transform(x_train), cv.transform(x_test)

In [24]:
# Fit a multinomial Naive Bayes classifier on the training count matrix.
model = MultinomialNB()
model.fit(X=x_train, y=y_train)

MultinomialNB()

In [25]:
# Accuracy of the fitted model on the held-out messages.
model.score(X=x_test, y=y_test)

0.9854651162790697

In [26]:
# Predicted label for every held-out message.
y_pred = model.predict(X=x_test)
y_pred

array(['not spam', 'spam', 'not spam', ..., 'not spam', 'not spam',
       'not spam'], dtype='<U8')

In [27]:
# Same held-out accuracy, computed from the explicit predictions.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.9854651162790697

# Prediction System

In [28]:
# Classify one hand-written message with the fitted vectorizer and model.
text_data = "Your bank statements is available march onwards log in your account and check it"
# Keep the raw text in `text_data` and store the features under their own name rather
# than rebinding the string to an array. The sparse output of transform() is passed to
# predict() directly, as in the evaluation cells, so no dense conversion is needed.
text_features = cv.transform([text_data])
model.predict(text_features)[0]


'not spam'

# Saving Model

In [29]:
import pickle as pk

In [30]:
# Persist the fitted classifier and the fitted vectorizer; both are needed to score
# new text, because the model expects the vectorizer's feature columns.
# `with` closes each file deterministically so the pickles are fully flushed to disk.
with open('spam.pkl', 'wb') as model_file:
    pk.dump(model, model_file)
with open('cvspam.pkl', 'wb') as vectorizer_file:
    pk.dump(cv, vectorizer_file)