# EMAIL Spam detection
## importing modules

In [18]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
import re
from nltk.stem.porter import PorterStemmer

## loading dataset

In [19]:
data=pd.read_csv('spam.csv')
data.head()

Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [20]:
data.shape

(5572, 2)

In [21]:
data.groupby('type').describe()

Unnamed: 0_level_0,email,email,email,email
Unnamed: 0_level_1,count,unique,top,freq
type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [22]:
corpus=[]
for i in range(data.shape[0]):
    msg = re.sub('[^a-zA-Z]', ' ', data['email'][i])
    msg = msg.lower()
    msg = msg.split()
    ps = PorterStemmer()
    msg = [ps.stem(word) for word in msg if not word in set(stopwords.words('english'))]
    msg = ' '.join(msg)
    corpus.append(msg)
data['email']=corpus
data.head()

Unnamed: 0,type,email
0,ham,go jurong point crazi avail bugi n great world...
1,ham,ok lar joke wif u oni
2,spam,free entri wkli comp win fa cup final tkt st m...
3,ham,u dun say earli hor u c alreadi say
4,ham,nah think goe usf live around though


## encoding the type of emails

In [23]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
data['type']=le.fit_transform(data['type'])
data.head()

Unnamed: 0,type,email
0,0,go jurong point crazi avail bugi n great world...
1,0,ok lar joke wif u oni
2,1,free entri wkli comp win fa cup final tkt st m...
3,0,u dun say earli hor u c alreadi say
4,0,nah think goe usf live around though


## converting text to binary data

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer()
mail=cv.fit_transform(data['email'])

## countvectorizer

In [25]:
a=["hello all how are you",
   "good morning",
   "good evening"]
b=cv.fit(a)
print(b.get_feature_names())
print(b.vocabulary_)
c=cv.transform(a)
print(c.shape)
print(c)
print(c.toarray())
df=pd.DataFrame(c.toarray(),columns=b.get_feature_names())
df

['all', 'are', 'evening', 'good', 'hello', 'how', 'morning', 'you']
{'hello': 4, 'all': 0, 'how': 5, 'are': 1, 'you': 7, 'good': 3, 'morning': 6, 'evening': 2}
(3, 8)
  (0, 0)	1
  (0, 1)	1
  (0, 4)	1
  (0, 5)	1
  (0, 7)	1
  (1, 3)	1
  (1, 6)	1
  (2, 2)	1
  (2, 3)	1
[[1 1 0 0 1 1 0 1]
 [0 0 0 1 0 0 1 0]
 [0 0 1 1 0 0 0 0]]


Unnamed: 0,all,are,evening,good,hello,how,morning,you
0,1,1,0,0,1,1,0,1
1,0,0,0,1,0,0,1,0
2,0,0,1,1,0,0,0,0


## spliting data into train and test data

In [26]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest=train_test_split(mail,data['type'],test_size=0.3)

## Applying to model

In [27]:
from sklearn.neighbors import KNeighborsClassifier
classifier=KNeighborsClassifier()
classifier.fit(xtrain,ytrain)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## predicting test output

In [28]:
ypred=classifier.predict(xtest)

In [29]:
from sklearn.metrics import confusion_matrix,classification_report
cm=confusion_matrix(ytest,ypred)
print(cm)

[[1489    0]
 [ 110   73]]


In [30]:
cr=classification_report(ytest,ypred)
print(cr)

              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1489
           1       1.00      0.40      0.57       183

    accuracy                           0.93      1672
   macro avg       0.97      0.70      0.77      1672
weighted avg       0.94      0.93      0.92      1672



In [31]:
from sklearn.linear_model import LogisticRegression
r=LogisticRegression()
r.fit(xtrain,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

## predicting output

In [32]:
ypre=r.predict(xtest)

## classifier accuracy

In [33]:
cm=confusion_matrix(ytest,ypre)
cr=classification_report(ytest,ypre)
print(cm)
print()
print(cr)

[[1486    3]
 [  27  156]]

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1489
           1       0.98      0.85      0.91       183

    accuracy                           0.98      1672
   macro avg       0.98      0.93      0.95      1672
weighted avg       0.98      0.98      0.98      1672

