In [9]:
import pandas as pd

# Load the raw SMS corpus. header=None tells pandas the file has no header
# row, so the columns get the integer labels 0 and 1.
docs = pd.read_csv(
    'smsspamcollection.csv',
    header=None,
)
docs.head()

Unnamed: 0,0,1
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# Name the two columns: the ham/spam tag and the message text.
docs = docs.set_axis(['Class', 'sms'], axis=1)

In [11]:
# Number of messages (rows) in the corpus.
docs.shape[0]

5572

In [12]:
# Count how many messages carry each class tag.
ham_spam = docs['Class'].value_counts()
ham_spam

ham     4825
spam     747
Name: Class, dtype: int64

In [14]:
# Percentage of all messages that are tagged spam.
# Look the count up by its label rather than by position: positional `[]`
# access on a string-indexed Series is deprecated in recent pandas, and it
# would pick the wrong class if 'spam' were ever the more frequent label.
spam_rate = ham_spam['spam'] / ham_spam.sum() * 100
print('Spam rate is about {0}%'.format(round(spam_rate, 2)))

Spam rate is about 13.41%


In [16]:
# Encode the class tag numerically: ham -> 0, spam -> 1.
class_to_label = {'ham': 0, 'spam': 1}
docs['label'] = docs['Class'].map(class_to_label)
docs.head()

Unnamed: 0,Class,sms,label
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [17]:
# The numeric 'label' column now carries the target, so drop the text tag.
docs = docs.drop(columns=['Class'])
docs.head()

Unnamed: 0,sms,label
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0


In [18]:
# Separate the message text (features) from the numeric label (target),
# then print each shape on its own line.
X, y = docs['sms'], docs['label']
print(X.shape, y.shape, sep='\n')

(5572,)
(5572,)


In [21]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the messages for evaluation; the fixed random_state keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [22]:
# Preview the first few training messages.
X_train.head()

4393                       what are your new years plans?
216     Finally the match heading towards draw as your...
4471    Lemme know when I can swing by and pick up, I'...
3889                   ok....take care.umma to you too...
5030    I'd like to tell you my deepest darkest fantas...
Name: sms, dtype: object

In [23]:
# Preview the first few training labels.
y_train.head()

4393    0
216     0
4471    0
3889    0
5030    1
Name: label, dtype: int64

In [24]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configured to drop scikit-learn's built-in English stop words.
vect = CountVectorizer(stop_words='english')

In [26]:
# Learn the vocabulary from the training messages only, then display the
# token -> column-index mapping (fit() returns the vectorizer itself).
vect.fit(X_train).vocabulary_

{'new': 4272,
 'years': 6856,
 'plans': 4682,
 'finally': 2550,
 'match': 3930,
 'heading': 3018,
 'draw': 2202,
 'prediction': 4801,
 'lemme': 3650,
 'know': 3548,
 'swing': 5957,
 'pick': 4644,
 'free': 2663,
 'basically': 1119,
 'time': 6163,
 'semester': 5369,
 'ok': 4406,
 'care': 1485,
 'umma': 6354,
 'like': 3685,
 'tell': 6043,
 'deepest': 2000,
 'darkest': 1949,
 'fantasies': 2478,
 '09094646631': 222,
 'just': 3472,
 '60p': 538,
 'min': 4022,
 'stop': 5804,
 'texts': 6083,
 '08712460324': 103,
 'nat': 4218,
 'rate': 4985,
 'lmao': 3732,
 'fish': 2566,
 'memory': 3984,
 'need': 4243,
 'yahoo': 6842,
 'boys': 1324,
 'bring': 1354,
 'perf': 4592,
 'legal': 3644,
 'hiya': 3083,
 'comin': 1722,
 'bristol': 1358,
 'st': 5740,
 'week': 6638,
 'april': 940,
 'les': 3656,
 'got': 2862,
 'rudi': 5234,
 'yrs': 6883,
 'eve': 2383,
 'snoring': 5603,
 'drunk': 2226,
 'bak': 1092,
 'college': 1710,
 'work': 6770,
 'sends': 5374,
 'ink': 3289,
 'bath': 1123,
 'wamma': 6575,
 'laid': 3581,
 '

In [27]:
# Number of distinct tokens learned from the training set.
len(vect.vocabulary_)

6904

In [29]:
# Encode both splits with the vocabulary fitted on the training data
# (train first, then test).
X_train_transformed, X_test_transformed = map(vect.transform, (X_train, X_test))

In [34]:
# Show the container type, then the (row, column) -> count entries it holds.
print(type(X_train_transformed), X_train_transformed, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 4272)	1
  (0, 4682)	1
  (0, 6856)	1
  (1, 2202)	1
  (1, 2550)	1
  (1, 3018)	1
  (1, 3930)	1
  (1, 4801)	1
  (2, 1119)	1
  (2, 2663)	1
  (2, 3548)	1
  (2, 3650)	1
  (2, 4644)	1
  (2, 5369)	1
  (2, 5957)	1
  (2, 6163)	1
  (3, 1485)	1
  (3, 4406)	1
  (3, 6354)	1
  (4, 103)	1
  (4, 222)	1
  (4, 538)	1
  (4, 1949)	1
  (4, 2000)	1
  (4, 2478)	1
  :	:
  (3897, 3721)	1
  (3897, 4248)	1
  (3897, 5026)	1
  (3897, 5949)	1
  (3897, 6840)	1
  (3898, 243)	1
  (3898, 295)	1
  (3898, 344)	1
  (3898, 799)	1
  (3898, 1959)	1
  (3898, 2590)	1
  (3898, 3435)	1
  (3898, 3472)	1
  (3898, 3966)	1
  (3898, 4148)	1
  (3898, 4584)	1
  (3898, 5031)	1
  (3898, 5115)	1
  (3898, 5273)	1
  (3898, 5399)	1
  (3898, 6074)	1
  (3898, 6122)	1
  (3898, 6196)	1
  (3899, 1574)	1
  (3899, 5590)	1


In [35]:
# Model building: Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

# fit() returns the estimator itself, so construction and training chain together.
mnb = MultinomialNB().fit(X_train_transformed, y_train)

# Per-class probabilities and hard class predictions for the held-out messages.
y_pred_proba = mnb.predict_proba(X_test_transformed)
y_pred_class = mnb.predict(X_test_transformed)

In [37]:
# Print the fitted estimator's repr.
print(mnb)

MultinomialNB()


In [40]:
# Model evaluation: fraction of held-out messages classified correctly
from sklearn import metrics
from sklearn.metrics import classification_report
metrics.accuracy_score(y_true=y_test, y_pred=y_pred_class)

0.986244019138756

In [39]:
# Confusion matrix for the held-out set: rows are true labels, columns are predicted labels.
metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_class)

array([[1434,    8],
       [  15,  215]], dtype=int64)

In [42]:
# Per-class precision, recall and F1 on the held-out set.
print(metrics.classification_report(y_true=y_test, y_pred=y_pred_class))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1442
           1       0.96      0.93      0.95       230

    accuracy                           0.99      1672
   macro avg       0.98      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [None]:
# Model building: Bernoulli Naive Bayes
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
# Train on the vectorized training messages. The previous `x_transformed`
# name was never defined anywhere in the notebook.
bnb.fit(X_train_transformed, y_train)
# Score the vectorized test messages. The raw text Series `X_test` cannot be
# passed to the classifier directly.
bnb.predict_proba(X_test_transformed)