# Import Library

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the SMS spam collection from a tab-separated file into a DataFrame.
df = pd.read_csv('dataset/smsspamcollection.tsv', sep='\t')

In [3]:
df.head()

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [4]:
df.isnull().sum()

label      0
message    0
length     0
punct      0
dtype: int64

In [5]:
df['label'].value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# Feature is the raw message text; target is the ham/spam label.
X, y = df['message'], df['label']

In [8]:
# Hold out 30% of the messages as test data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
X_train.shape

(3900,)

In [10]:
X_test.shape

(1672,)

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
count_vect = CountVectorizer() # declare vectorizer

In [13]:
X

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
5       FreeMsg Hey there darling it's been 3 week's n...
6       Even my brother is not like to speak with me. ...
7       As per your request 'Melle Melle (Oru Minnamin...
8       WINNER!! As a valued network customer you have...
9       Had your mobile 11 months or more? U R entitle...
10      I'm gonna be home soon and i don't want to tal...
11      SIX chances to win CASH! From 100 to 20,000 po...
12      URGENT! You have won a 1 week FREE membership ...
13      I've been searching for the right words to tha...
14                    I HAVE A DATE ON SUNDAY WITH WILL!!
15      XXXMobileMovieClub: To use your credit, click ...
16                             Oh k...i'm watching here:)
17      Eh u r

In [14]:
# Learn the vocabulary from the training messages only and encode each
# message as a row of token counts.
X_train_counts = count_vect.fit_transform(X_train) # sparse document-term matrix

In [19]:
X_train_counts.shape # (training messages, vocabulary size) -> 7,263 distinct tokens

(3900, 7263)

In [20]:
from sklearn.linear_model import LogisticRegression

In [23]:
# Baseline classifier: logistic regression with default settings,
# trained on the token-count features of the training split.
lr_model = LogisticRegression()
lr_model.fit(X=X_train_counts, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [24]:
from sklearn.pipeline import Pipeline

In [25]:
# Chain the already-fitted vectorizer and classifier so that predict()
# can be called directly on raw message text.
text_clf = Pipeline(steps=[
    ('CountVectorizer', count_vect),
    ('LogisticClf', lr_model),
])

In [28]:
y_pred = text_clf.predict(X_test)

In [27]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [29]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1445    3]
 [  22  202]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1448
        spam       0.99      0.90      0.94       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.95      0.97      1672
weighted avg       0.99      0.99      0.98      1672



In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Repeat the experiment with TF-IDF features instead of raw counts;
# the vectorizer is fitted on the training split only.
tfidf_vect = TfidfVectorizer()
X_train_tfidf = tfidf_vect.fit_transform(X_train)

In [32]:
# Rebinds lr_model to a new default logistic regression, this time
# trained on the TF-IDF features.
lr_model = LogisticRegression()
lr_model.fit(X=X_train_tfidf, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [33]:
# Wrap the fitted TF-IDF vectorizer and classifier so that predict()
# accepts raw message text.
text_clf = Pipeline(steps=[
    ('TfidfVectorizer', tfidf_vect),
    ('LogisticClf', lr_model),
])

In [34]:
y_pred = text_clf.predict(X_test)

In [35]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1446    2]
 [  46  178]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.99      0.79      0.88       224

   micro avg       0.97      0.97      0.97      1672
   macro avg       0.98      0.90      0.93      1672
weighted avg       0.97      0.97      0.97      1672



In [68]:
# Function to train and test multiple models
# Iterate within dictionary

def train_test_model(model_dict, extractor, X_train=None, y_train=None,
                     X_test=None, y_test=None):
    '''
    Train and test several classifiers on features from one text extractor.

    The extractor is fitted once on the training text; every model is then
    fitted on those features, wrapped together with the extractor in a
    Pipeline, and evaluated on the test text. For each model the name,
    confusion matrix, classification report and accuracy are printed.

    Input : model_dict -> {'model name': model object, ...}
                Each model is fitted in place.
            extractor  -> {'extractor name': extractor object}
                Only the first entry is used; it is fitted in place.
            X_train, y_train, X_test, y_test -> optional train/test split.
                Any argument left as None falls back to the notebook-level
                variable of the same name, so existing two-argument calls
                keep working.
    Output : dict -> {'model name': fitted Pipeline, ...}
    Raises : ValueError if `extractor` is empty.
             NameError if a split is neither passed nor defined at
             notebook level.
    '''
    if not extractor:
        raise ValueError(
            "extractor must contain one {'extractor name': extractor object} entry"
        )

    notebook_vars = globals()

    def _resolve(var_name, value):
        # An explicit argument wins; otherwise reuse the notebook-level split.
        if value is not None:
            return value
        if var_name not in notebook_vars:
            raise NameError(
                f"{var_name} was not passed and is not defined at notebook level"
            )
        return notebook_vars[var_name]

    X_train = _resolve('X_train', X_train)
    y_train = _resolve('y_train', y_train)
    X_test = _resolve('X_test', X_test)
    y_test = _resolve('y_test', y_test)

    # Only the first extractor entry is used.
    ext_name, ext = next(iter(extractor.items()))

    # Fit the extractor once and share the features across all models.
    X_train_ext = ext.fit_transform(X_train)

    pipeline = dict()
    for name, model in model_dict.items():
        # Train the model on the pre-computed features
        model.fit(X_train_ext, y_train)

        # Assemble a pipeline from the fitted parts so it accepts raw text
        text_clf = Pipeline([(ext_name, ext), (name, model)])

        # Test
        y_pred = text_clf.predict(X_test)
        y_report = [y_test, y_pred]
        acc_score = accuracy_score(*y_report)*100
        print(name)
        print(confusion_matrix(*y_report))
        print(classification_report(*y_report))
        print(f'Accuracy: {acc_score:.2f}%')
        print('\n')
        pipeline[name] = text_clf
    return pipeline

In [69]:
# Define the candidate classifiers to compare on identical features.
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# Fresh, unfitted estimators; lr_model is rebound here and no longer refers
# to the model trained in the earlier cells.
nb_model = MultinomialNB() #naive-bayes
svc_model = SVC(gamma='auto') #SVC
lr_model = LogisticRegression(solver='lbfgs') #log-clf

# NOTE: rebinds count_vect to a new, unfitted vectorizer. X_train_counts,
# computed earlier, came from the previous instance.
count_vect = CountVectorizer()

# Display name -> estimator. train_test_model uses each name as the pipeline
# step name and as the heading of the printed report.
model_dict = {'naive-bayes':nb_model, 'SVC':svc_model, 'log-clf':lr_model}
# Single-entry dict: train_test_model reads only the first extractor.
extractor = {'CountVectorizer': count_vect}

In [70]:
# Train and Test
pipelines = train_test_model(model_dict, extractor)

naive-bayes
[[1444    4]
 [  12  212]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1448
        spam       0.98      0.95      0.96       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.97      0.98      1672
weighted avg       0.99      0.99      0.99      1672

Accuracy: 99.04%


SVC
[[1448    0]
 [ 224    0]]
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.87      0.87      0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672

Accuracy: 86.60%


log-clf
[[1445    3]
 [  22  202]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1448
        spam       0.99      0.90      0.94       224

   micro avg       0.99      0.99      

  'precision', 'predicted', average, warn_for)


In [73]:
len(pipelines['SVC'].predict(X_test))

1672

In [77]:
(pipelines['SVC'].predict(X_test)=='ham').sum()

1672

In [78]:
# TF-IDF round: a fresh extractor plus fresh, unfitted copies of the
# three classifiers.
tfidf_vect = TfidfVectorizer()
extractor = {'TfidfVectorizer': tfidf_vect}

nb_model = MultinomialNB()                     # naive Bayes
svc_model = SVC(gamma='auto')                  # support vector classifier
lr_model = LogisticRegression(solver='lbfgs')  # logistic regression

# Insertion order fixes the order in which the models are trained and reported.
model_dict = dict([
    ('naive-bayes', nb_model),
    ('SVC', svc_model),
    ('log-clf', lr_model),
])

In [79]:
# Train and Test
pipelines = train_test_model(model_dict, extractor)

naive-bayes
[[1448    0]
 [  62  162]]
              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1448
        spam       1.00      0.72      0.84       224

   micro avg       0.96      0.96      0.96      1672
   macro avg       0.98      0.86      0.91      1672
weighted avg       0.96      0.96      0.96      1672

Accuracy: 96.29%


SVC
[[1448    0]
 [ 224    0]]
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.87      0.87      0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672

Accuracy: 86.60%


log-clf
[[1446    2]
 [  46  178]]
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98      1448
        spam       0.99      0.79      0.88       224

   micro avg       0.97      0.97      

  'precision', 'predicted', average, warn_for)


In [80]:
len(pipelines['SVC'].predict(X_test))

1672

In [81]:
(pipelines['SVC'].predict(X_test)=='ham').sum()

1672

In [82]:
# Fit a stand-alone SVC on the count features; the SVC pipelines above
# predicted 'ham' for every test message.
# NOTE(review): X_train_counts was produced by the first CountVectorizer
# instance, before count_vect was rebound - confirm its columns match the
# vectorizer used with this model downstream.
svc_model = SVC(gamma='auto') #SVC
svc_model.fit(X=X_train_counts, y=y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [83]:
# NOTE(review): count_vect here is the instance re-created for the
# multi-model run and fitted inside train_test_model, not the one that
# produced X_train_counts. Both were fitted on the same X_train with default
# settings, so the feature columns should line up - but only if the cells
# above are run in order.
text_clf = Pipeline([('CountVectorizer',count_vect),
                     ('SVC',svc_model)])

In [84]:
y_pred = text_clf.predict(X_test)

In [85]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1448    0]
 [ 224    0]]
              precision    recall  f1-score   support

         ham       0.87      1.00      0.93      1448
        spam       0.00      0.00      0.00       224

   micro avg       0.87      0.87      0.87      1672
   macro avg       0.43      0.50      0.46      1672
weighted avg       0.75      0.87      0.80      1672



  'precision', 'predicted', average, warn_for)


In [86]:
from sklearn.model_selection import GridSearchCV

In [94]:
# Tune the SVC with a cross-validated grid search over the regularisation
# strength C and the kernel type, scored by accuracy.
svc_model = SVC(gamma='auto')
param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')}
clf = GridSearchCV(svc_model, param_grid, scoring='accuracy')
# The search runs on the pre-vectorized training counts. refit=True (see the
# repr below) means clf can be used to predict once the search has finished.
clf.fit(X_train_counts, y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [1, 10], 'kernel': ('linear', 'rbf')},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [98]:
clf.best_params_

{'C': 1, 'kernel': 'linear'}

In [95]:
# Use the fitted grid search as the final step so that predict() accepts
# raw message text.
text_clf = Pipeline(steps=[
    ('CountVectorizer', count_vect),
    ('SVC', clf),
])

In [96]:
y_pred = text_clf.predict(X_test)

In [97]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1448    0]
 [  19  205]]
              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1448
        spam       1.00      0.92      0.96       224

   micro avg       0.99      0.99      0.99      1672
   macro avg       0.99      0.96      0.97      1672
weighted avg       0.99      0.99      0.99      1672



In [99]:
print(accuracy_score(y_test, y_pred))

0.9886363636363636
