In [1]:
    import pandas as pd
pd.set_option('display.max_colwidth', -1)

df = pd.read_csv('data/smsspamcollection/SMSSpamCollection', sep='\t', names =['label','sms_message'])

df.head()

Unnamed: 0,label,sms_message
0,ham,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives around here though"


In [2]:
df['label'] = df.label.map({'ham':0, 'spam':1})
print(df.shape)  # returns (rows, columns)
df.head()

(5572, 2)


Unnamed: 0,label,sms_message
0,0,"Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives around here though"


### Implementing Bag of Words in scikit-learn ###

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
count_vector = CountVectorizer()

In [4]:
print(count_vector)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


In [6]:
# split into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['sms_message'], 
                                                    df['label'], 
                                                    random_state=1)

print('Number of rows in the total set: {}'.format(df.shape[0]))
print('Number of rows in the training set: {}'.format(X_train.shape[0]))
print('Number of rows in the test set: {}'.format(X_test.shape[0]))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [7]:
from sklearn.feature_extraction.text import CountVectorizer

count_vector = CountVectorizer()

training_data = count_vector.fit_transform(X_train)

testing_data = count_vector.transform(X_test)

### Naive Bayes implementation using scikit-learn


In [8]:
from sklearn.naive_bayes import MultinomialNB
naive_bayes = MultinomialNB()
naive_bayes.fit(training_data, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [9]:
predictions = naive_bayes.predict(testing_data)
predictions

array([0, 0, 0, ..., 0, 1, 0], dtype=int64)

In [10]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test ,predictions)))
print('Precision score: ', format(precision_score(y_test ,predictions)))
print('Recall score: ', format(recall_score(y_test ,predictions)))
print('F1 score: ', format(f1_score(y_test ,predictions)))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


## Ensemble Techniques

In [11]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier

In [12]:
# Instantiate a BaggingClassifier 
bag_mod = BaggingClassifier(n_estimators=200)


# Instantiate a RandomForestClassifier
rf_mod = RandomForestClassifier(n_estimators=200)

# Instantiate an a AdaBoostClassifier 
ada_mod = AdaBoostClassifier(n_estimators=300, learning_rate=0.2)

In [13]:
# Fit your BaggingClassifier to the training data
bag_mod.fit(training_data, y_train)

# Fit your RandomForestClassifier to the training data
rf_mod.fit(training_data, y_train)

# Fit your AdaBoostClassifier to the training data
ada_mod.fit(training_data, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.2, n_estimators=300, random_state=None)

In [14]:
# Predict using BaggingClassifier on the test data
bag_preds = bag_mod.predict(testing_data) 

# Predict using RandomForestClassifier on the test data
rf_preds = rf_mod.predict(testing_data)

# Predict using AdaBoostClassifier on the test data
ada_preds = ada_mod.predict(testing_data)

In [15]:
def print_metrics(y_true, preds, model_name=None):
   
    if model_name == None:
        print('Accuracy score: ', format(accuracy_score(y_true, preds)))
        print('Precision score: ', format(precision_score(y_true, preds)))
        print('Recall score: ', format(recall_score(y_true, preds)))
        print('F1 score: ', format(f1_score(y_true, preds)))
        print('\n\n')
    
    else:
        print('Accuracy score for ' + model_name + ' :' , format(accuracy_score(y_true, preds)))
        print('Precision score ' + model_name + ' :', format(precision_score(y_true, preds)))
        print('Recall score ' + model_name + ' :', format(recall_score(y_true, preds)))
        print('F1 score ' + model_name + ' :', format(f1_score(y_true, preds)))
        print('\n\n')

In [16]:
# Print Bagging scores
print_metrics(y_test, bag_preds, 'bagging')

# Print Random Forest scores
print_metrics(y_test, rf_preds, 'random forest')

# Print AdaBoost scores
print_metrics(y_test, ada_preds, 'adaboost')

# Naive Bayes Classifier scores
print_metrics(y_test, predictions, 'naive bayes')

Accuracy score for bagging : 0.9741564967695621
Precision score bagging : 0.9116022099447514
Recall score bagging : 0.8918918918918919
F1 score bagging : 0.9016393442622951



Accuracy score for random forest : 0.9820531227566404
Precision score random forest : 1.0
Recall score random forest : 0.8648648648648649
F1 score random forest : 0.927536231884058



Accuracy score for adaboost : 0.9770279971284996
Precision score adaboost : 0.9693251533742331
Recall score adaboost : 0.8540540540540541
F1 score adaboost : 0.9080459770114943



Accuracy score for naive bayes : 0.9885139985642498
Precision score naive bayes : 0.9720670391061452
Recall score naive bayes : 0.9405405405405406
F1 score naive bayes : 0.9560439560439562



