In [1]:
import pandas as pd
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [2]:
#Load Dataset

DATASET_DIR = '../Dataset/big_spam_dataset.csv'

data = pd.read_csv(DATASET_DIR, encoding='cp1252')
data.drop('Unnamed: 0', inplace=True, axis=1)
data.head()

Unnamed: 0,Message_body,Label
0,"UpgrdCentre Orange customer, you may now claim...",Spam
1,"Loan for any purpose £500 - £75,000. Homeowner...",Spam
2,Congrats! Nokia 3650 video camera phone is you...,Spam
3,URGENT! Your Mobile number has been awarded wi...,Spam
4,Someone has contacted our dating service and e...,Spam


In [3]:
# Create new features (Feature Engineering) # apply before text-cleaning

def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count / (len(text) - text.count(" ")), 3)

data['body_len'] = data['Message_body'].apply(lambda x: len(x) - x.count(' '))
data['punct%'] = data['Message_body'].apply(lambda x: count_punct(x))

In [19]:
# Text cleaning and Vectorizing

stop_words = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()

def clean_text(text, digit_opt):
    tokens = nltk.word_tokenize(text)
    tokens = [w.lower() for w in tokens]
#    print('Before --> {}'.format(tokens))
    if digit_opt == 'remove':
        tokens = [''.join([c for c in w if not c.isdigit()]) for w in tokens]
    if digit_opt == 'mask':
        tokens = ['digit' if w.isdigit() == True else w for w in tokens]
#        print('After --> {}'.format(tokens))
    re_punct = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punct.sub('', w) for w in tokens]
    tokens = " ".join([ps.stem(w) for w in tokens if len(w) > 2])
    
    return tokens

data['No_Digit'] = data['Message_body'].apply(lambda x: clean_text(x, 'remove'))
data['Masked_Digit'] = data['Message_body'].apply(lambda x: clean_text(x, 'mask'))

# TF-IDF - No Digit
tfidf_vect = TfidfVectorizer()
X_tfidf_no_digit = tfidf_vect.fit_transform(data['No_Digit'])
X_tfidf_no_digit_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf_no_digit.toarray())], axis=1)

# TF-IDF - Masked Digit
tfidf_vect = TfidfVectorizer()
X_tfidf_masked_digit = tfidf_vect.fit_transform(data['Masked_Digit'])
X_tfidf_masked_digit_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_tfidf_masked_digit.toarray())], axis=1)


# CountVectorizer - No Digit
count_vect = CountVectorizer()
X_count_no_digit = count_vect.fit_transform(data['No_Digit'])
X_count_no_digit_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count_no_digit.toarray())], axis=1)

# CountVectorizer - Masked Digit
count_vect = CountVectorizer()
X_count_masked_digit = count_vect.fit_transform(data['Masked_Digit'])
X_count_masked_digit_feat = pd.concat([data['body_len'], data['punct%'], pd.DataFrame(X_count_masked_digit.toarray())], axis=1)


In [20]:
data['No_Digit'].head()

0    upgrdcentr orang custom you may now claim your...
1    loan for ani purpos homeown tenant welcom have...
2    congrat nokia video camera phone your call cal...
3    urgent your mobil number ha been award with pr...
4    someon ha contact our date servic and enter yo...
Name: No_Digit, dtype: object

In [21]:
X_tfidf_no_digit_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,2685,2686,2687,2688,2689,2690,2691,2692,2693,2694
0,147,0.054,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.232376,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,135,0.067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,132,0.015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.111909,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,115,0.035,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.15004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,136,0.015,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.129826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
X_count_masked_digit_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,2860,2861,2862,2863,2864,2865,2866,2867,2868,2869
0,147,0.054,0,0,0,0,0,0,0,0,...,0,2,0,0,0,0,0,0,0,0
1,135,0.067,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,132,0.015,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,115,0.035,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
4,136,0.015,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [23]:
X_count_no_digit_feat.head()

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,2685,2686,2687,2688,2689,2690,2691,2692,2693,2694
0,147,0.054,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,0,0
1,135,0.067,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,132,0.015,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,115,0.035,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,136,0.015,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [24]:
# Random Forest & GridSearch

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import precision_recall_fscore_support as score

In [25]:
####### TF-IDF ######## for No_Digit

rf = RandomForestClassifier()
param = {'n_estimators' : [10, 150, 300], #Kaç tane decision-tree build edileceği set edilir
        'max_depth' : [30, 60, 90, None]} #Her bir tree ne kadar derin(boyu) olacağı belirlenir

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1) # Cv kaç tane K-fold-cross-validation da ki K rakamını set eder
gs_fit = gs.fit(X_tfidf_no_digit_feat, data['Label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
8,2.255041,0.109897,0.081916,0.014409,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.953917,0.963134,0.962963,0.962963,0.944444,0.957484,0.007412,1
5,2.295641,0.059154,0.083878,0.008988,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.963134,0.963134,0.962963,0.958333,0.939815,0.957476,0.00902,2
7,1.187141,0.022924,0.051171,0.007017,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.963134,0.963134,0.962963,0.953704,0.944444,0.957476,0.007459,2
10,1.226057,0.03826,0.05815,0.00246,,150,"{'max_depth': None, 'n_estimators': 150}",0.967742,0.958525,0.962963,0.953704,0.944444,0.957476,0.008008,2
4,1.196707,0.025561,0.051124,0.003375,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.967742,0.967742,0.958333,0.949074,0.944444,0.957467,0.009507,5


In [26]:
####### TF-IDF ######## for Masked_Digit

rf = RandomForestClassifier()
param = {'n_estimators' : [10, 150, 300], #Kaç tane decision-tree build edileceği set edilir
        'max_depth' : [30, 60, 90, None]} #Her bir tree ne kadar derin(boyu) olacağı belirlenir

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1) # Cv kaç tane K-fold-cross-validation da ki K rakamını set eder
gs_fit = gs.fit(X_tfidf_masked_digit_feat, data['Label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]



Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,1.079317,0.016301,0.057087,0.009412,,150,"{'max_depth': None, 'n_estimators': 150}",0.976959,0.958525,0.967593,0.967593,0.958333,0.9658,0.006923,1
4,1.110812,0.079096,0.055201,0.006189,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.981567,0.958525,0.967593,0.967593,0.953704,0.965796,0.009532,2
11,2.040621,0.141412,0.085012,0.025669,,300,"{'max_depth': None, 'n_estimators': 300}",0.976959,0.958525,0.967593,0.962963,0.958333,0.964875,0.006931,3
2,1.952752,0.068515,0.079252,0.00551,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.97235,0.953917,0.967593,0.972222,0.949074,0.963031,0.009695,4
5,2.140353,0.104392,0.077413,0.00559,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.976959,0.949309,0.967593,0.962963,0.958333,0.963031,0.009221,4


In [27]:
####### Word-Frequency ######## for No_Digit

rf = RandomForestClassifier()
param = {'n_estimators' : [10, 150, 300], #Kaç tane decision-tree build edileceği set edilir
        'max_depth' : [30, 60, 90, None]} #Her bir tree ne kadar derin(boyu) olacağı belirlenir

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1) # Cv kaç tane K-fold-cross-validation da ki K rakamını set eder
gs_fit = gs.fit(X_count_no_digit_feat, data['Label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]





Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
7,1.163794,0.029322,0.057647,0.005465,90.0,150,"{'max_depth': 90, 'n_estimators': 150}",0.9447,0.958525,0.958333,0.958333,0.953704,0.954719,0.005329,1
5,2.414782,0.033939,0.095608,0.005983,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.953917,0.953917,0.958333,0.958333,0.949074,0.954715,0.003443,2
11,2.063165,0.271811,0.067706,0.018668,,300,"{'max_depth': None, 'n_estimators': 300}",0.940092,0.958525,0.958333,0.962963,0.944444,0.952872,0.008921,3
10,1.212242,0.028531,0.066252,0.01247,,150,"{'max_depth': None, 'n_estimators': 150}",0.953917,0.949309,0.958333,0.958333,0.944444,0.952867,0.005375,4
8,2.331579,0.066809,0.084628,0.009249,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.949309,0.958525,0.958333,0.953704,0.944444,0.952863,0.005407,5


In [28]:
####### Word-Frequency ######## for Masked_Digit

rf = RandomForestClassifier()
param = {'n_estimators' : [10, 150, 300], #Kaç tane decision-tree build edileceği set edilir
        'max_depth' : [30, 60, 90, None]} #Her bir tree ne kadar derin(boyu) olacağı belirlenir

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1) # Cv kaç tane K-fold-cross-validation da ki K rakamını set eder
gs_fit = gs.fit(X_count_masked_digit_feat, data['Label'])
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]









Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
10,1.187116,0.053758,0.058346,0.003848,,150,"{'max_depth': None, 'n_estimators': 150}",0.976959,0.967742,0.962963,0.958333,0.962963,0.965792,0.006327,1
5,2.436645,0.061495,0.079651,0.009578,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.97235,0.963134,0.958333,0.967593,0.962963,0.964875,0.004749,2
11,2.019715,0.212993,0.067358,0.017982,,300,"{'max_depth': None, 'n_estimators': 300}",0.967742,0.958525,0.962963,0.967593,0.962963,0.963957,0.003436,3
4,1.270818,0.02895,0.064103,0.008104,60.0,150,"{'max_depth': 60, 'n_estimators': 150}",0.963134,0.963134,0.967593,0.953704,0.958333,0.961179,0.004749,4
2,2.141072,0.074989,0.086036,0.009322,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.967742,0.953917,0.962963,0.958333,0.958333,0.960258,0.00471,5






