In [1]:
import re
import unidecode
import string
import unicodedata
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.util import ngrams
from IPython.core.interactiveshell import InteractiveShell
# Show the value of every top-level expression statement in a cell,
# not only the last one. Later cells rely on this to display several
# results (e.g. type, shape and head) from a single cell.
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import sklearn

# Record which scikit-learn release produced the results in this notebook.
sklearn_version = sklearn.__version__
print('The scikit-learn version is {}.'.format(sklearn_version))

The scikit-learn version is 0.24.1.


# Load Data

In [4]:
# Read the training CSV from the working directory into a DataFrame.
df = pd.read_csv("spamraw_train.csv")
# Print column names, non-null counts and dtypes.
df.info()
# Display the first five rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        5000 non-null   int64 
 1   sms_text  5000 non-null   object
 2   spam      5000 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 117.3+ KB


Unnamed: 0,id,sms_text,spam
0,1,Hope you are having a good week. Just checking in,0
1,2,K..give back my thanks.,0
2,3,Am also doing in cbe only. But have to pay.,0
3,4,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1
4,5,okmail: Dear Dave this is your final notice to...,1


In [5]:
# Count how many rows carry each label value in the 'spam' column:
# position 0 holds the count of label 0, position 1 the count of label 1.
np.bincount(df['spam'])

array([4327,  673], dtype=int64)

# Build Pipeline

In [6]:
# Simple preprocessor.
# Input is a single document, as a single string.
# Output should be a single document, as a single string.

# Stop-word set used by the preprocessor: NLTK's English list, minus
# 'you', 'your', 'have', 'now' and 'won' (these stay in the text as
# tokens), plus 'also' (which is filtered out in addition).
# NOTE(review): 'not' and 'no' are part of NLTK's list and are therefore
# still filtered; remove them from this set if negation should be kept.
stop_words = set(stopwords.words('english'))
stop_words.difference_update({'you', 'your', 'have', 'now', 'won'})
stop_words.add('also')

# Shared lemmatizer instance, created once and reused for every token.
lemmer = WordNetLemmatizer()

def my_preprocess(doc):
    """Normalize one document into a space-separated string of lemmas.

    Steps, in order:
      1. Replace every character that is neither a word character nor
         whitespace with a space (Unicode-aware, so non-ASCII symbols are
         dropped before transliteration).
      2. Transliterate the remaining text to ASCII with ``unidecode``.
      3. Lowercase and strip non-word characters once more. Transliteration
         can itself produce uppercase letters and punctuation for
         characters that step 1 keeps (e.g. the vulgar fraction U+00BD is
         a word character for ``re`` but transliterates to text containing
         ``/``), which would otherwise leak into the tokens.
      4. Split on whitespace, drop tokens present in ``stop_words`` and
         lemmatize the rest with ``lemmer``.

    Parameters
    ----------
    doc : str
        A single raw document.

    Returns
    -------
    str
        The cleaned document, tokens joined by single spaces.
    """
    # First pass: drop punctuation and symbols.
    doc = re.sub(r'[^\w\s]', ' ', doc)

    # Fold accented / non-Latin word characters to ASCII.
    doc = unidecode.unidecode(doc)

    # Second pass: lowercase after transliteration (it may emit capitals)
    # and remove any punctuation the transliteration introduced.
    doc = re.sub(r'[^\w\s]', ' ', doc.lower())

    tokens = [lemmer.lemmatize(w) for w in doc.split() if w not in stop_words]
    return ' '.join(tokens)

In [7]:

# Clean every message element-wise and keep the result next to the raw text.
df['sms_text_clean'] = df['sms_text'].map(my_preprocess)
# Eyeball the first 100 raw/clean pairs.
df.head(100)

Unnamed: 0,id,sms_text,spam,sms_text_clean
0,1,Hope you are having a good week. Just checking in,0,hope you good week checking
1,2,K..give back my thanks.,0,k give back thanks
2,3,Am also doing in cbe only. But have to pay.,0,cbe have pay
3,4,"complimentary 4 STAR Ibiza Holiday or £10,000 ...",1,complimentary 4 star ibiza holiday 10 000 cash...
4,5,okmail: Dear Dave this is your final notice to...,1,okmail dear dave your final notice collect you...
...,...,...,...,...
95,96,Dear reached railway. What happen to you,0,dear reached railway happen you
96,97,Probably gonna swing by in a wee bit,0,probably gonna swing wee bit
97,98,K k pa Had your lunch aha.,0,k k pa your lunch aha
98,99,Whom you waited for yesterday,0,you waited yesterday


In [8]:
from sklearn.model_selection import train_test_split

# Features are the cleaned messages; the target is the 0/1 'spam' column.
X, y = df['sms_text_clean'], df['spam']

# Hold out 20% of the rows for validation; the fixed seed makes the split
# repeatable.
# NOTE(review): the split is not stratified although the label counts are
# imbalanced; consider passing stratify=y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Sanity-check the training features: container type, size, first rows.
type(X_train)
X_train.shape
X_train.head()

# Same checks for the training labels; the index should line up with X_train.
type(y_train)
y_train.shape
y_train.head()

pandas.core.series.Series

(4000,)

4227    cashbin co uk get lot cash weekend www cashbin...
4676                            free call sir waiting you
800     keep safe need you miss you already envy every...
3671                   you call tell now infront call now
4193                         reach home safe n sound liao
Name: sms_text_clean, dtype: object

pandas.core.series.Series

(4000,)

4227    1
4676    0
800     0
3671    0
4193    0
Name: spam, dtype: int64

In [21]:
import time
start = time.time()

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
import lightgbm as lgb

# TF-IDF over word 1- to 3-grams. Terms must appear in at least 15 documents
# and in at most 30% of them; only the 600 most frequent terms are kept.
# ngram_range is passed as a tuple, which is the documented type.
vectorizer = TfidfVectorizer(min_df=15, max_df=.3, max_features=600, ngram_range=(1, 3))
lgbm = lgb.LGBMClassifier()

# Search space: 15 values of num_leaves (15..29) x 10 values of
# colsample_bytree (0.6..1.0) = 150 candidates; the other entries are fixed.
# Knobs explored earlier and currently left at their defaults:
# subsample_for_bin, min_child_samples, reg_lambda, subsample.
num_leaves = np.arange(15, 30, 1).tolist()
param_grid = {
    'boosting_type': ['gbdt'],
    'num_leaves': num_leaves,
    'learning_rate': [0.1],
    'colsample_bytree': (np.linspace(0.6, 1, 10)).tolist(),
    'is_unbalance': [False]
}

# Exhaustive grid search (GridSearchCV, despite the `rs_cv` name) with
# 5-fold CV, selecting on the F1 score of the positive class.
rs_cv = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, verbose=0, scoring='f1')

# The search is the pipeline's final step, so the vectorizer is fitted once
# on all of X_train and the 5 folds are cut from the vectorized matrix.
pipe = Pipeline([('cv', vectorizer), ('clf', rs_cv)])
pipe.fit(X_train, y_train)

# Elapsed wall-clock seconds for the whole cell (imports + fit). The former
# bare `%time` line had no statement attached and always reported 0 ns.
end = time.time()
print(end - start)

Wall time: 0 ns


Pipeline(steps=[('cv',
                 TfidfVectorizer(max_df=0.3, max_features=600, min_df=15,
                                 ngram_range=[1, 3])),
                ('clf',
                 GridSearchCV(cv=5, estimator=LGBMClassifier(),
                              param_grid={'boosting_type': ['gbdt'],
                                          'colsample_bytree': [0.6,
                                                               0.6444444444444444,
                                                               0.6888888888888889,
                                                               0.7333333333333333,
                                                               0.7777777777777778,
                                                               0.8222222222222222,
                                                               0.8666666666666667,
                                                               0.9111111111111111,
                                          

205.9205677509308


In [20]:
# Preview the ten evenly spaced values (0.6 to 1.0 inclusive) that the
# 'colsample_bytree' entry of param_grid is built from.
(np.linspace(0.6, 1, 10))

array([0.6       , 0.64444444, 0.68888889, 0.73333333, 0.77777778,
       0.82222222, 0.86666667, 0.91111111, 0.95555556, 1.        ])

In [22]:
# Parameter combination that achieved the best mean cross-validated score
# in the grid search fitted above.
rs_cv.best_params_

{'boosting_type': 'gbdt',
 'colsample_bytree': 0.6,
 'is_unbalance': False,
 'learning_rate': 0.1,
 'num_leaves': 15}

# Estimate Model Performance

In [24]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, f1_score

# Predict on the held-out validation messages; the pipeline vectorizes the
# text and delegates to the refitted best estimator of the grid search.
pred_val = pipe.predict(X_val)

# Rows are true labels, columns are predicted labels.
print("Confusion matrix:")
print(confusion_matrix(y_true=y_val, y_pred=pred_val))

# Macro average: unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(y_true=y_val, y_pred=pred_val, average="macro")
print("\nF1 Score = {:.5f}".format(macro_f1))

print("\nClassification Report:")
print(classification_report(y_true=y_val, y_pred=pred_val))

Confusion matrix:
[[866   4]
 [ 14 116]]

F1 Score = 0.95886

Classification Report:
              precision    recall  f1-score   support

           0       0.98      1.00      0.99       870
           1       0.97      0.89      0.93       130

    accuracy                           0.98      1000
   macro avg       0.98      0.94      0.96      1000
weighted avg       0.98      0.98      0.98      1000

