`April 18, 2022`

### **Text preprocessing & classification**

In [89]:
#! pip install nltk
#! pip install contractions

Collecting contractions
  Downloading contractions-0.1.68-py2.py3-none-any.whl (8.1 kB)
Collecting textsearch>=0.0.21
  Downloading textsearch-0.0.21-py2.py3-none-any.whl (7.5 kB)
Collecting anyascii
  Downloading anyascii-0.3.1-py3-none-any.whl (287 kB)
Collecting pyahocorasick
  Downloading pyahocorasick-1.4.4-cp39-cp39-win_amd64.whl (39 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.1 contractions-0.1.68 pyahocorasick-1.4.4 textsearch-0.0.21


In [83]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# Text preprocessing
import re
import nltk
from string import punctuation
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline


# nltk.download('stopwords')
# nltk.download('punkt')

In [3]:
df = pd.read_csv('sms_spam.csv')
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


### **Text preprocessing part 1**

`Convert to lowercase`

In [5]:
df['message_clean'] = df['message'].copy().str.lower() # df['message_clean'].apply(lambda x: x.lower())

In [6]:
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


`Remove numbers`

In [7]:
def remove_numbers(text):
    output = ''.join(char for char in text if not char.isdigit())
    return output

df['message_clean'] = df.loc[:, 'message_clean'].apply(remove_numbers)
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i don't think he goes to usf, he lives aro..."


`Contractions treatment`

In [8]:
contractions = { 
"ain't": "am not / are not / is not / has not / have not",
"aren't": "are not / am not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he had / he would",
"he'd've": "he would have",
"he'll": "he shall / he will",
"he'll've": "he shall have / he will have",
"he's": "he has / he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how has / how is / how does",
"I'd": "I had / I would",
"I'd've": "I would have",
"I'll": "I shall / I will",
"I'll've": "I shall have / I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it had / it would",
"it'd've": "it would have",
"it'll": "it shall / it will",
"it'll've": "it shall have / it will have",
"it's": "it has / it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she had / she would",
"she'd've": "she would have",
"she'll": "she shall / she will",
"she'll've": "she shall have / she will have",
"she's": "she has / she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as / so is",
"that'd": "that would / that had",
"that'd've": "that would have",
"that's": "that has / that is",
"there'd": "there had / there would",
"there'd've": "there would have",
"there's": "there has / there is",
"they'd": "they had / they would",
"they'd've": "they would have",
"they'll": "they shall / they will",
"they'll've": "they shall have / they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we had / we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what shall / what will",
"what'll've": "what shall have / what will have",
"what're": "what are",
"what's": "what has / what is",
"what've": "what have",
"when's": "when has / when is",
"when've": "when have",
"where'd": "where did",
"where's": "where has / where is",
"where've": "where have",
"who'll": "who shall / who will",
"who'll've": "who shall have / who will have",
"who's": "who has / who is",
"who've": "who have",
"why's": "why has / why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you had / you would",
"you'd've": "you would have",
"you'll": "you shall / you will",
"you'll've": "you shall have / you will have",
"you're": "you are",
"you've": "you have"
}

# Use regex to find the pattern
contractions_re = re.compile('(%s)' % '|'.join(contractions.keys()))

# Return to base form
def expand_contractions(text, contractions=contractions):
    def replace(match):
        return contractions[match.group(0)]
    return contractions_re.sub(replace, text)

Alternative for contractions treatment by using a library: https://github.com/kootenpv/contractions

In [9]:
df['message_clean'] = df.loc[:, 'message_clean'].apply(expand_contractions)
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...","go until jurong point, crazy.. available only ..."
1,ham,Ok lar... Joking wif u oni...,ok lar... joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor... u c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro...","nah i do not think he goes to usf, he lives ar..."


`Remove punctuation`

In [10]:
def remove_punctuation(text):
    output = ''.join(char for char in text if char not in punctuation)
    return output

df['message_clean'] = df.loc[:, 'message_clean'].apply(remove_punctuation)
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i do not think he goes to usf he lives aro...


`Remove white spaces`

In [11]:
def strip_text(text):
    output = ' '.join(text.split())
    return output
    
df['message_clean'] = df.loc[:, 'message_clean'].apply(strip_text)
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in a wkly comp to win fa cup final ...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i do not think he goes to usf he lives aro...


`Remove stopwords`

In [12]:
english_stopwords = stopwords.words('english')

def remove_stopwords(text, stopwords=english_stopwords):
    output = ' '.join([word for word in nltk.word_tokenize(text) if word not in stopwords])
    return output

df['message_clean'] = df.loc[:, 'message_clean'].apply(remove_stopwords)
df.head()

Unnamed: 0,label,message,message_clean
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though


### **Text preprocessing part 2**

`Stemming`

In [13]:
# Stemming increases recall score, decreases precision (supress false negative)
english_stemmer = SnowballStemmer('english')

def stem_word(text, stemmer=english_stemmer):
    output = ' '.join([stemmer.stem(word) for word in nltk.word_tokenize(text)])
    return output

df['message_clean_stem'] = df.loc[:, 'message_clean'].apply(stem_word)
df.head()

Unnamed: 0,label,message,message_clean,message_clean_stem
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,free entri wkli comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,nah think goe usf live around though


In [14]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\maulialwan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

`Lemmatization`

In [15]:
# Lemmatization with part-of-speech tagging (to define adjective, verb, noun, adverb)

def get_pos(word):
    tag_dict = {
        'A': wordnet.ADJ,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
        'R': wordnet.ADV
    }

    tag = nltk.pos_tag([word])[0][1][0].upper()
    return tag_dict.get(tag, wordnet.NOUN)

# Lemmatization increases precision score, decreases recall (supress false positive)
def lemmatize_word(text):
    lemmatizer = WordNetLemmatizer()
    output = ' '.join([lemmatizer.lemmatize(word, pos=get_pos(word)) for word in nltk.word_tokenize(text)])
    return output

df['message_clean_lemma'] = df.loc[:, 'message_clean'].apply(lemmatize_word)
df.head() 

Unnamed: 0,label,message,message_clean,message_clean_stem,message_clean_lemma
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,go jurong point crazi avail bugi n great world...,go jurong point crazy available bugis n great ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,ok lar joke wif u oni,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts st ...,free entri wkli comp win fa cup final tkts st ...,free entry wkly comp win fa cup final tkts st ...
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,u dun say earli hor u c alreadi say,u dun say early hor u c already say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah think goes usf lives around though,nah think goe usf live around though,nah think go usf life around though


### **Modeling**

In [17]:
df['label'].value_counts(True)

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [18]:
X = df['message_clean_lemma']
y = np.where(df['label'] == 'spam', 1, 0)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

`IDM`

In [37]:
# Generate document term matrix (DTM)

# Term-Frequency (TF)

tf_vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train_tf = tf_vectorizer.fit_transform(X_train).toarray()
X_test_tf = tf_vectorizer.transform(X_test).toarray()

In [38]:
tf = tf_vectorizer.get_feature_names_out()

In [39]:
tf.shape

(6544,)

`TF-IDF`

In [73]:
# TF-IDF
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1))
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

In [74]:
df_tfidf = pd.DataFrame(
    data=X_train_tfidf,
    columns=tfidf_vectorizer.get_feature_names_out()
)

In [75]:
df_tfidf.head()

Unnamed: 0,aa,aah,aaniye,aathilove,aathiwhere,ab,abbey,abdomen,abeg,abelu,...,zealand,zed,zero,zhong,zindgi,zoe,zogtorius,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
df_tfidf.loc[0].sort_values(ascending=False)

close         0.761874
guy           0.647725
aa            0.000000
previously    0.000000
print         0.000000
                ...   
getzedcouk    0.000000
gettin        0.000000
getstop       0.000000
getsleep      0.000000
〨ud           0.000000
Name: 0, Length: 6544, dtype: float64

In [77]:
# Modeling
logreg = LogisticRegression(random_state=42)
logreg.fit(X_train_tfidf, y_train)

LogisticRegression(random_state=42)

In [78]:
logreg.score(X_train_tfidf, y_train)

0.9674669059905766

In [79]:
logreg.score(X_test_tfidf, y_test)

0.9668161434977578

In [80]:
y_pred = logreg.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



### **Modeling with pipeline**

In [81]:
X_train

184                                             guy close
2171    please come imin towndontmatter urgoin outlrju...
5422                            ok ksry knw sivatats askd
4113                                  ill see prolly yeah
4588          ill see swing bit get thing take care firsg
                              ...                        
1932                                      pa tell go bath
5316                                jus finish watch tv u
2309    moby pub quizwin £ high street prize u know ne...
1904    free entry weekly comp chance win ipod txt pod...
762     grandma oh dear u still ill felt shit morning ...
Name: message_clean_lemma, Length: 4457, dtype: object

In [84]:
model = LogisticRegression(random_state=42)
vectorizer = TfidfVectorizer()

prep_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('model', model)
])

param_grid = {
    'model__C': [1.0, 0.1, 0.01],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)]
}

grid_search = GridSearchCV(
    prep_pipeline,
    param_grid,
    cv=5,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                                       ('model',
                                        LogisticRegression(random_state=42))]),
             n_jobs=-1,
             param_grid={'model__C': [1.0, 0.1, 0.01],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)]})

In [86]:
grid_search.best_params_

{'model__C': 1.0, 'vectorizer__ngram_range': (1, 1)}

In [87]:
best_model = grid_search.best_estimator_
best_model.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('model', LogisticRegression(random_state=42))])

In [88]:
y_pred_tuning = best_model.predict(X_test)

print(classification_report(y_test, y_pred_tuning))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       966
           1       1.00      0.75      0.86       149

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

