Import libraries and modules

In [1]:
import re
from collections import Counter


from urlextract import URLExtract
import nltk
import numpy as np
import pandas as pd
import random
import torch

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import StratifiedShuffleSplit

Simple datamodel class

In [2]:
class Document:
    def __init__(self, raw_text, label=None, tokens=None):
        self._raw_text = raw_text
        self._label = label
        self._tokens = None
        if tokens is not None:
            self._tokens = list(tokens)
    
    @property
    def raw_text(self):
        return self._raw_text
    
    @property
    def label(self):
        return self._label
    
    @property
    def tokens(self):
        return list(self._tokens)
    
    def tokenized(self, tokenizer):
        return Document(self._raw_text, self._label, tokenizer.tokenize(self._raw_text))
    
    def __repr__(self):
        return f"{self._label}\t{self._raw_text}\t{self._tokens if self._tokens is not None else ''}\n"

Useful function to get iterator over docs labels

In [3]:
def docs_labels(docs):
    return (d.label for d in docs)

Load raw data in our datamodel

In [4]:
def load_dataset(path):
    df = pd.read_csv(path, delimiter=',', encoding='latin-1')
    return [Document(text, label) for text, label in zip(df.v2, df.v1)]

In [5]:
docs = load_dataset("spam.csv")

Check data

In [6]:
docs[:5]

[ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...	,
 ham	Ok lar... Joking wif u oni...	,
 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's	,
 ham	U dun say so early hor... U c already then say...	,
 ham	Nah I don't think he goes to usf, he lives around here though	]

In [7]:
Counter(docs_labels(docs))

Counter({'ham': 4825, 'spam': 747})

Let's check SMS that we are interested to find

In [8]:
for i, doc in enumerate(d for d in docs if d.label == "spam"):
    if i >= 25:
        break
    print(doc.raw_text)
    print()

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv

WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.

Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030

SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info

URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18

XXXMobileMovieClub: To use your credit, click the WAP link in the next txt messag

So many digits and urls. Let's use twitter-based NLTK tokenizer and do some preprocessing: change all digits to one 0 and change all urls to 'href' token

In [9]:
class TwitterNLTKPreProcessingTokenizer:
    DIGITS_RE = re.compile(r'[0-9]+')
    
    def __init__(self):
        nltk.download('punkt')
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)
        self.url_extractor = URLExtract()
        
    def tokenize(self, text):
        text = self.DIGITS_RE.sub('0', text)
        tokens = []
        for sent in self.sent_detector.tokenize(text):
            urls = self.url_extractor.find_urls(sent, only_unique=True)
            for url in urls:
                sent = sent.replace(url, "<href>")
            tokens.extend(self.tokenizer.tokenize(sent))            
                
        return tokens

Tokenize whole dataset

In [10]:
tokenizer = TwitterNLTKPreProcessingTokenizer()
docs = [d.tokenized(tokenizer) for d in docs]

[nltk_data] Downloading package punkt to /home/vladislav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Check tokenization

In [11]:
for d in docs[:5]:
    print(d)

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...	['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']

ham	Ok lar... Joking wif u oni...	['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']

spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's	['free', 'entry', 'in', '0', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '0st', 'may', '0', '.', 'text', 'fa', 'to', '0', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', "c's", 'apply', '0over0', "'", 's']

ham	U dun say so early hor... U c already then say...	['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...']

ham	Nah I don't think he goes to us

Let's do simple TFIDF features for our small texts, may be we can find some keywords through them

In [12]:
class TfIDFFeatureExtractor:
    def __init__(self, docs):
        self._vectorizer = TfidfVectorizer(
            token_pattern=None,
            tokenizer=self._tokenize_doc,
            stop_words='english', min_df=0.001,
            preprocessor=self._identity
        )
        self._vectorizer.fit(docs)
    
    @property
    def vector_length(self):
        return len(self._vectorizer.vocabulary_)
    
    def vectorize_docs(self, docs):
        return [f.toarray()[0] for f in self._vectorizer.transform(docs)]
        
    @staticmethod
    def _tokenize_doc(doc):
        return doc.tokens
    
    @staticmethod
    def _identity(x):
        return x

Function to iterate over dataset, generate features and collect samples into batches

In [13]:
from itertools import repeat

def batcherize(docs, feature_extractor, batch_size, shuffle=True, labels=True):
    if shuffle:
        docs = list(docs)
        random.shuffle(docs)
    
    samples = feature_extractor.vectorize_docs(docs)
    labels = docs_labels(docs) if labels else repeat(None)
    
    samples_queue, labels_queue = [], []
    
    for sample, label in zip(samples, labels):
        if len(samples_queue) == batch_size:
            yield samples_queue, labels_queue
            samples_queue, labels_queue = [], []
        
        samples_queue.append(sample)
        labels_queue.append(1 if label == "spam" else 0)
    
    if samples_queue:
        yield samples_queue, labels_queue

Let's evaluate with F1 because we have imbalanced classes

In [14]:
def evaluate(predicted_labels, gold_labels):
    def to_int(labels, positive_label="spam"):
        return [int(l == positive_label) for l in labels]
    gold_labels, predicted_labels = to_int(gold_labels), to_int(predicted_labels)
    tp, tn, fp, fn = 0, 0, 0, 0
    
    for g, p in zip(gold_labels, predicted_labels):
        if g == 1 and p == 1:
            tp += 1
        elif g == 0 and p == 0:
            tn += 1        
        elif g == 1 and p == 0:
            fn += 1
        elif g == 0 and p == 1:
            fp += 1
    
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall) if tp > 0 else 0
    
    scores = {
        "tp": tp, "tn": tn, "fp": fp, "fn": fn,
        "accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1,
    }
    
    return f1, scores

Simple pytorch multilayered perceptron network for binary classification with sigmoid output

In [15]:
class BinaryClassificationMLP(torch.nn.Module):
    def __init__(self, input_size: int, hidden_sizes: list, activation=torch.nn.LeakyReLU):
        super().__init__()
        
        self._activation = activation()
        self._hidden_layers = []
                
        prev_size = input_size
        for size in hidden_sizes:
            layer = torch.nn.Linear(prev_size, size)
            self._hidden_layers.append(layer)
            prev_size = size
        
        self._out_layer = torch.nn.Linear(prev_size, 1)
        self._out_sigmoid = torch.nn.Sigmoid()
    
    def forward(self, sample):
        for layer in self._hidden_layers:
            sample = self._activation(layer(sample))
        
        return self._out_sigmoid(self._out_layer(sample))            

We need docs to train and to evaluate, let's split them. Also we are accurate with class imbalance

In [16]:
def train_dev_test_split(docs, test_ratio, dev_ratio, random_seed=100):
    def index_docs(docs, indexes):
        return [docs[idx] for idx in indexes]
        
    train, dev_test = iter(next(
        StratifiedShuffleSplit(1, test_ratio + dev_ratio, random_state=random_seed).split(
            docs, list(docs_labels(docs)))))
    
    train_docs = index_docs(docs, train)
    dev_test_docs = index_docs(docs, dev_test)
    
    dev, test = iter(next(
        StratifiedShuffleSplit(1, test_ratio / (dev_ratio + test_ratio), random_state=random_seed).split(
            dev_test_docs, list(docs_labels(dev_test_docs)))))
    
    return train_docs, index_docs(dev_test_docs, dev), index_docs(dev_test_docs, test)

In [17]:
train, dev, test = train_dev_test_split(docs, 0.15, 0.15)

Function to get predictions from our model

In [18]:
def predict(model, dev, batcher_factory):
    predictions = []
    
    with torch.no_grad():
        for samples, _ in batcher_factory(dev, False, False):
            outputs = model(torch.tensor(samples, dtype=torch.float32))
            predictions.extend(l[0] for l in outputs.numpy())
    return ["spam" if p > 0.5 else "ham" for p in predictions]

Function to train model, returns max score we got on dev set

In [19]:
def train_model(epoch_num, train, dev, batcher_factory, model, optimizer, criterion):
    max_dev_score = 0
    other_dev_scores_with_max = None
    
    for epoch in range(epoch_num):
        running_loss = 0.0
        for i, (samples, labels) in enumerate(batcher_factory(train, True, True)):
            optimizer.zero_grad()
            outputs = model(torch.tensor(samples, dtype=torch.float32))
            loss = criterion(outputs, torch.tensor(labels, dtype=torch.float32).unsqueeze_(-1))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            if i % 300 == 299:
                print('[%d, %5d] loss: %.3f' %
                      (epoch + 1, i + 1, running_loss / 2000))
                running_loss = 0.0
        print(f"train F1 = {evaluate(predict(model, train, batcher_factory), docs_labels(train))[0]}")
        dev_score, other_scores = evaluate(predict(model, dev, batcher_factory), docs_labels(dev))
        print(f"dev F1 = {dev_score}")
        
        if dev_score > max_dev_score:        
            max_dev_score = dev_score
            other_dev_scores_with_max = other_scores
    
    return max_dev_score, other_dev_scores_with_max

Factory for our batcher with feature extractor. Need to decompose batcher and feature extractor but not enough motivation to do it in pet project :)

In [20]:
def get_batcher_factory(feature_extractor, batch_size):
    return lambda docs, shuffle, labels: batcherize(docs, feature_extractor, batch_size, shuffle, labels)

Experiment pipeline. Evaluate model with 5 seeds and make results reproducible by setting seeds everywhere

In [21]:
seed_num = 5

epoch_num = 20
hidden_sizes = [200]
learning_rate = 0.03
batch_size = 6
dev_main_scores = []
dev_all_scores = []

for seed in range(100, 100*(seed_num + 1), 100):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    
    feature_extractor = TfIDFFeatureExtractor(train)
    model = BinaryClassificationMLP(feature_extractor.vector_length, hidden_sizes, torch.nn.Tanh)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    max_dev_score, other_scores = train_model(
        epoch_num, train, dev, get_batcher_factory(feature_extractor, batch_size), model, optimizer, criterion)
    dev_main_scores.append(max_dev_score)
    dev_all_scores.append(other_scores)

print()
print(f'mean_score = {np.mean(dev_main_scores)}, std = {np.std(dev_main_scores)}')

[1,   300] loss: 0.048
[1,   600] loss: 0.038
train F1 = 0.6169665809768637
dev F1 = 0.6628571428571429
[2,   300] loss: 0.028
[2,   600] loss: 0.025
train F1 = 0.7671541057367829
dev F1 = 0.792079207920792
[3,   300] loss: 0.024
[3,   600] loss: 0.021
train F1 = 0.7951807228915663
dev F1 = 0.8155339805825244
[4,   300] loss: 0.021
[4,   600] loss: 0.020
train F1 = 0.801317233809001
dev F1 = 0.8195121951219512
[5,   300] loss: 0.019
[5,   600] loss: 0.019
train F1 = 0.8180839612486545
dev F1 = 0.8285714285714286
[6,   300] loss: 0.018
[6,   600] loss: 0.018
train F1 = 0.7968923418423973
dev F1 = 0.8078817733990148
[7,   300] loss: 0.015
[7,   600] loss: 0.017
train F1 = 0.8392484342379959
dev F1 = 0.839622641509434
[8,   300] loss: 0.016
[8,   600] loss: 0.016
train F1 = 0.8841285296981499
dev F1 = 0.8558558558558559
[9,   300] loss: 0.016
[9,   600] loss: 0.015
train F1 = 0.8467153284671532
dev F1 = 0.8544600938967137
[10,   300] loss: 0.015
[10,   600] loss: 0.015
train F1 = 0.846315

train F1 = 0.8811475409836066
dev F1 = 0.8598130841121495
[20,   300] loss: 0.013
[20,   600] loss: 0.014
train F1 = 0.888
dev F1 = 0.8623853211009174
[1,   300] loss: 0.048
[1,   600] loss: 0.033
train F1 = 0.5629228687415426
dev F1 = 0.6235294117647059
[2,   300] loss: 0.027
[2,   600] loss: 0.021
train F1 = 0.7800453514739228
dev F1 = 0.7939698492462313
[3,   300] loss: 0.020
[3,   600] loss: 0.020
train F1 = 0.8411016949152542
dev F1 = 0.8155339805825244
[4,   300] loss: 0.018
[4,   600] loss: 0.018
train F1 = 0.8511087645195353
dev F1 = 0.8235294117647057
[5,   300] loss: 0.016
[5,   600] loss: 0.017
train F1 = 0.8577449947312962
dev F1 = 0.8235294117647057
[6,   300] loss: 0.016
[6,   600] loss: 0.015
train F1 = 0.8857142857142857
dev F1 = 0.8638497652582159
[7,   300] loss: 0.014
[7,   600] loss: 0.014
train F1 = 0.8699271592091571
dev F1 = 0.8390243902439024
[8,   300] loss: 0.013
[8,   600] loss: 0.016
train F1 = 0.8992094861660079
dev F1 = 0.8909090909090909
[9,   300] loss: 

0.87 F1 is good result! Pipeline works even without complex features and complex classifiers. We can go further with hyperparameter search, deeper networks and final test evaluation but it's just routine

Let's check other scores

In [22]:
for metric in ["accuracy", "precision", "recall", "f1"]:
    print(f'mean {metric} = {np.mean([sc[metric] for sc in dev_all_scores])}')

mean accuracy = 0.9677033492822966
mean precision = 0.9047732279322315
mean recall = 0.8482142857142858
mean f1 = 0.8755175465629914


Accuracy is very high. That's why we used F1 not accuracy in evaluation

Let's try without our text preprocessing

In [23]:
class TwitterNLTKTokenizer:
    def __init__(self):
        nltk.download('punkt')
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)
        
    def tokenize(self, text):
        tokens = []
        for sent in self.sent_detector.tokenize(text):
            tokens.extend(self.tokenizer.tokenize(sent))            
                
        return tokens

In [24]:
tokenizer = TwitterNLTKTokenizer()
train = [d.tokenized(tokenizer) for d in train]
dev = [d.tokenized(tokenizer) for d in dev]
test = [d.tokenized(tokenizer) for d in test]

[nltk_data] Downloading package punkt to /home/vladislav/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [25]:
seed_num = 5

epoch_num = 20
hidden_sizes = [200]
learning_rate = 0.03
batch_size = 6
dev_main_scores = []
dev_all_scores = []

for seed in range(100, 100*(seed_num+1), 100):
    np.random.seed(seed)
    torch.manual_seed(seed)
    random.seed(seed)
    
    feature_extractor = TfIDFFeatureExtractor(train)
    model = BinaryClassificationMLP(feature_extractor.vector_length, hidden_sizes, torch.nn.Tanh)
    criterion = torch.nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    max_dev_score, other_scores = train_model(
        epoch_num, train, dev, get_batcher_factory(feature_extractor, batch_size), model, optimizer, criterion)
    dev_main_scores.append(max_dev_score)
    dev_all_scores.append(other_scores)

print()
print(f'mean_score = {np.mean(dev_main_scores)}, std = {np.std(dev_main_scores)}')

[1,   300] loss: 0.053
[1,   600] loss: 0.048
train F1 = 0.17132867132867133
dev F1 = 0.2222222222222222
[2,   300] loss: 0.038
[2,   600] loss: 0.033
train F1 = 0.623207301173403
dev F1 = 0.5925925925925924
[3,   300] loss: 0.032
[3,   600] loss: 0.028
train F1 = 0.6905055487053021
dev F1 = 0.6628571428571429
[4,   300] loss: 0.028
[4,   600] loss: 0.026
train F1 = 0.7339667458432304
dev F1 = 0.6961325966850829
[5,   300] loss: 0.026
[5,   600] loss: 0.025
train F1 = 0.7786606129398409
dev F1 = 0.7434554973821988
[6,   300] loss: 0.025
[6,   600] loss: 0.024
train F1 = 0.7369668246445499
dev F1 = 0.7032967032967032
[7,   300] loss: 0.023
[7,   600] loss: 0.023
train F1 = 0.8083242059145673
dev F1 = 0.7817258883248731
[8,   300] loss: 0.021
[8,   600] loss: 0.022
train F1 = 0.8343949044585988
dev F1 = 0.7980295566502463
[9,   300] loss: 0.022
[9,   600] loss: 0.022
train F1 = 0.819848975188781
dev F1 = 0.79
[10,   300] loss: 0.022
[10,   600] loss: 0.021
train F1 = 0.7950450450450451
d

train F1 = 0.829059829059829
dev F1 = 0.7804878048780488
[20,   300] loss: 0.018
[20,   600] loss: 0.019
train F1 = 0.8412537917087968
dev F1 = 0.7945205479452055
[1,   300] loss: 0.054
[1,   600] loss: 0.046
train F1 = 0.16783216783216784
dev F1 = 0.14876033057851243
[2,   300] loss: 0.038
[2,   600] loss: 0.033
train F1 = 0.572972972972973
dev F1 = 0.6627218934911242
[3,   300] loss: 0.030
[3,   600] loss: 0.029
train F1 = 0.6749999999999999
dev F1 = 0.7403314917127072
[4,   300] loss: 0.028
[4,   600] loss: 0.026
train F1 = 0.7211538461538463
dev F1 = 0.7675675675675675
[5,   300] loss: 0.024
[5,   600] loss: 0.026
train F1 = 0.7032967032967034
dev F1 = 0.7540983606557377
[6,   300] loss: 0.025
[6,   600] loss: 0.022
train F1 = 0.7662037037037036
dev F1 = 0.8186528497409328
[7,   300] loss: 0.022
[7,   600] loss: 0.024
train F1 = 0.7854710556186152
dev F1 = 0.8247422680412371
[8,   300] loss: 0.021
[8,   600] loss: 0.023
train F1 = 0.8052516411378555
dev F1 = 0.8442211055276381
[9, 

0.82 < 0.87. That's why preprocessing is crucial on such noisy data

In [26]:
for metric in ["accuracy", "precision", "recall", "f1"]:
    print(f'mean {metric} = {np.mean([sc[metric] for sc in dev_all_scores])}')

mean accuracy = 0.9555023923444976
mean precision = 0.8892948881971563
mean recall = 0.7660714285714286
mean f1 = 0.8222050625838389
