In [4]:
import re
from collections import Counter


from urlextract import URLExtract
import nltk
import pandas as pd

In [59]:
class Document:
    """A text sample with an optional label and an optional token list.

    Instances are treated as read-only: state is exposed through properties,
    and tokenization produces a new ``Document`` rather than mutating this one.

    Args:
        raw_text: The original, unprocessed text.
        label: The class label of the sample, or ``None`` if unlabelled.
        tokens: An iterable of tokens, or ``None`` if the document has not
            been tokenized yet. The tokens are copied into a new list.
    """

    def __init__(self, raw_text, label=None, tokens=None):
        self._raw_text = raw_text
        self._label = label
        # Copy so that later mutation of the caller's sequence cannot leak in.
        self._tokens = list(tokens) if tokens is not None else None

    @property
    def raw_text(self):
        """The original text passed to the constructor."""
        return self._raw_text

    @property
    def label(self):
        """The label passed to the constructor (may be ``None``)."""
        return self._label

    @property
    def tokens(self):
        """A fresh copy of the token list, or ``None`` if not tokenized.

        Returning ``None`` mirrors the ``tokens=None`` constructor argument;
        previously this accessor failed with ``TypeError`` ('NoneType' object
        is not iterable) on an untokenized document.
        """
        if self._tokens is None:
            return None
        # Hand out a copy so callers cannot mutate the internal state.
        return list(self._tokens)

    def tokenized(self, tokenizer):
        """Return a new ``Document`` whose tokens are ``tokenizer.tokenize(raw_text)``.

        The raw text and label are carried over unchanged.
        """
        return Document(self._raw_text, self._label, tokenizer.tokenize(self._raw_text))

    def __repr__(self):
        # Tab-separated label, text and tokens; the token field is left empty
        # when the document has not been tokenized.
        return f"{self._label}\t{self._raw_text}\t{self._tokens if self._tokens is not None else ''}\n"

In [144]:
def docs_labels(docs):
    """Lazily yield the ``label`` attribute of every document in ``docs``.

    Returns a generator, so it can be consumed only once; wrap it in
    ``list(...)`` when the labels are needed more than once.
    """
    labels = (document.label for document in docs)
    return labels

In [60]:
def load_dataset(path):
    """Read a comma-separated, latin-1 encoded file into a list of ``Document``s.

    The text of each document is taken from column ``v2`` and its label from
    column ``v1``; documents are returned in file order and are not tokenized.
    """
    frame = pd.read_csv(path, delimiter=',', encoding='latin-1')
    documents = []
    for text, label in zip(frame["v2"], frame["v1"]):
        documents.append(Document(text, label))
    return documents

In [61]:
# Load every labelled message from spam.csv (path is relative to the working directory).
docs = load_dataset("spam.csv")

In [62]:
# Peek at the first five documents (rendered through Document.__repr__).
docs[:5]

[ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...	,
 ham	Ok lar... Joking wif u oni...	,
 spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's	,
 ham	U dun say so early hor... U c already then say...	,
 ham	Nah I don't think he goes to usf, he lives around here though	]

In [145]:
# Class balance: number of documents per label.
Counter(docs_labels(docs))

Counter({'ham': 4825, 'spam': 747})

In [64]:
# Print the raw text of the first 25 spam-labelled documents, separated by blank lines.
for i, doc in enumerate(d for d in docs if d.label == "spam"):
    if i >= 25:  # stop once 25 spam documents have been printed
        break
    print(doc.raw_text)
    print()

Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's

FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, å£1.50 to rcv

WINNER!! As a valued network customer you have been selected to receivea å£900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.

Had your mobile 11 months or more? U R entitled to Update to the latest colour mobiles with camera for Free! Call The Mobile Update Co FREE on 08002986030

SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info

URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18

XXXMobileMovieClub: To use your credit, click the WAP link in the next txt messag

In [65]:
class TwitterNLTKTokenizer:
    """Tokenizer that normalizes digits and URLs before NLTK tweet tokenization.

    Pipeline applied by :meth:`tokenize`:

    1. every run of ASCII digits is replaced by a single ``0``;
    2. the text is split into sentences with the NLTK punkt model;
    3. every URL found in a sentence is replaced by the placeholder ``<href>``;
    4. each sentence is tokenized with ``nltk.tokenize.TweetTokenizer``
       (configured with ``preserve_case=False, reduce_len=True``).
    """

    # One or more consecutive ASCII digits.
    DIGITS_RE = re.compile(r'[0-9]+')

    def __init__(self):
        # Requires the NLTK 'punkt' resource to be available locally.
        self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        self.tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, reduce_len=True)
        self.url_extractor = URLExtract()

    def tokenize(self, text):
        """Return the flat list of tokens for ``text``.

        Args:
            text: The raw string to tokenize.

        Returns:
            A list of token strings covering all sentences of ``text``, in order.
        """
        text = self.DIGITS_RE.sub('0', text)
        tokens = []
        for sent in self.sent_detector.tokenize(text):
            urls = self.url_extractor.find_urls(sent, only_unique=True)
            # Replace the longest URLs first: if one URL is a prefix/substring
            # of another (e.g. "a.co" and "a.co/x"), substituting the short one
            # first would mangle the long one into "<href>/x".
            for url in sorted(urls, key=len, reverse=True):
                sent = sent.replace(url, "<href>")
            tokens.extend(self.tokenizer.tokenize(sent))

        return tokens

In [66]:
# Tokenize the whole corpus; `docs` is rebound to new Document objects that carry tokens.
tokenizer = TwitterNLTKTokenizer()
docs = [d.tokenized(tokenizer) for d in docs]

In [67]:
# Show the first five tokenized documents (label, raw text and token list).
for d in docs[:5]:
    print(d)

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...	['go', 'until', 'jurong', 'point', ',', 'crazy', '..', 'available', 'only', 'in', 'bugis', 'n', 'great', 'world', 'la', 'e', 'buffet', '...', 'cine', 'there', 'got', 'amore', 'wat', '...']

ham	Ok lar... Joking wif u oni...	['ok', 'lar', '...', 'joking', 'wif', 'u', 'oni', '...']

spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's	['free', 'entry', 'in', '0', 'a', 'wkly', 'comp', 'to', 'win', 'fa', 'cup', 'final', 'tkts', '0st', 'may', '0', '.', 'text', 'fa', 'to', '0', 'to', 'receive', 'entry', 'question', '(', 'std', 'txt', 'rate', ')', 't', '&', "c's", 'apply', '0over0', "'", 's']

ham	U dun say so early hor... U c already then say...	['u', 'dun', 'say', 'so', 'early', 'hor', '...', 'u', 'c', 'already', 'then', 'say', '...']

ham	Nah I don't think he goes to us

In [105]:
from sklearn.feature_extraction.text import TfidfVectorizer

class TfIDFFeatureExtractor:
    """TF-IDF features computed over the tokens already stored on documents.

    The wrapped ``TfidfVectorizer`` is fitted on ``docs`` at construction time.
    Its own preprocessing and regex tokenization are bypassed: documents are
    passed through untouched and their ``tokens`` attribute is used as-is.
    """

    def __init__(self, docs):
        vectorizer_options = {
            "token_pattern": None,
            "tokenizer": self._tokenize_doc,
            "stop_words": 'english',
            "min_df": 0.001,
            "preprocessor": self._identity,
        }
        self._vectorizer = TfidfVectorizer(**vectorizer_options)
        self._vectorizer.fit(docs)

    @staticmethod
    def _identity(x):
        """Return the argument unchanged (no-op preprocessor)."""
        return x

    @staticmethod
    def _tokenize_doc(doc):
        """Return the tokens stored on ``doc``."""
        return doc.tokens

    @property
    def vector_length(self):
        """Number of entries in the fitted vocabulary."""
        vocabulary = self._vectorizer.vocabulary_
        return len(vocabulary)

    def vectorize_docs(self, docs):
        """Transform ``docs`` with the fitted vectorizer and return the result."""
        features = self._vectorizer.transform(docs)
        return features

In [108]:
def batcherize(samples, batch_size):
    """Group ``samples`` into consecutive lists of at most ``batch_size`` items.

    Every yielded batch has exactly ``batch_size`` items except possibly the
    last one, which holds the remainder. Nothing is yielded for empty input.
    A batch is emitted as soon as it is full, without waiting for the next
    sample to arrive.

    Args:
        samples: Any iterable of samples.
        batch_size: Positive number of samples per batch.

    Yields:
        Lists of samples, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1. Because this is a
            generator, the error surfaces when iteration starts.
    """
    # A size of 0 used to yield an empty batch followed by one batch holding
    # everything; negative sizes produced a single giant batch.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    batch = []
    for sample in samples:
        batch.append(sample)
        if len(batch) == batch_size:
            yield batch
            batch = []

    # Flush the final, partially filled batch.
    if batch:
        yield batch

In [128]:
from sklearn.metrics import f1_score

def evaluate(predicted_labels, gold_labels):
    """Compute the F1 score of the predictions, with "spam" as the positive class.

    Both label sequences are binarized (1 for "spam", 0 for anything else)
    before being handed to ``f1_score`` as ``(gold, predicted)``.
    """
    positive_label = "spam"

    def binarize(labels):
        encoded = []
        for label in labels:
            encoded.append(int(label == positive_label))
        return encoded

    y_true = binarize(gold_labels)
    y_pred = binarize(predicted_labels)
    return f1_score(y_true, y_pred)

In [138]:
import torch

class BinaryClassificationMLP(torch.nn.Module):
    """Multi-layer perceptron with a single sigmoid output unit.

    Args:
        input_size: Number of input features.
        hidden_sizes: Output width of each hidden ``Linear`` layer, in order.
            An empty list gives a model with only the output layer.
        activation: Zero-argument callable (typically a ``torch.nn.Module``
            class) producing the activation applied after every hidden layer.
    """

    def __init__(self, input_size: int, hidden_sizes: list, activation=torch.nn.LeakyReLU):
        super().__init__()

        self._activation = activation()
        # ModuleList (not a plain list) so the layers' parameters are
        # registered with the module and show up in parameters()/state_dict().
        self._hidden_layers = torch.nn.ModuleList()

        prev_size = input_size
        for size in hidden_sizes:
            # Previously the layer was created but never stored, so the model
            # had no hidden layers and the output layer's input width did not
            # match what forward() fed it.
            self._hidden_layers.append(torch.nn.Linear(prev_size, size))
            prev_size = size

        self._out_layer = torch.nn.Linear(prev_size, 1)
        self._out_sigmoid = torch.nn.Sigmoid()

    def forward(self, sample):
        """Run the network on ``sample`` and return the sigmoid of the output unit.

        ``sample`` must have ``input_size`` as its last dimension; the result
        has 1 as its last dimension.
        """
        for layer in self._hidden_layers:
            sample = self._activation(layer(sample))

        return self._out_sigmoid(self._out_layer(sample))

In [146]:
from sklearn.model_selection import StratifiedShuffleSplit

def train_dev_test_split(docs, test_ratio, dev_ratio, random_seed=100):
    """Split ``docs`` into stratified train, dev and test lists.

    The split is done in two stratified steps: first ``test_ratio + dev_ratio``
    of the documents are held out, then the held-out part is divided so that
    the test share is ``test_ratio / (dev_ratio + test_ratio)`` of it.

    Args:
        docs: Indexable sequence of documents exposing a ``label`` attribute.
        test_ratio: Fraction of ``docs`` that goes to the test set.
        dev_ratio: Fraction of ``docs`` that goes to the dev set.
        random_seed: Seed used for both shuffles.

    Returns:
        A ``(train_docs, dev_docs, test_docs)`` tuple of lists.
    """
    def select(items, indexes):
        return [items[idx] for idx in indexes]

    held_out_ratio = test_ratio + dev_ratio

    # test_size/random_state are passed by keyword: they are keyword-only in
    # current scikit-learn releases, where positional use raises TypeError.
    outer_split = StratifiedShuffleSplit(
        n_splits=1, test_size=held_out_ratio, random_state=random_seed)
    train, dev_test = next(outer_split.split(docs, list(docs_labels(docs))))

    train_docs = select(docs, train)
    dev_test_docs = select(docs, dev_test)

    # Indices produced here are relative to dev_test_docs, not to docs.
    inner_split = StratifiedShuffleSplit(
        n_splits=1, test_size=test_ratio / held_out_ratio, random_state=random_seed)
    dev, test = next(inner_split.split(dev_test_docs, list(docs_labels(dev_test_docs))))

    return train_docs, select(dev_test_docs, dev), select(dev_test_docs, test)

In [147]:
# Stratified split: 15% of the documents for test, 15% for dev, the rest for training.
train, dev, test = train_dev_test_split(docs, 0.15, 0.15)