In [1]:
from pathlib import Path
from typing import *
import torch
import torch.optim as optim
import numpy as np
import pandas as pd
from functools import partial
from overrides import overrides

from allennlp.data import Instance
from allennlp.data.token_indexers import TokenIndexer
from allennlp.data.tokenizers import Token
from allennlp.nn import util as nn_util

In [2]:
class Config(dict):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        for k, v in kwargs.items():
            setattr(self, k, v)
    
    def set(self, key, val):
        self[key] = val
        setattr(self, key, val)
        
config = Config(
    testing=True,
    seed=1,
    batch_size=64,
    lr=3e-4,
    epochs=2,
    hidden_sz=64,
    max_seq_len=100, # necessary to limit memory usage
    max_vocab_size=100000,
)

In [3]:
from allennlp.common.checks import ConfigurationError
import uuid


In [4]:
USE_GPU = torch.cuda.is_available()
DATA_ROOT = Path("/Users/wenjunshi") / "SpamDataset"

def data_preprocessing (fname:str):
    DATA_ROOT = Path("/Users/wenjunshi/SpamDataset") / fname
    data = pd.read_csv(DATA_ROOT, encoding='latin-1')
    index = data.shape[0]
    list = [0]*data.shape[0]
    ##add spam and ham column
    data["spam"] = list
    data["ham"] = list
    
#for label in data["Label"]:
  #  if label == "spam":
  #      print(label.index)
    count = 0


## generate random id column
    id =[]
    for i in range (0,index):
        x = uuid.uuid4()
        id.append(x)
    data["id"] = id
##mark spam and ham as 1 in its own column
    while (count<index):
        if data["Label"][count] == "spam":
            data.at[count,"spam"] = 1
        if data["Label"][count] == "ham":
            data.at[count,"ham"] =1
        count = count + 1
        
    return data
data_preprocessing("train.csv").to_csv("/Users/wenjunshi/SpamDataset/revised_train.csv")
data_preprocessing("test.csv").to_csv("/Users/wenjunshi/SpamDataset/revised_test.csv")


In [5]:
#LOAD DATA
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader
from allennlp.data.vocabulary import Vocabulary
from allennlp.data.dataset_readers import DatasetReader

In [6]:
label_cols = ["ham","spam"]

In [7]:
from allennlp.data.fields import TextField, MetadataField, ArrayField

class SpamDatasetReader(DatasetReader):
    def __init__(self, tokenizer: Callable[[str], List[str]]=lambda x: x.split(),
                 token_indexers: Dict[str, TokenIndexer] = None,
                 max_seq_len: Optional[int]=config.max_seq_len) -> None:
        super().__init__(lazy=False)
        self.tokenizer = tokenizer
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
        self.max_seq_len = max_seq_len

    @overrides
    def text_to_instance(self, tokens: List[Token],id: str=None,
                         labels: np.ndarray=None) -> Instance:
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"tokens": sentence_field}
        
        id_field = MetadataField(id)
        fields["id"] = id_field
        
        if labels is None:
            labels = np.zeros(len(label_cols))
        label_field = ArrayField(array=labels)
        fields["label"] = label_field

        return Instance(fields)
    
    @overrides
    def _read(self, file_path: str) -> Iterator[Instance]:
        df = pd.read_csv(file_path)
        if config.testing: df = df.head(1000)
        for i, row in df.iterrows():
            yield self.text_to_instance(
                [Token(x) for x in self.tokenizer(row["Text"])],row["id"],
                row[label_cols].values,
            )


In [8]:
from allennlp.data.tokenizers.word_splitter import SpacyWordSplitter
from allennlp.data.token_indexers.elmo_indexer import ELMoCharacterMapper, ELMoTokenCharactersIndexer

# the token indexer is responsible for mapping tokens to integers
token_indexer = ELMoTokenCharactersIndexer()

def tokenizer(x: str):
    return [w.text for w in
            SpacyWordSplitter(language='en_core_web_sm', 
                              pos_tags=False).split_words(x)[:config.max_seq_len]]
reader = SpamDatasetReader(
    tokenizer=tokenizer,
    token_indexers={"tokens": token_indexer}
)

In [9]:
train_ds, test_ds = (reader.read(DATA_ROOT / fname) for fname in ["revised_train.csv", "revised_test.csv"])
val_ds = None

1000it [00:02, 493.46it/s]
517it [00:00, 951.64it/s]


In [10]:
vars(train_ds[0].fields["tokens"])

{'tokens': [Free,
  entry,
  in,
  2,
  a,
  wkly,
  comp,
  to,
  win,
  FA,
  Cup,
  final,
  tkts,
  21st,
  May,
  2005,
  .,
  Text,
  FA,
  to,
  87121,
  to,
  receive,
  entry,
  question(std,
  txt,
  rate)T&C,
  's,
  apply,
  08452810075over18,
  's],
 '_token_indexers': {'tokens': <allennlp.data.token_indexers.elmo_indexer.ELMoTokenCharactersIndexer at 0x128c76cc0>},
 '_indexed_tokens': None,
 '_indexer_name_to_indexed_token': None,
 '_token_index_to_indexer_name': None}

In [11]:
#prepare vocabulary
vocab = Vocabulary()
from allennlp.data.iterators import BucketIterator
iterator = BucketIterator(batch_size=config.batch_size, 
                          sorting_keys=[("tokens", "num_tokens")],
                         )

In [12]:
iterator.index_with(vocab)

In [13]:
#Read sample
batch = next(iter(iterator(train_ds)))
batch

{'tokens': {'tokens': tensor([[[259,  86,  83,  ..., 261, 261, 261],
           [259,  34, 260,  ..., 261, 261, 261],
           [259,  90, 112,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  85, 105,  ..., 261, 261, 261],
           [259, 122, 112,  ..., 261, 261, 261],
           [259,  45, 260,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  90, 112,  ..., 261, 261, 261],
           [259, 120, 106,  ..., 261, 261, 261],
           [259, 115, 102,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          ...,
  
          [[259,  81

In [14]:
batch["tokens"]["tokens"]

tensor([[[259,  86,  83,  ..., 261, 261, 261],
         [259,  34, 260,  ..., 261, 261, 261],
         [259,  90, 112,  ..., 261, 261, 261],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259,  85, 105,  ..., 261, 261, 261],
         [259, 122, 112,  ..., 261, 261, 261],
         [259,  45, 260,  ..., 261, 261, 261],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]],

        [[259,  90, 112,  ..., 261, 261, 261],
         [259, 120, 106,  ..., 261, 261, 261],
         [259, 115, 102,  ..., 261, 261, 261],
         ...,
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0],
         [  0,   0,   0,  ...,   0,   0,   0]],

        ...,

        [[259,  81,  83,  ..., 261, 261, 261],
         [259,  34, 260,  ..., 261, 261, 261]

In [15]:
batch["tokens"]["tokens"].shape

torch.Size([64, 28, 50])

In [16]:
#Prepare Model
import torch
import torch.nn as nn
import torch.optim as optim
from allennlp.modules.seq2vec_encoders import Seq2VecEncoder, PytorchSeq2VecWrapper
from allennlp.nn.util import get_text_field_mask
from allennlp.models import Model
from allennlp.modules.text_field_embedders import TextFieldEmbedder

In [17]:
class BaselineModel(Model):
    def __init__(self, word_embeddings: TextFieldEmbedder,
                 encoder: Seq2VecEncoder,
                 out_sz: int=len(label_cols)):
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder
        self.projection = nn.Linear(self.encoder.get_output_dim(), out_sz)
        self.loss = nn.BCEWithLogitsLoss()
        
    def forward(self, tokens: Dict[str, torch.Tensor],
                id: Any, label: torch.Tensor) -> torch.Tensor:
        mask = get_text_field_mask(tokens)
        embeddings = self.word_embeddings(tokens)
        state = self.encoder(embeddings, mask)
        class_logits = self.projection(state)
        
        output = {"class_logits": class_logits}
        output["loss"] = self.loss(class_logits, label)

        return output

In [18]:
### Prepare Embedding 

In [19]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import ElmoTokenEmbedder

options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json'
weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

In [20]:
from allennlp.modules.seq2vec_encoders import PytorchSeq2VecWrapper
encoder: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(word_embeddings.get_output_dim(), config.hidden_sz, bidirectional=True, batch_first=True))

In [21]:
model = BaselineModel(
    word_embeddings, 
    encoder, 
)

In [22]:
if USE_GPU: model.cuda()
else: model

### Basic sanity check


In [23]:
batch = nn_util.move_to_device(batch, 0 if USE_GPU else -1)
batch

{'tokens': {'tokens': tensor([[[259,  86,  83,  ..., 261, 261, 261],
           [259,  34, 260,  ..., 261, 261, 261],
           [259,  90, 112,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  85, 105,  ..., 261, 261, 261],
           [259, 122, 112,  ..., 261, 261, 261],
           [259,  45, 260,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  90, 112,  ..., 261, 261, 261],
           [259, 120, 106,  ..., 261, 261, 261],
           [259, 115, 102,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          ...,
  
          [[259,  81

In [24]:
tokens = batch["tokens"]
labels = batch

In [25]:
batch

{'tokens': {'tokens': tensor([[[259,  86,  83,  ..., 261, 261, 261],
           [259,  34, 260,  ..., 261, 261, 261],
           [259,  90, 112,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  85, 105,  ..., 261, 261, 261],
           [259, 122, 112,  ..., 261, 261, 261],
           [259,  45, 260,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  90, 112,  ..., 261, 261, 261],
           [259, 120, 106,  ..., 261, 261, 261],
           [259, 115, 102,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          ...,
  
          [[259,  81

In [26]:
mask = get_text_field_mask(tokens)
mask

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 0, 0],
        [1, 1, 1,  ..., 1, 0, 0]])

In [27]:
embeddings = model.word_embeddings(tokens)
state = model.encoder(embeddings, mask)
class_logits = model.projection(state)
class_logits

tensor([[-0.1578, -0.1401],
        [ 0.1633, -0.2026],
        [-0.4005,  0.0698],
        [-0.0653, -0.0601],
        [-0.1972, -0.0118],
        [-0.0385,  0.0442],
        [-0.1010, -0.1426],
        [-0.2317, -0.0913],
        [-0.1465, -0.2030],
        [-0.0957, -0.1868],
        [-0.2466, -0.2116],
        [-0.1672, -0.1248],
        [-0.0462, -0.1022],
        [-0.1406, -0.0813],
        [-0.0394, -0.0513],
        [-0.2200, -0.0987],
        [-0.1257, -0.1661],
        [-0.0559, -0.0156],
        [-0.2843, -0.0496],
        [-0.3019,  0.0464],
        [-0.1359, -0.1331],
        [-0.1846,  0.0706],
        [-0.3091, -0.1731],
        [-0.1807, -0.0988],
        [-0.2577, -0.1032],
        [-0.1106, -0.1412],
        [-0.2444, -0.0532],
        [-0.1391, -0.1714],
        [-0.0063, -0.0187],
        [-0.1968,  0.0642],
        [-0.2396,  0.0440],
        [-0.1942, -0.1614],
        [-0.1106, -0.0620],
        [-0.0334, -0.1449],
        [-0.2336,  0.0177],
        [-0.2384, -0

In [28]:
model(**batch)

{'class_logits': tensor([[-0.1086, -0.1314],
         [-0.1362, -0.1357],
         [-0.4590,  0.0911],
         [-0.0605, -0.0981],
         [-0.2530, -0.1254],
         [-0.0772, -0.0104],
         [-0.1212, -0.0629],
         [-0.1704, -0.0799],
         [-0.2248, -0.1453],
         [-0.0470, -0.1885],
         [-0.1447, -0.1643],
         [-0.2085, -0.1088],
         [-0.0659, -0.1236],
         [-0.2722, -0.0979],
         [-0.0983, -0.0712],
         [-0.1765, -0.1423],
         [-0.1426, -0.1086],
         [-0.2360,  0.0183],
         [-0.0881,  0.0486],
         [-0.1567,  0.0316],
         [-0.0515, -0.2747],
         [-0.0654,  0.0452],
         [-0.2805, -0.1574],
         [-0.2308,  0.0117],
         [-0.3458, -0.0769],
         [-0.0514, -0.1826],
         [-0.3650, -0.0007],
         [-0.1293, -0.2126],
         [-0.0455, -0.0085],
         [-0.2042, -0.0619],
         [-0.2395,  0.0231],
         [-0.1094,  0.0896],
         [-0.0955, -0.1364],
         [-0.0752, -0.1951]

In [29]:
labels

{'tokens': {'tokens': tensor([[[259,  86,  83,  ..., 261, 261, 261],
           [259,  34, 260,  ..., 261, 261, 261],
           [259,  90, 112,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  85, 105,  ..., 261, 261, 261],
           [259, 122, 112,  ..., 261, 261, 261],
           [259,  45, 260,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          [[259,  90, 112,  ..., 261, 261, 261],
           [259, 120, 106,  ..., 261, 261, 261],
           [259, 115, 102,  ..., 261, 261, 261],
           ...,
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0],
           [  0,   0,   0,  ...,   0,   0,   0]],
  
          ...,
  
          [[259,  81

In [30]:
loss = model(**batch)["loss"]


In [31]:
loss

tensor(0.6906, grad_fn=<BinaryCrossEntropyWithLogitsBackward>)

In [32]:
loss.backward()

In [33]:
[x.grad for x in list(model.encoder.parameters())]

[tensor([[-2.2899e-07,  1.1963e-05,  6.3401e-06,  ..., -4.9502e-06,
           8.5761e-07,  1.1151e-05],
         [-5.4373e-05, -1.1778e-04,  3.2322e-05,  ..., -3.2892e-05,
          -6.9842e-06,  1.9351e-05],
         [-1.5540e-05,  1.1828e-05,  7.0665e-06,  ..., -1.8890e-05,
          -7.2158e-07,  3.4845e-05],
         ...,
         [ 1.9443e-04, -1.5309e-06, -4.4923e-05,  ..., -2.6027e-05,
          -1.4770e-04, -2.8207e-06],
         [ 1.6439e-05, -3.0601e-06, -2.4317e-06,  ...,  2.9516e-06,
          -2.5714e-05, -2.0522e-06],
         [-1.5911e-04, -1.8073e-04,  1.6343e-04,  ...,  9.0564e-05,
           6.2489e-05, -2.1502e-04]]),
 tensor([[ 5.6621e-06, -3.0676e-07,  1.1542e-06,  ..., -8.6580e-07,
          -1.3204e-06,  3.3684e-06],
         [ 1.5932e-06,  2.5790e-05,  3.5700e-05,  ..., -3.2820e-06,
          -3.1531e-05, -1.8656e-05],
         [ 1.1627e-06,  4.4114e-07,  1.3999e-06,  ...,  6.8209e-06,
           7.5733e-06,  1.3866e-05],
         ...,
         [ 2.3546e-06,  2

###Train

In [34]:
optimizer = optim.Adam(model.parameters(), lr=config.lr)
from allennlp.training.trainer import Trainer

trainer = Trainer(
    model=model,
    optimizer=optimizer,
    iterator=iterator,
    train_dataset=train_ds,
    cuda_device=0 if USE_GPU else -1,
    num_epochs=config.epochs,
)


In [35]:
metrics = trainer.train()

loss: 0.6783 ||: 100%|██████████| 16/16 [00:37<00:00,  2.32s/it]
loss: 0.6232 ||: 100%|██████████| 16/16 [00:37<00:00,  2.32s/it]


### Generating Predictions

In [36]:
from allennlp.data.iterators import DataIterator
from tqdm import tqdm
from scipy.special import expit # the sigmoid function

def tonp(tsr): return tsr.detach().cpu().numpy()

class Predictor:
    def __init__(self, model: Model, iterator: DataIterator,
                 cuda_device: int=-1) -> None:
        self.model = model
        self.iterator = iterator
        self.cuda_device = cuda_device
        
    def _extract_data(self, batch) -> np.ndarray:
        out_dict = self.model(**batch)
        return expit(tonp(out_dict["class_logits"]))
    
    def predict(self, ds: Iterable[Instance]) -> np.ndarray:
        pred_generator = self.iterator(ds, num_epochs=1, shuffle=False)
        self.model.eval()
        pred_generator_tqdm = tqdm(pred_generator,
                                   total=self.iterator.get_num_batches(ds))
        preds = []
        with torch.no_grad():
            for batch in pred_generator_tqdm:
                batch = nn_util.move_to_device(batch, self.cuda_device)
                preds.append(self._extract_data(batch))
        return np.concatenate(preds, axis=0)

In [37]:
from allennlp.data.iterators import BasicIterator
# iterate over the dataset without changing its order
seq_iterator = BasicIterator(batch_size=64)
seq_iterator.index_with(vocab)

In [38]:
predictor = Predictor(model, seq_iterator, cuda_device=0 if USE_GPU else -1)
train_preds = predictor.predict(train_ds) 
test_preds = predictor.predict(test_ds) 

100%|██████████| 16/16 [01:04<00:00,  4.04s/it]
100%|██████████| 9/9 [00:36<00:00,  4.11s/it]
