In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from typing import *
import torch
from torch import nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.utils.data import Dataset
from itertools import zip_longest
import torch.nn.functional as F


In [2]:
class Counter2D(Counter):
    def __init__(self, l: list):
        super(Counter2D, self).__init__([x for xs in l for x in set(xs)])

class iMessageVocab(object):
    def __init__(self, text_data: List):
        assert type(text_data) is list, "Must pass in a list of texts"
        self.sents = self.__text_handler(text_data)
        self.sos_token = "<sos>"
        self.eos_token = "<eos>"
        self.pad_token = "<pad>"
        self.num_tokens = 3
        self.__create_vocab()

    def __text_handler(self, text_data: List): 
        #checks if text is tokenized or not, and tokenizes if not
        if type(text_data[0]) is str and len(text_data[0].split()[0]) > 1:
            return [x.split() for x in text_data]
        return text_data

    def __create_vocab(self):
        self.word2index = {
            self.pad_token: 0, 
            self.sos_token: 1,
            self.eos_token: 2
            }

        for sentence in self.sents:
            for word in sentence:
                if word not in self.word2index:
                    self.word2index[word] = self.num_tokens
                    self.num_tokens += 1
        self.index2word = {v:k for k,v in self.word2index.items()}

    def most_common(self, n: int) -> list:
        counts = Counter2D(self.sents)
        return counts.most_common(n)

    def __call__(self, text: Union[str, List[str]]) -> List[int]:
        return self.transform(text)

    def transform(self, text: Union[str, List[str]]) -> List[int]:
        if type(text) is str:
            text = text.split()
        return [self.word2index[self.sos_token]] + [self.word2index[word] for word in text] + [self.word2index[self.eos_token]]


class iMessage(Dataset):
    def __init__(self, path: str):
        self.path = path
        self.data = self.__load()
        self.vocab = self.create_vocab(self.data)
        self.data = self.get_sentence_pairs(self.data) 

    def __load(self) -> List[str]:
        df = pd.read_csv(self.path)
        df.text = df.text.astype(str)
        df.text = df.text.apply(lambda x: re.sub(r"\s{2,}", " ", x).strip())
        df = df[df.text.str.len() > 0].reset_index(drop=True)

        df["msglen"] = df.text.apply(lambda x: len(x.split()))
        df = df[df["msglen"] > 1].reset_index(drop=True)
        df['msgkey'] = (df['is_sent'] != df['is_sent'].shift(1)).astype(int).cumsum()
        return df.groupby('msgkey', sort=False)['text'].apply(' <message_end> '.join).tolist()

    def create_vocab(self, sents):
        return iMessageVocab(sents)

    def get_sentence_pairs(self, data: List):
        return list(zip(data[:-1], data[1:]))

    def __tokenizer(self, sent: str) -> List[int]:
        return self.vocab.transform(sent)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        x, y = self.data[idx]
        x, y = self.__tokenizer(x), self.__tokenizer(y)
        return x, y



def zero_pad(list2d: List[list], fill_with: int = 0) -> np.ndarray:
    return np.array(list(zip_longest(*list2d, fillvalue=fill_with)))
    #return np.array(list(zip_longest(*list2d, fillvalue=fill_with))).T


class iMessageLoader(object):
    def __init__(self, dataset, batch_size: int, shuffle: bool = True):
        self.ds = dataset
        self.batch_size = batch_size
        self.idxs = np.arange(len(self.ds))
        if shuffle:
            np.random.shuffle(self.idxs)
        self.batch_idxs = self.__get_batch_idxs()
    
    def __get_batch_idxs(self):
        rem = divmod(len(self.idxs), self.batch_size)[1]
        if rem > 0:
            self.idxs = self.idxs[:-rem]
        assert len(self.idxs) % self.batch_size == 0
        split_size = int(len(self.idxs) / self.batch_size)
        return np.array_split(self.idxs, split_size)

    def __len__(self):
        return len(self.batch_idxs)

    def var_handler(self, x, mask: bool, max_len: bool) -> Tuple:
        lengths = torch.Tensor([len(tokens) for tokens in x])
        if max_len:
            lengths = max(lengths)
        x = zero_pad(x)
        if mask:
            return torch.LongTensor(x), lengths, self.create_binary_mask(x)
        return torch.LongTensor(x), lengths

    def create_binary_mask(self, arr: np.ndarray) -> torch.Tensor:
        mask_arr = np.zeros(arr.shape)
        mask_idxs = np.argwhere(arr > 0)
        mask_arr[mask_idxs[:,0], mask_idxs[:,1]] = 1
        return torch.BoolTensor(mask_arr)

    def __getitem__(self, idx):
        idxs = self.batch_idxs[idx]
        X_batch, y_batch = [], []
        for idx in idxs:
            x, y = self.ds[idx]
            X_batch.append(x)
            y_batch.append(y)

        X_batch = self.var_handler(X_batch, mask=False, max_len=False)
        y_batch = self.var_handler(y_batch, mask=True, max_len=True)

        return X_batch, y_batch



In [3]:
PATH = "iMessage_sample.csv"
batch_size = 16

ds = iMessage(PATH)

In [4]:
loader = iMessageLoader(ds, batch_size, shuffle=True)

In [5]:
for (X, lengths), (y, max_len, mask) in loader:
    print(X.shape)
    packed = pack_padded_sequence(X, lengths, enforce_sorted=False)
    padded, _ = pad_packed_sequence(packed)
    print(padded.shape)
    break

torch.Size([49, 16])
torch.Size([49, 16])


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [7]:
class Encoder(nn.Module):
    """
    Encoder
    ----------------------
            PARAMS
    ----------------------
        1. 'embedding' -> embedding layer
    ---------------------
    Forward (__call__) Params
    ---------------------
        1. 'X' -> padded batch of input sequences; shape=(max_length, batch_size)
        2. 'input_lengths' -> list of sequence lengths for each sentence in batch; shape=(batch_size)
        3. 'hidden' -> hidden state; shape=(n_layers x n_directions, batch_size, hidden_size)
    """
    def __init__(self, hidden_size, embedding, n_layers=1):
        super(Encoder, self).__init__()
        self.hidden_size = hidden_size
        self.embedding = embedding
        self.n_layers = n_layers

        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=True)

    def forward(self, X, input_lengths, hidden=None):
        X = self.embedding(X)
        X = pack_padded_sequence(X, input_lengths, enforce_sorted=False) #pack to save computing power
        outputs, hidden = self.gru(X, hidden)
        outputs, _ = pad_packed_sequence(outputs) #outputs, lengths
        outputs = outputs[:, :, :self.hidden_size] + outputs[:, : ,self.hidden_size:] #sum bidirectional units
        return outputs, hidden


class Attention(nn.Module):
    """
    Attention Mechanism
    ------
    output is softmax normalized weights tensor of shape (batch_size, 1, max_length)

    """
    def __init__(self, method, hidden_size):
        super(Attention, self).__init__()
        assert method in ["dot", "general", "concat"], f"{method} must be either 'dot', 'general', or 'concat'"
        score_func_map = {
            "dot": self.dot_score, 
            "general": self.general_score, 
            "concat": self.concat_score
            }
        self.method = method
        self.scoring_func = score_func_map[method]

        self.hidden_size = hidden_size

        if self.method == 'general':
            self.attention = nn.Linear(self.hidden_size, hidden_size)

        elif self.method == 'concat':
            self.attention = nn.Linear(self.hidden_size * 2, hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(hidden_size))

    def dot_score(self, hidden, encoder_output):
        return torch.sum(hidden * encoder_output, dim=2)

    def general_score(self, hidden, encoder_output):
        weights = self.attention(encoder_output)
        return torch.sum(hidden * weights, dim=2)

    def concat_score(self, hidden, encoder_output):
        weights = self.attention(torch.cat((hidden.expand(encoder_output.size(0), -1, -1),
                                           encoder_output), 2)).tanh()
        return torch.sum(self.v * weights, dim=2)

    def forward(self, hidden, encoder_outputs):
        attn_weights = self.scoring_func(hidden, encoder_outputs).t() #transpose
        return F.softmax(attn_weights, dim=1).unsqueeze(1) #softmax to create probabilities and then add dim


class AttentionDecoder(nn.Module):
    """
    Decoder with Attention Mechanism
    ----------------------
            PARAMS
    ----------------------
        1. 'attention_method' -> ttention scoring method; either dot, general, or concat.
        2. 'embedding' -> embedding layer
    ---------------------
    Forward (__call__) Params
    ---------------------
        1. 'X_step' -> One timestep of input sequence; shape=(1, batch_size)
        2. 'prev_hidden' -> final hidden layer of GRU; shape=(n_layers x n_directions, batch_size, hidden_size)
        3. 'encoder_output' -> encoder output; shape=(max_length, batch_size, hidden_size)
    """
    def __init__(self, 
                 attention_method, 
                 embedding, 
                 hidden_size, 
                 output_size, 
                 n_layers=1):
        super(AttentionDecoder, self).__init__()
        self.attention_method = attention_method
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers

        self.embedding = embedding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

        self.attention = Attention(attention_method, hidden_size)

    def forward(self, X_step, prev_hidden, encoder_output):
        #NOTE: runs one word (timestep) at a time
        embedded = self.embedding(X_step)
        output, hidden = self.gru(embedded, prev_hidden)
        attention_w = self.attention(output, encoder_output) #attention weights
        #attention weights * encoder outputs = weighted sum context vector
        context = attention_w.bmm(encoder_output.transpose(0, 1)) #batch matrix-matrix product

        concat_input = torch.cat((output.squeeze(0), context.squeeze(1)), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output) #predict next word
        output = F.softmax(output, dim=1) 
        return output, hidden



In [8]:
def mask_nll_loss(decoder_output, target, mask):
    """
    Average negative log likelihood of the elements that correspond to a 1 in the mask tensor
    -----
    PARAMS
    -----

        1. 'decoder_output' -> decoder output tensor
        2. 'target' -> the target tensor
        3. 'mask' -> binary mask tensor describing the padding of the target tensor
    """
    n_total = mask.sum()
    crossEntropy = -torch.log(torch.gather(decoder_output, 1, target.view(-1, 1)).squeeze(1))
    loss = crossEntropy.masked_select(mask).mean()
    loss = loss.to(device)
    return loss, n_total.item()


In [9]:
attention_method = 'dot'
count_of_tokens = ds.vocab.num_tokens
hidden_size = 512
encoder_n_layers = 2
dropout = 0.1

num_epochs = 5
clip = 50.0
teacher_forcing_ratio = 1.0
learning_rate = 0.0001
decoder_learning_ratio = 5.0


embedding_layer = nn.Embedding(count_of_tokens, hidden_size)

encoder = Encoder(hidden_size=hidden_size, 
                  embedding=embedding_layer, 
                  n_layers=2)

decoder = AttentionDecoder(attention_method=attention_method, 
                           embedding=embedding_layer, 
                           hidden_size=hidden_size, 
                           output_size = count_of_tokens,
                           n_layers=2)


encoder = encoder.to(device)
decoder = decoder.to(device)


encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=learning_rate * decoder_learning_ratio)

#configure cuda to call
for state in encoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()

for state in decoder_optimizer.state.values():
    for k, v in state.items():
        if isinstance(v, torch.Tensor):
            state[k] = v.cuda()




In [10]:
for epoch in range(1, num_epochs+1):
    for (X, lengths), (y, max_len, mask) in loader:
        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()
        X, y = X.to(device), y.to(device)
        mask = mask.to(device)
        lengths = lengths.to("cpu") #lengths should always be on cpu
        loss, n_totals = 0, 0

        cumulative_losses = []

        encoder_outputs, encoder_hidden = encoder(X, lengths)

        decoder_input = X[0].view(1,-1).to(device)
        #initial decoder hidden state is encoder final hidden state
        decoder_hidden = encoder_hidden[:decoder.n_layers]

        #if random.random() < teacher_forcing_ratio:
        for idx in range(int(max_len)):
            decoder_output, decoder_hidden = decoder(decoder_input, decoder_hidden, encoder_outputs)
            # Teacher forcing: next input is current target
            decoder_input = y[idx].view(1, -1)
            # Calculate and accumulate loss
            mask_loss, n_total = mask_nll_loss(decoder_output, y[idx], mask[idx])
            loss += mask_loss
            cumulative_losses.append(mask_loss.item() * n_total)
            n_totals += n_total

        loss.backward()

        #in place
        nn.utils.clip_grad_norm_(encoder.parameters(), clip)
        nn.utils.clip_grad_norm_(decoder.parameters(), clip)

        # Adjust model weights
        encoder_optimizer.step()
        decoder_optimizer.step()

    print(f"EPOCH {epoch}: {sum(cumulative_losses) / n_totals}")


EPOCH 1: 4.756896694499657
EPOCH 2: 4.42965586975749
EPOCH 3: 4.161765954325365
EPOCH 4: 3.9816492099079723
EPOCH 5: 3.7919977152284576
