### Import Labeles

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# imports
import torch
from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np
from pathlib import Path
import _pickle as pickle
from collections import defaultdict
import re

### Check the local directory

In [2]:
PATH_LM=Path("/home/paperspace/data/wikitext/wikitext-2")
list(PATH_LM.iterdir())

[PosixPath('/home/paperspace/data/wikitext/wikitext-2/wiki.train.tokens'),
 PosixPath('/home/paperspace/data/wikitext/wikitext-2/mode117.pth'),
 PosixPath('/home/paperspace/data/wikitext/wikitext-2/wiki.valid.tokens'),
 PosixPath('/home/paperspace/data/wikitext/wikitext-2/sample.txt'),
 PosixPath('/home/paperspace/data/wikitext/wikitext-2/wiki.test.tokens')]

### Load pretrained Word to ID mapping

In [12]:
# load pretrained dictionary mapping
with open('dict17.pkl', 'rb') as f:
    pretrn_word2idx, pretrn_idx2word = pickle.load(f)
print(list(pretrn_word2idx.items())[:10])
print(list(pretrn_idx2word)[:10])

[('UNK', 0), ('<eos>', 1), ('=', 2), ('Valkyria', 3), ('Chronicles', 4), ('III', 5), ('Senjō', 6), ('no', 7), ('3', 8), (':', 9)]
['UNK', '<eos>', '=', 'Valkyria', 'Chronicles', 'III', 'Senjō', 'no', '3', ':']


## New dataset

In [4]:
from pathlib import Path
PATH = Path("/home/paperspace/data/rotten_imdb/")
list(PATH.iterdir())

[PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.100d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/quote.tok.gt9.5000'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.50d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/plot.tok.gt9.5000'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.200d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/rotten_imdb.tar.gz'),
 PosixPath('/home/paperspace/data/rotten_imdb/subjdata.README.1.0'),
 PosixPath('/home/paperspace/data/rotten_imdb/glove.6B.300d.txt'),
 PosixPath('/home/paperspace/data/rotten_imdb/rotten_imdb.tar')]

In [5]:
def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Every dataset is lower cased except for TREC
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)     
    string = re.sub(r"\'s", " \'s", string) 
    string = re.sub(r"\'ve", " \'ve", string) 
    string = re.sub(r"n\'t", " n\'t", string) 
    string = re.sub(r"\'re", " \'re", string) 
    string = re.sub(r"\'d", " \'d", string) 
    string = re.sub(r"\'ll", " \'ll", string) 
    string = re.sub(r",", " , ", string) 
    string = re.sub(r"!", " ! ", string) 
    string = re.sub(r"\(", " \( ", string) 
    string = re.sub(r"\)", " \) ", string) 
    string = re.sub(r"\?", " \? ", string) 
    string = re.sub(r"\s{2,}", " ", string)    
    return string.strip().lower()


def read_file(path):
    """ Read file returns a shuttled list.
    """
    with open(path, encoding = "ISO-8859-1") as f:
        content = np.array(f.readlines())
    return content

def get_vocab(list_of_content):
    """Computes Dict of counts of words.
    
    Computes the number of times a word is on a document.
    """
    vocab = defaultdict(float)
    for content in list_of_content:
        for line in content:
            line = clean_str(line.strip())
            words = set(line.split())
            for word in words:
                vocab[word] += 1
    return vocab


# ======================================================
# Data Prep XY
# ======================================================

def make_XY():
    """
    Load the subjective / objective dataset
    """
    sub_content = read_file(PATH/"quote.tok.gt9.5000")
    obj_content = read_file(PATH/"plot.tok.gt9.5000")
    sub_content = np.array([clean_str(line.strip()) for line in sub_content])
    obj_content = np.array([clean_str(line.strip()) for line in obj_content])
    sub_y = np.zeros(len(sub_content))
    obj_y = np.ones(len(obj_content))
    X = np.append(sub_content, obj_content)
    y = np.append(sub_y, obj_y)
    return X,y


def make_train_val(X,y):
    """
    Train and test split
    """
    X_tr, X_vl, y_tr, y_vl = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_tr, X_vl, y_tr, y_vl


def encode_sentence(s, word2idx, N=35):
    """
    Takes in a long text and encodes it with dictionary
    then makes vectors of sized N.
    """
    enc = np.zeros(N, dtype=np.int32)
    enc1 = np.array([word2idx.get(w, 0) for w in s.split() + ["<eos>"]])
    l = min(N, len(enc1))
    enc[:l] = enc1[:l]
    return enc

def encode_sent_array(list_of_sentences, word2idx, N=35):
    return np.vstack([encode_sentence(sent, word2idx, N) for sent in list_of_sentences])

# ======================================================
# LM Model for reference
# ======================================================

class RNNModel(nn.Module):
    """
    Container module with an encoder, a recurrent module, and a decoder.
    
    ntoken: number of tokens
    ninp: number of inputs
    nhid: number of hidden units
    nlayers: number of layers
    dropout: % dropout
    """

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        self.rnn = nn.GRU(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)
        self.init_weights()
        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        init_range = 0.1
        self.encoder.weight.data.uniform_(-init_range, init_range)
        self.decoder.bias.data.fill_(0.0)
        self.decoder.weight.data.uniform_(-init_range, init_range)

    def forward(self, input, hidden):
        """
        input: current input
        hidden: hidden state from the previous step
        """
        # pulls the embeddings for the input submitted
        emb = self.drop(self.encoder(input))
        
        # then applies the RNN against the embedding layer
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        """
        Initialize the hidden weights
        """
        weight = next(self.parameters()).data
        return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())

# ======================================================
# Our RNN Classifier that imports the RNN model
# ======================================================

def load_model(m, p):
    m.load_state_dict(torch.load(p))

class NetLM(nn.Module):
    def __init__(self, model_path, ntokens, nemb, nhid, nlayers, bsz, bidir=False):
        super(NetLM, self).__init__()
        
        # if bidirectional is applied, (forward and backward)
        # otherwise its a single forward pass
        self.ndir = 2 if bidir else 1
        self.nlayers = nlayers  
        self.nemb = nemb
        self.bsz = bsz
        self.LM = RNNModel(ntokens, nemb, nhid, nlayers).cuda()
        load_model(self.LM, model_path)
            
        self.nhid = nhid
        
        # freeze the RNN        
        for param in self.LM.parameters():
            param.requires_grad = False
        
        self.linear1 = nn.Linear(nhid*3, 100) # binary classification
        self.linear2 = nn.Linear(100, 1)
        self.drop = nn.Dropout(0.2)
        self.bn = nn.BatchNorm1d(100, eps=1e-05, momentum=0.1, affine=True)
   
    def forward(self, input, hidden):
        bz = input.shape[1]
        emb = self.LM.drop(self.LM.encoder(input))
        output, hidden = self.LM.rnn(emb, hidden)
        # create concat pooling 
        out_avg = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bz,-1)
        out_max = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bz,-1)
        out = torch.cat([output[-1], out_avg, out_max], dim=1)
        
        out = self.drop(F.relu(self.linear1(out)))
        out = self.bn(out)
        return F.sigmoid(self.linear2(out)), hidden

    def init_hidden(self, batch_size):
        # variable of size [num_layers*num_directions, b_sz, hidden_sz]
        return Variable(torch.zeros(self.ndir * self.nlayers, batch_size, self.nhid)).cuda()
    
# ====================================================================
# Training functions
# ====================================================================

def train_epocs(model, x_train, y_train, x_test, y_test, epochs=10, lr=0.01):
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = torch.optim.Adam(parameters, lr=lr)
    
    # assume the data input size is the following
    # [bptt, batch_size, embedding size]
    # will need to rearrange some of the dimensions

    b_sz = x_train.shape[0]
    print(b_sz)
    model.train()
    hidden = model.init_hidden(b_sz)
    for i in range(epochs):
        # wrap the data in variables
        x = Variable(torch.from_numpy(x_train)).long().cuda()
        x = x.permute(1,0)

        y = Variable(torch.from_numpy(y_train)).float().cuda().unsqueeze(1)
        
        # wrap hidden states for the model
        hidden = Variable(hidden.data)
        
        # pass our phrase through the model
        # get the updated hidden state
        y_hat, hidden = model(x, hidden)
        
        # calculate the loss
        loss = F.binary_cross_entropy_with_logits(y_hat, y)
        loss.backward()
        
        optimizer.zero_grad()
        
        optimizer.step()
        print(loss.data[0])
    test_metrics(model, x_test, y_test, hidden)


def test_metrics(m, x_test, y_test, hidden):
    m.eval()
    b_sz = x_test.shape[0]
    hidden = model.init_hidden(b_sz)
    x = Variable(torch.from_numpy(x_test)).long().cuda()
    x = x.permute(1,0)
    y = Variable(torch.from_numpy(y_test)).float().cuda().unsqueeze(1)
    y_hat, hidden = m(x, hidden)
    print(type(y_hat.data), type(y.data))
    loss = F.binary_cross_entropy_with_logits(y_hat, y)
    y_pred = y_hat > 0
    correct = (y_pred.float() == y).float().sum()
    accuracy = correct/y_pred.shape[0]
    print("test loss %.3f and accuracy %.3f" % (loss.data[0], accuracy.data[0]))

In [13]:
# load your dataset
X,y = make_XY()

# train test split
X_tr, X_vl, y_tr, y_vl = make_train_val(X,y)

# encode into numeric
X_tr_enc = encode_sent_array(X_tr, pretrn_word2idx)
X_vl_enc = encode_sent_array(X_vl, pretrn_word2idx)

In [15]:
# sample encoding / decoding
print(X_tr[0])
print(X_tr_enc[0])
print(' '.join([pretrn_idx2word[idx] for idx in X_tr_enc[0]]))

will god let her fall or give her a new path \?
[ 301 5011 7292  362 6357  311 2194  362   28  579 9429    0    1    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
will god let her fall or give her a new path UNK <eos> UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK UNK


### Setup RNN Classifier

In [17]:
model_params = dict(nemb = 300,
                    nhid = 300,
                    nlayers = 2,
                    ntokens = 33279,
                    bsz = X_tr_enc.shape[0]
                   )

model = NetLM(PATH_LM/'mode117.pth', **model_params).cuda()
parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = torch.optim.Adam(parameters, lr=0.01)

In [18]:
train_epocs(model, X_tr_enc, y_tr, X_vl_enc, y_vl,  epochs=10, lr=0.001)

8000
0.7182508707046509
0.7167633175849915
0.7171564698219299
0.7160307168960571
0.7166233658790588
0.7169387340545654
0.7165149450302124
0.7176434993743896
0.7173396944999695
0.7177921533584595
<class 'torch.cuda.FloatTensor'> <class 'torch.cuda.FloatTensor'>
test loss 0.724 and accuracy 0.494


In [19]:
train_epocs(model, X_tr_enc, y_tr, X_vl_enc, y_vl,  epochs=10, lr=0.01)

8000
0.7181419134140015
0.7173454761505127
0.7169738411903381
0.7168292999267578
0.7168923616409302
0.7172890901565552
0.7175048589706421
0.7175806164741516
0.7178093194961548
0.7179996967315674
<class 'torch.cuda.FloatTensor'> <class 'torch.cuda.FloatTensor'>
test loss 0.725 and accuracy 0.494


## Fast AI notes

```python
class LanguageModelData
    def get_model(self, opt_fn, emb_sz, n_hid, n_layers, **kwargs):
            """ Method returns a RNN_Learner object, that wraps an instance of the RNN_Encoder module.

            Args:
                opt_fn (Optimizer): the torch optimizer function to use
                emb_sz (int): embedding size
                n_hid (int): number of hidden inputs
                n_layers (int): number of hidden layers
                kwargs: other arguments

            Returns:
                An instance of the RNN_Learner class.

            """
            m = get_language_model(self.nt, emb_sz, n_hid, n_layers, self.pad_idx, **kwargs)
            model = SingleModel(to_gpu(m))
            return RNN_Learner(self, model, opt_fn=opt_fn)
    
class RNN_Learner(Learner):
    def __init__(self, data, models, **kwargs):
        super().__init__(data, models, **kwargs)

    def _get_crit(self, data): return F.cross_entropy

    def save_encoder(self, name): save_model(self.model[0], self.get_model_path(name))

    def load_encoder(self, name): load_model(self.model[0], self.get_model_path(name))
    

def get_language_model(n_tok, emb_sz, nhid, nlayers, pad_token,
                 dropout=0.4, dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5, tie_weights=True):
    """Returns a SequentialRNN model.

    A RNN_Encoder layer is instantiated using the parameters provided.

    This is followed by the creation of a LinearDecoder layer.

    Also by default (i.e. tie_weights = True), the embedding matrix used in the RNN_Encoder
    is used to  instantiate the weights for the LinearDecoder layer.

    The SequentialRNN layer is the native torch's Sequential wrapper that puts the RNN_Encoder and
    LinearDecoder layers sequentially in the model.

    Args:
        n_tok (int): number of unique vocabulary words (or tokens) in the source dataset
        emb_sz (int): the embedding size to use to encode each token
        nhid (int): number of hidden activation per LSTM layer
        nlayers (int): number of LSTM layers to use in the architecture
        pad_token (int): the int value used for padding text.
        dropouth (float): dropout to apply to the activations going from one LSTM layer to another
        dropouti (float): dropout to apply to the input layer.
        dropoute (float): dropout to apply to the embedding layer.
        wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights.
        tie_weights (bool): decide if the weights of the embedding matrix in the RNN encoder should be tied to the
            weights of the LinearDecoder layer.
    Returns:
        A SequentialRNN model
    """

    rnn_enc = RNN_Encoder(n_tok, emb_sz, nhid=nhid, nlayers=nlayers, pad_token=pad_token,
                 dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    enc = rnn_enc.encoder if tie_weights else None
    return SequentialRNN(rnn_enc, LinearDecoder(n_tok, emb_sz, dropout, tie_encoder=enc))


class RNN_Encoder(nn.Module):

    """A custom RNN encoder network that uses
        - an embedding matrix to encode input,
        - a stack of LSTM layers to drive the network, and
        - variational dropouts in the embedding and LSTM layers

        The architecture for this network was inspired by the work done in
        "Regularizing and Optimizing LSTM Language Models".
        (https://arxiv.org/pdf/1708.02182.pdf)
    """

    initrange=0.1

    def __init__(self, ntoken, emb_sz, nhid, nlayers, pad_token, bidir=False,
                 dropouth=0.3, dropouti=0.65, dropoute=0.1, wdrop=0.5):
        """ Default constructor for the RNN_Encoder class

            Args:
                bs (int): batch size of input data
                ntoken (int): number of vocabulary (or tokens) in the source dataset
                emb_sz (int): the embedding size to use to encode each token
                nhid (int): number of hidden activation per LSTM layer
                nlayers (int): number of LSTM layers to use in the architecture
                pad_token (int): the int value used for padding text.
                dropouth (float): dropout to apply to the activations going from one LSTM layer to another
                dropouti (float): dropout to apply to the input layer.
                dropoute (float): dropout to apply to the embedding layer.
                wdrop (float): dropout used for a LSTM's internal (or hidden) recurrent weights.

            Returns:
                None
          """

        super().__init__()
        self.ndir = 2 if bidir else 1
        self.bs = 1
        self.encoder = nn.Embedding(ntoken, emb_sz, padding_idx=pad_token)
        self.encoder_with_dropout = EmbeddingDropout(self.encoder)
        self.rnns = [nn.LSTM(emb_sz if l == 0 else nhid, (nhid if l != nlayers - 1 else emb_sz)//self.ndir,
             1, bidirectional=bidir) for l in range(nlayers)]
        if wdrop: self.rnns = [WeightDrop(rnn, wdrop) for rnn in self.rnns]
        self.rnns = torch.nn.ModuleList(self.rnns)
        self.encoder.weight.data.uniform_(-self.initrange, self.initrange)

        self.emb_sz,self.nhid,self.nlayers,self.dropoute = emb_sz,nhid,nlayers,dropoute
        self.dropouti = LockedDropout(dropouti)
        self.dropouths = nn.ModuleList([LockedDropout(dropouth) for l in range(nlayers)])

    def forward(self, input):
        """ Invoked during the forward propagation of the RNN_Encoder module.
        Args:
            input (Tensor): input of shape (sentence length x batch_size)

        Returns:
            raw_outputs (tuple(list (Tensor), list(Tensor)): list of tensors evaluated from each RNN layer without using
            dropouth, list of tensors evaluated from each RNN layer using dropouth,
        """
        sl,bs = input.size()
        if bs!=self.bs:
            self.bs=bs
            self.reset()
        with set_grad_enabled(self.training):
            emb = self.encoder_with_dropout(input, dropout=self.dropoute if self.training else 0)
            emb = self.dropouti(emb)
            raw_output = emb
            new_hidden,raw_outputs,outputs = [],[],[]
            for l, (rnn,drop) in enumerate(zip(self.rnns, self.dropouths)):
                current_input = raw_output
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    raw_output, new_h = rnn(raw_output, self.hidden[l])
                new_hidden.append(new_h)
                raw_outputs.append(raw_output)
                if l != self.nlayers - 1: raw_output = drop(raw_output)
                outputs.append(raw_output)

            self.hidden = repackage_var(new_hidden)
        return raw_outputs, outputs

    def one_hidden(self, l):
        nh = (self.nhid if l != self.nlayers - 1 else self.emb_sz)//self.ndir
        if IS_TORCH_04: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_())
        else: return Variable(self.weights.new(self.ndir, self.bs, nh).zero_(), volatile=not self.training)

    def reset(self):
        self.weights = next(self.parameters()).data
        self.hidden = [(self.one_hidden(l), self.one_hidden(l)) for l in range(self.nlayers)]
        
        
        
        
def get_rnn_classifer(bptt, max_seq, n_class, n_tok, emb_sz, n_hid, n_layers, pad_token, layers, drops, bidir=False,
                      dropouth=0.3, dropouti=0.5, dropoute=0.1, wdrop=0.5):
    rnn_enc = MultiBatchRNN(bptt, max_seq, n_tok, emb_sz, n_hid, n_layers, pad_token=pad_token, bidir=bidir,
                      dropouth=dropouth, dropouti=dropouti, dropoute=dropoute, wdrop=wdrop)
    return SequentialRNN(rnn_enc, PoolingLinearClassifier(layers, drops))


class LinearBlock(nn.Module):
    def __init__(self, ni, nf, drop):
        super().__init__()
        self.lin = nn.Linear(ni, nf)
        self.drop = nn.Dropout(drop)
        self.bn = nn.BatchNorm1d(ni)

    def forward(self, x): return self.lin(self.drop(self.bn(x)))

    
class PoolingLinearClassifier(nn.Module):
    def __init__(self, layers, drops):
        super().__init__()
        self.layers = nn.ModuleList([
            LinearBlock(layers[i], layers[i + 1], drops[i]) for i in range(len(layers) - 1)])

    def pool(self, x, bs, is_max):
        f = F.adaptive_max_pool1d if is_max else F.adaptive_avg_pool1d
        return f(x.permute(1,2,0), (1,)).view(bs,-1)

    def forward(self, input):
        raw_outputs, outputs = input
        output = outputs[-1]
        sl,bs,_ = output.size()
        avgpool = self.pool(output, bs, False)
        mxpool = self.pool(output, bs, True)
        x = torch.cat([output[-1], mxpool, avgpool], 1)
        for l in self.layers:
            l_x = l(x)
            x = F.relu(l_x)
        return l_x, raw_outputs, outputs
```