In [1]:
from fastai.text import *
from fastai.callbacks import SaveModelCallback, EarlyStoppingCallback
import torch
import torch.nn as nn
from fasttext import FastText
from gensim.models import KeyedVectors
import zipfile

In [2]:
UNK, PAD, BOS, EOS = '<UNK>', '<PAD>', '<BOS>', '<EOS>'
LABELS = ['entailment', 'neutral', 'contradiction']
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

In [3]:
class WordVocab(Vocab):
    def numericalize(self, t:Collection[str], lowercase=True) -> List[int]:
        return [self.stoi[self.process_word(w, lowercase)] for w in t]

    @staticmethod
    def process_word(word: str, lowercase=True):
        if lowercase:
            word = word.lower()
        return word

In [4]:
def load_pretrained_vocab_embedding(type: str, path, is_zip=True, inside_zip_fn=None):
    def load_pretrained_embedding_from_fasttext_cc_model(path):
        model = FastText.load_model(path)

        embedding_dim = model.get_dimension()
        words = [UNK, PAD, *model.words, BOS, EOS]
        vocab = WordVocab(words)
        embeddings = nn.Embedding.from_pretrained(
            torch.cat([
                (torch.rand(1, embedding_dim,dtype=torch.float)
                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim))),
                torch.zeros(1, embedding_dim,dtype=torch.float),
                torch.tensor(model.get_input_matrix()),
                (torch.rand(2, embedding_dim,dtype=torch.float)
                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim)))
            ]),
            padding_idx=vocab.stoi[PAD]
        )
        return vocab, embeddings


    def load_pretrained_embedding_from_gensim_fasttext_model(path):
        model = KeyedVectors.load_word2vec_format(path, binary=True)
        embedding_dim = model.vector_size
        words = [UNK, PAD, *model.index2word, BOS, EOS]
        vocab = WordVocab(words)
        embeddings = nn.Embedding.from_pretrained(
            torch.cat([
                (torch.rand(1, embedding_dim, dtype=torch.float)
                                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim))),
                torch.zeros(1, embedding_dim, dtype=torch.float),
                torch.tensor(model.vectors),
                (torch.rand(2, embedding_dim, dtype=torch.float)
                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim)))
            ]),
            padding_idx=vocab.stoi[PAD]
        )
        return vocab, embeddings

    def load_glove_embedding(fn: str, is_zip=True, inside_zip_fn=None):
        assert is_zip and inside_zip_fn is not None, 'Must provide file name inside zip'

        def load_from_buffer_with_pandas(buffer):
            df = pd.read_csv(buffer, sep=' ', header=None, quoting=csv.QUOTE_NONE)
            vocab = df.iloc[:, 0].values
            vectors = df.iloc[:, 1:].values
            return vocab, vectors

        if is_zip:
            with zipfile.ZipFile(fn) as zf:
                with zf.open(inside_zip_fn) as f:
                    vocab, vectors = load_from_buffer_with_pandas(f)
                    f.close()
                zf.close()
        else:
            with open(fn, mode='r', encoding='utf8') as f:
                vocab, vectors = load_from_buffer_with_pandas(f)

        vocab = WordVocab([UNK, PAD, *vocab, BOS, EOS])
        embedding_dim = vectors.shape[1]
        embeddings =  nn.Embedding.from_pretrained(
            torch.cat([
                (torch.rand(1, embedding_dim, dtype=torch.float)
                                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim))),
                torch.zeros(1, embedding_dim, dtype=torch.float),
                torch.tensor(vectors, dtype=torch.float),
                (torch.rand(2, embedding_dim, dtype=torch.float)
                 .uniform_(- math.sqrt(3 / embedding_dim),math.sqrt(3 / embedding_dim)))
            ]),
            padding_idx=vocab.stoi[PAD]
        )
        return vocab, embeddings

    assert type in ['fasttext_cc', 'fasttext_gensim', 'glove']
    if type == 'fasttext_cc':
        return load_pretrained_embedding_from_fasttext_cc_model(path)
    elif type == 'fasttext_cc':
        return load_pretrained_embedding_from_gensim_fasttext_model(path)
    else:
        return load_glove_embedding(path, is_zip=is_zip, inside_zip_fn=inside_zip_fn)


In [5]:
seed = 42
path = Path('.')
train = pd.read_csv('data/csv/train.csv', sep='\t'); train.fillna('', inplace=True)
test = pd.read_csv('data/csv/test.csv', sep='\t'); test.fillna('', inplace=True)
dev = pd.read_csv('data/csv/dev.csv', sep='\t'); dev.fillna('', inplace=True)

In [6]:
class PairPreProcessor(PreProcessor):
    def __init__(self, vocab: WordVocab, tokenizer: BaseTokenizer=None, ds: Collection=None):
        super(PairPreProcessor, self).__init__(ds=ds)
        self.vocab = vocab
        self.tokenizer = Tokenizer(post_rules=[], pre_rules=[])
        self.tok = tokenizer if tokenizer else SpacyTokenizer('en')

    def process_one(self, item:Any):
        premise, hypothesis = item
        premise_words, hypothesis_words = (
            self.tokenizer.process_text(premise, self.tok),
            self.tokenizer.process_text(hypothesis, self.tok)
        )
        premise_token_ids, hypothesis_token_ids = (
            self.vocab.numericalize([BOS, *premise_words, EOS]), self.vocab.numericalize([BOS, *hypothesis_words, EOS])
        )
        return premise_token_ids, hypothesis_token_ids

class LabelPreProcessor(PreProcessor):
    def __init__(self, vocab: Vocab):
        super(LabelPreProcessor, self).__init__()
        self.vocab = vocab

    def process_one(self, item:Any):
        return self.vocab.stoi[item]

In [7]:
word_vocab, emb = load_pretrained_vocab_embedding('glove', 'data/pretrained/glove.6B.zip', is_zip=True, inside_zip_fn='glove.6B.300d.txt')
label_vocab = Vocab(LABELS)

pair_processor = PairPreProcessor(word_vocab)
label_processor = LabelPreProcessor(vocab=label_vocab)

In [8]:
train_il = ItemList(
    items=train[['premise', 'hypothesis']].values,
    processor=pair_processor
).process()

dev_il = ItemList(
    items=dev[['premise', 'hypothesis']].values,
    processor=pair_processor
).process()

test_il = ItemList(
    items=test[['premise', 'hypothesis']].values,
    processor=pair_processor
).process()

In [9]:
train_tl = ItemList(
    items=train['label'].values,
    processor=label_processor
).process()

dev_tl = ItemList(
    items=dev['label'].values,
    processor=label_processor
).process()

test_tl = ItemList(
    items=test['label'].values,
    processor=label_processor
).process()

In [10]:
train_ll = LabelList(x=train_il, y=train_tl)
dev_ll = LabelList(x=dev_il, y=dev_tl)
test_ll = LabelList(x=test_il, y=test_tl)

In [11]:
class CollateFn:
    def __init__(
            self,
            word_vocab: WordVocab,
            label_vocab: Vocab,
            pad_first=False
    ):
        self.word_vocab = word_vocab
        self.label_vocab = label_vocab
        self.pad_first = pad_first

        self.word_pad_idx = self.word_vocab.stoi[PAD]

    @staticmethod
    def padding_collate(samples, pad_idx=1, pad_first=False, sort=False):
        lengths = [len(s) for s in samples]
        max_len = max(lengths)
        res = torch.zeros(len(samples), max_len).long() + pad_idx
        for i,s in enumerate(samples):
            if pad_first: res[i, -len(s):] = LongTensor(s)
            else:         res[i, :len(s) ] = LongTensor(s)

        if sort:
            args_sort = torch.tensor(lengths, dtype=torch.long).argsort(descending=True)
            recover_idxs = torch.argsort(args_sort)
            return res, args_sort, recover_idxs

        return res

    def __call__(self, batch_data):
        batch_data_sort_by_promises = sorted(batch_data, key=lambda x: len(x[0][0]), reverse=True)
        xs, ys = zip(*batch_data_sort_by_promises)

        promises, hypothesises = zip(*xs)

        # pad promises, hypothesises
        pad_promises, pad_hypothesises = (
            self.padding_collate(promises, pad_idx=self.word_pad_idx, pad_first=self.pad_first),
            self.padding_collate(hypothesises, pad_idx=self.word_pad_idx, pad_first=self.pad_first, sort=True)
        )

        # convert ys to tensor
        ys = torch.tensor(ys, dtype=torch.long)

        return (pad_promises, pad_hypothesises), ys

In [42]:
bs = 3
databunch = DataBunch.create(
    train_ds=train_ll,
    valid_ds=dev_ll,
    test_ds=test_ll,
    collate_fn=CollateFn(word_vocab=word_vocab, label_vocab=label_vocab),
    bs=bs,
    device=device
)

In [43]:
(premise, (hypothesis, hypothesis_args_sort, hypothesis_recover_idxs)), labels = databunch.one_batch()
premise.size(), hypothesis.size(), hypothesis_args_sort.size(), hypothesis_recover_idxs.size(), labels.size()

(torch.Size([3, 20]),
 torch.Size([3, 10]),
 torch.Size([3]),
 torch.Size([3]),
 torch.Size([3]))

In [None]:
class RNNDropout(nn.Module):
    def __init__(self, p=0., batch_first=True):
        super(RNNDropout, self).__init__()
        self.p = p
        self.batch_first = True

    def forward(self, x):
        if not self.training or self.p == 0.:
            return x
        if self.batch_first:
            return self.dropout_mask(x, (x.size(0), 1, x.size(2)), p=self.p) * x
        else:
            return self.dropout_mask(x, (1, x.size(1), x.size(2)), p=self.p) * x

    @staticmethod
    def dropout_mask(x, sizes, p):
        return x.new(sizes).bernoulli_(1-p).div_(1-p)

In [44]:
class InputEncoding(nn.Module):
    def __init__(
            self,
            word_vocab: WordVocab,
            embedding_dim=300,
            embeddings: nn.Embedding=None,
            hidden_size=300,
            bidirectional=True,
            p_dropout=0.
    ):
        super(InputEncoding, self).__init__()
        assert embedding_dim is not None or embeddings is not None, 'embedding_dim and embeddings cannot both be none'
        self.word_vocab = word_vocab
        self.bidirectional = bidirectional
        self.p_dropout = p_dropout
        self.pad_idx = word_vocab.stoi[PAD]

        self.embedding = embeddings if embeddings is not None else nn.Embedding(
            len(self.word_vocab.itos),
            embedding_dim=embedding_dim,
            padding_idx=self.pad_idx
        )
        self.embedding_dim = self.embedding.embedding_dim
        self.premise_lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=p_dropout
        )

        self.hypothesis_lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=bidirectional,
            dropout=p_dropout
        )

        self.premise_dropout = RNNDropout(p=p_dropout, batch_first=True)
        self.hypothesis_dropout = RNNDropout(p=p_dropout, batch_first=True)

    def forward(self, inputs):
        assert len(inputs) == 2
        premise_batch, (hypothesis_batch, hypothesis_args_sort, hypothesis_recover_idxs) = inputs

        # sort hypothesis with lengths
        hypothesis_batch = hypothesis_batch[hypothesis_args_sort]

        premise_mask, hypothesis_mask = premise_batch != self.pad_idx, hypothesis_batch != self.pad_idx
        premise_lengths, hypothesis_lengths = torch.sum(premise_mask, dim=1), torch.sum(hypothesis_mask, dim=1)
        packed_premise_batch = nn.utils.rnn.pack_padded_sequence(
            self.premise_dropout(self.embedding(premise_batch)),
            lengths=premise_lengths,
            batch_first=True
        )
        packed_hypothesis_batch = nn.utils.rnn.pack_padded_sequence(
            self.hypothesis_dropout(self.embedding(hypothesis_batch)),
            lengths=hypothesis_lengths,
            batch_first=True,
        )

        packed_premise_output, _ = self.premise_lstm(packed_premise_batch)
        packed_hypothesis_output, _ = self.hypothesis_lstm(packed_hypothesis_batch)

        premise_output, _ = nn.utils.rnn.pad_packed_sequence(packed_premise_output)
        hypothesis_output, _ = nn.utils.rnn.pad_packed_sequence(packed_hypothesis_output)

        # recover hypothesis
        hypothesis_output = hypothesis_output[:, hypothesis_recover_idxs, :]
        hypothesis_lengths = hypothesis_lengths[hypothesis_recover_idxs]

        return (premise_output, premise_lengths), (hypothesis_output, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs)

In [45]:
input_encoding = InputEncoding(
    word_vocab=word_vocab,
    embedding_dim=300,
    embeddings=emb,
    hidden_size=2,
    p_dropout=0
)

In [46]:
((premise_output, premise_lengths),
 (hypothesis_output, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs)) = input_encoding((premise, (hypothesis, hypothesis_args_sort, hypothesis_recover_idxs)))
print(premise_output.size(), premise_lengths.size())
print(hypothesis_output.size(), hypothesis_lengths.size(), hypothesis_args_sort.size(), hypothesis_recover_idxs.size())

torch.Size([20, 3, 4]) torch.Size([3])
torch.Size([10, 3, 4]) torch.Size([3]) torch.Size([3]) torch.Size([3])


In [67]:
class LocalInference(nn.Module):
    def __init__(self):
        super(LocalInference, self).__init__()

    def forward(self, inputs):
        (premise_dash, premise_lengths), (hypothesis_dash, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs) = inputs

        premise_dash = premise_dash.transpose(1, 0)
        hypothesis_dash = hypothesis_dash.transpose(1, 0)

        attention = torch.bmm(premise_dash, hypothesis_dash.transpose(2, 1))
        mask = torch.zeros(attention.size(), device=attention.device)
        for i in range(premise_lengths.size(0)):
            mask[i, :premise_lengths[i], :hypothesis_lengths[i]] = 1

        attention_exp = torch.exp(attention * mask + (1 - mask) * (-1e1))

        attention_normalize_premise = attention_exp / torch.sum(attention_exp, dim=2, keepdim=True) * mask
        attention_normalize_hypothesis = attention_exp / torch.sum(attention_exp, dim=1, keepdim=True) * mask

        premise_tilde = torch.bmm(attention_normalize_premise, hypothesis_dash)
        hypothesis_tilde = torch.bmm(attention_normalize_hypothesis.transpose(1, 2), premise_dash)

        premise_m, hypothesis_m = map(lambda x: torch.cat([x[0], x[1], x[0] - x[1], x[0] * x[1]], dim=2),
                                      [(premise_dash, premise_tilde), (hypothesis_dash, hypothesis_tilde)])

        return (premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs)


In [68]:
local_inference = LocalInference()
premise_dash = (premise_output, premise_lengths)
hypothesis_dash = (hypothesis_output, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs)
print(hypothesis_lengths[hypothesis_args_sort])
(premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_arg_sort, hypothesis_recover_idxs) = local_inference(
    (premise_dash, hypothesis_dash)
)
premise_m.size(), hypothesis_m.size(), premise_lengths.size(), hypothesis_lengths.size(), hypothesis_args_sort.size(), hypothesis_recover_idxs.size()

tensor([10,  9,  9])


(torch.Size([3, 20, 16]),
 torch.Size([3, 10, 16]),
 torch.Size([3]),
 torch.Size([3]),
 torch.Size([3]),
 torch.Size([3]))

In [22]:
class Projection(nn.Module):
    def __init__(
            self,
            in_features,
            out_features,
            bias,
    ):
        super(Projection, self).__init__()
        self.projection = nn.Linear(
            in_features=in_features,
            out_features=out_features,
            bias=bias
        )
        self.relu = nn.ReLU()

    def forward(self, inputs):
        (premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs) = inputs
        premise_m = self.relu(self.projection(premise_m))
        hypothesis_m = self.relu(self.projection(hypothesis_m))
        return (premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs)


In [23]:
class InferenceComposition(nn.Module):
    def __init__(
            self,
            input_size,
            hidden_size=300,
            bidirectional=True,
            p_dropout=0.
    ):
        super(InferenceComposition, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bidirectional = bidirectional
        self.p_dropout = p_dropout

        self.premise_inference_composition = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            dropout=p_dropout,
            batch_first=True
        )

        self.hypothesis_inference_composition = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            bidirectional=bidirectional,
            dropout=p_dropout,
            batch_first=True
        )

        self.premise_inference_dropout = RNNDropout(p=p_dropout, batch_first=True)
        self.hypothesis_inference_dropout = RNNDropout(p=p_dropout, batch_first=True)

    def forward(self, inputs):
        assert len(inputs) == 2
        (premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs) = inputs

        # sort hypothesis with length
        hypothesis_m = hypothesis_m[hypothesis_args_sort]
        hypothesis_lengths = hypothesis_lengths[hypothesis_args_sort]

        packed_premise_m = nn.utils.rnn.pack_padded_sequence(
            self.premise_inference_dropout(premise_m),
            lengths=premise_lengths,
            batch_first=True
        )
        packed_hypothesis_m = nn.utils.rnn.pack_padded_sequence(
            self.hypothesis_inference_dropout(hypothesis_m),
            lengths=hypothesis_lengths,
            batch_first=True,
        )

        packed_premise_v, _ = self.premise_inference_composition(packed_premise_m)
        packed_hypothesis_v, _ = self.hypothesis_inference_composition(packed_hypothesis_m)

        premise_v, _ = nn.utils.rnn.pad_packed_sequence(packed_premise_v)
        hypothesis_v, _ = nn.utils.rnn.pad_packed_sequence(packed_hypothesis_v)

        premise_v_max_pooling, _ = torch.max(premise_v, dim=0)
        hypothesis_v_max_pooling, _ = torch.max(hypothesis_v, dim=0)
        premise_v_avg_pooling = torch.sum(premise_v, dim=0) / premise_v.size(0)
        hypothesis_v_avg_pooling = torch.sum(hypothesis_v, dim=0) / hypothesis_v.size(0)

        return torch.cat([
            premise_v_max_pooling,
            premise_v_avg_pooling,
            hypothesis_v_max_pooling[hypothesis_recover_idxs],
            hypothesis_v_avg_pooling[hypothesis_recover_idxs],
        ], dim=1)



In [24]:
inference_composition = InferenceComposition(
    input_size=4 * 300 * 2,
    hidden_size=300,
    bidirectional=True,
    p_dropout=0.
)

In [25]:
composition = inference_composition(
    ((premise_m, premise_lengths), (hypothesis_m, hypothesis_lengths, hypothesis_args_sort, hypothesis_recover_idxs))
)
composition.size()

torch.Size([1, 16])

In [81]:
class Classifier(nn.Module):
    def __init__(
            self,
            input_size,
            num_classes,
            hidden_sizes: list=None,
            act_func='sigmoid',
            bias=True,
            p_dropout=0,
    ):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.hidden_sizes = hidden_sizes
        self.num_classes = num_classes
        self.act_func = act_func
        self.bias = bias
        self.p_dropout = p_dropout
        self.mlp = self.get_mlp(
            input_size=input_size,
            hidden_sizes=hidden_sizes,
            num_classes=num_classes,
            act_func=act_func,
            bias=bias,
            p_dropout=p_dropout
        )

    def forward(self, input):
        return self.mlp(input)

    @staticmethod
    def get_mlp(input_size, hidden_sizes, num_classes, act_func, bias=True, p_dropout=0.):
        assert num_classes > 1, 'number of classes must be more than one'
        assert act_func in ['sigmoid', 'relu', 'tanh']
        map_act_func = {
            'sigmoid': nn.Sigmoid,
            'relu': nn.ReLU,
            'tanh': nn.Tanh,
        }
        sizes = [input_size, *hidden_sizes, num_classes]
        layers = []
        for i in range(len(sizes) - 1):
            if i != len(sizes) - 2:
                layers.extend([
                    nn.Linear(
                        in_features=sizes[i],
                        out_features=sizes[i + 1],
                        bias=bias
                    ),
                    map_act_func[act_func](),
                    nn.Dropout(p=p_dropout)
                ])
            else:
                layers.append(nn.Linear(
                        in_features=sizes[i],
                        out_features=sizes[i + 1],
                        bias=bias
                    ))
        return nn.Sequential(*layers)
    

In [82]:
classifier = Classifier(
    input_size=4 * 300 * 2,
    num_classes=len(label_vocab.itos),
    hidden_sizes=[512, 256],
    act_func='sigmoid',
    bias=True
)

In [83]:
output = classifier(composition)
output.size()

torch.Size([32, 3])

In [84]:
loss = nn.CrossEntropyLoss()

In [85]:
loss(output, labels)

tensor(1.1197, grad_fn=<NllLossBackward>)

In [109]:
class ESIM(nn.Module):
    def __init__(
            self,
            word_vocab: WordVocab,
            embedding_dim,
            embeddings: nn.Embedding,
            hidden_size,
            # comp_inference_hidden_size,
            bidirectional,
            num_classes,
            hidden_sizes: list,
            act_func='sigmoid',
            bias=True,
            p_dropout=0.,
    ):
        super(ESIM, self).__init__()
        self.esim = self.build_model(
            word_vocab,
            embedding_dim,
            embeddings,
            hidden_size,
            bidirectional,
            num_classes,
            hidden_sizes,
            act_func,
            bias,
            p_dropout,
        )

    def forward(self, *input):
        return self.esim(input)

    @staticmethod
    def build_model(
            word_vocab: WordVocab,
            embedding_dim,
            embeddings: nn.Embedding,
            hidden_size,
            bidirectional,
            num_classes,
            hidden_sizes: list,
            act_func='sigmoid',
            bias=True,
            p_dropout=0.
    ):
        layers = []
        layers.append(
            InputEncoding(
                word_vocab=word_vocab,
                embedding_dim=embedding_dim,
                embeddings=embeddings,
                hidden_size=hidden_size,
                p_dropout=p_dropout
            )
        )

        layers.append(LocalInference())

        output_local_inference_dim = (2 if bidirectional else 1) * hidden_size * 4

        # layers.extend([
        #     nn.Linear(
        #         in_features=output_local_inference_dim,
        #         out_features=hidden_size,
        #         bias=bias
        #     ),
        #     nn.ReLU()
        # ])

        layers.append(
            Projection(
                in_features=output_local_inference_dim,
                out_features=hidden_size,
                bias=bias
            )
        )

        layers.append(
            InferenceComposition(
                input_size=hidden_size,
                hidden_size=hidden_size,
                bidirectional=bidirectional,
                p_dropout=p_dropout
            )
        )
        output_inference_composition_dim = (2 if bidirectional else 1) * hidden_size * 4
        layers.extend(
            [
                nn.Dropout(p=p_dropout),
                Classifier(
                    input_size=output_inference_composition_dim,
                    hidden_sizes=hidden_sizes,
                    num_classes=num_classes,
                    act_func=act_func,
                    bias=bias
                ),
            ]
        )
        return nn.Sequential(*layers)

In [110]:
class Config:
    def __init__( self, *args, **kwargs):
        for k, v in self.default_config().items():
            self.__setattr__(k, v)
        for k, v in kwargs.items():
            self.__setattr__(k, v)

        assert self.__getattribute__('word_vocab') is not None
        assert self.__getattribute__('num_classes') is not None

    @staticmethod
    def default_config():
        hidden_size = 300
        return {
            'word_vocab': None,
            'embedding_dim': 300,
            'embeddings': None,
            'hidden_size': hidden_size,
            'bidirectional': True,
            'num_classes': None,
            'hidden_sizes': [hidden_size],
            'act_func': 'tanh',
            'bias': True,
            'p_dropout': 0.5,
        }

In [111]:
config = Config(
    word_vocab=word_vocab,
    num_classes=len(label_vocab.itos)
)

In [112]:
model = ESIM(
    word_vocab=config.word_vocab,
    embedding_dim=config.embedding_dim,
    embeddings=config.embeddings,
    hidden_size=config.hidden_size,
    bidirectional=config.bidirectional,
    num_classes=config.num_classes,
    hidden_sizes=config.hidden_sizes,
    act_func=config.act_func,
    bias=config.bias,
    p_dropout=config.p_dropout
)

  "num_layers={}".format(dropout, num_layers))


In [113]:
output = model(premise, (hypothesis, hypothesis_args_sort, hypothesis_recover_idxs))
output.size()

torch.Size([32, 3])

In [114]:
model = model.to(device)
loss_func = nn.CrossEntropyLoss()

In [115]:
learner = Learner(
    data=databunch,
    model=model,
    loss_func=loss_func,
    path='.',
    metrics=[accuracy]
)

In [116]:
learner.lr_find()
learner.recorder.plot()

LR Finder is complete, type {learner_name}.recorder.plot() to see the graph.


RuntimeError: CUDA out of memory. Tried to allocate 458.00 MiB (GPU 0; 3.95 GiB total capacity; 2.82 GiB already allocated; 248.62 MiB free; 459.89 MiB cached)

In [None]:
learner.fit(
    epochs=15,
    callbacks=[
      SaveModelCallback(learner, monitor='accuracy', every='improvement', name='best_model'),
      EarlyStoppingCallback(learner, monitor='valid_loss', min_delta=0.01, patience=10),
    ]
)

In [47]:
learner.load('best_model')
learner.evaluate(dl=learner.data.test_dl)

3496