In [1]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForSequenceClassification
from pathlib import Path
import torch

import pandas as pd
import sys
import random
import numpy as np
import os
from tqdm import tqdm, trange
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

In [2]:
# Fine-tuned classifier weights; read with torch.load() inside load_model().
MODEL_PATH=Path('../models/intent_classification_model_v6.bin')
# Pretrained BERT directory; passed to from_pretrained() for both the model
# and the tokenizer.
BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-12_H-768_A-12/')
# Header-less CSV; load_model() takes its first column as the label names.
LABEL_PATH = Path('../labels/labels.csv')

In [3]:
# Show full cell contents when displaying DataFrames. `None` means "no limit";
# the legacy sentinel `-1` is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)

In [4]:
class InputExample:
    """One raw (untokenized) example for sequence classification.

    Attributes:
        guid: Unique id for the example.
        text_a: Untokenized text of the first sequence. For single-sequence
            tasks this is the only text that has to be supplied.
        text_b: Optional untokenized text of the second sequence; only
            needed for sequence-pair tasks.
        label: Optional label of the example. Expected for train and dev
            examples, not for test examples.
    """

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the identifier, the text(s) and the optional label."""
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label


class InputFeatures:
    """Model-ready features for a single example.

    Holds the values it is constructed with, unchanged: `input_ids`,
    `input_mask`, `segment_ids` and `label_id`.
    """

    def __init__(self, input_ids, input_mask, segment_ids, label_id):
        """Keep references to the given feature values."""
        self.input_ids, self.input_mask = input_ids, input_mask
        self.segment_ids, self.label_id = segment_ids, label_id

In [5]:
def convert_examples_to_features(examples, label_list, max_seq_length, tokenizer):
    """Convert examples into padded, fixed-length `InputFeatures`.

    Args:
        examples: Iterable of objects exposing `text_a`, `text_b` and `label`.
        label_list: Ordered labels; a label's position in this list is its id.
        max_seq_length: Length of every produced sequence, including the
            [CLS]/[SEP] markers and the zero padding.
        tokenizer: Object providing `tokenize(text)` and
            `convert_tokens_to_ids(tokens)`.

    Returns:
        A list with one `InputFeatures` per example. `label_id` is the index
        of the example's label in `label_list`, or `''` when the example has
        no (truthy) label.

    Raises:
        KeyError: If an example's label is not present in `label_list`.
    """

    label_map = {label: i for i, label in enumerate(label_list)}

    features = []
    for example in examples:
        tokens_a = tokenizer.tokenize(example.text_a)

        tokens_b = None
        if example.text_b:
            tokens_b = tokenizer.tokenize(example.text_b)
            # Trim the pair in place so the total fits the budget, always
            # dropping a token from the end of the longer sequence.
            # Account for [CLS], [SEP], [SEP] with "- 3".
            # (Inlined: no `_truncate_seq_pair` helper is defined in this
            # notebook, so calling one would raise NameError.)
            pair_budget = max_seq_length - 3
            while len(tokens_a) + len(tokens_b) > pair_budget:
                if len(tokens_a) > len(tokens_b):
                    tokens_a.pop()
                else:
                    tokens_b.pop()
        else:
            # Account for [CLS] and [SEP] with "- 2"
            if len(tokens_a) > max_seq_length - 2:
                tokens_a = tokens_a[:(max_seq_length - 2)]

        # The convention in BERT is:
        # (a) For sequence pairs:
        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
        #  type_ids: 0   0  0    0    0     0       0 0    1  1  1  1   1 1
        # (b) For single sequences:
        #  tokens:   [CLS] the dog is hairy . [SEP]
        #  type_ids: 0   0   0   0  0     0 0
        #
        # Where "type_ids" are used to indicate whether this is the first
        # sequence or the second sequence. The embedding vectors for `type=0` and
        # `type=1` were learned during pre-training and are added to the wordpiece
        # embedding vector (and position vector). This is not *strictly* necessary
        # since the [SEP] token unambiguously separates the sequences, but it makes
        # it easier for the model to learn the concept of sequences.
        #
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
        segment_ids = [0] * len(tokens)

        if tokens_b:
            tokens += tokens_b + ["[SEP]"]
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids)

        # Zero-pad up to the sequence length.
        padding = [0] * (max_seq_length - len(input_ids))
        input_ids += padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length

        # Unlabelled examples (e.g. at inference time) get '' as a placeholder.
        if example.label:
            label_id = label_map[example.label]
        else:
            label_id = ''

        features.append(
                InputFeatures(input_ids=input_ids,
                              input_mask=input_mask,
                              segment_ids=segment_ids,
                              label_id=label_id))
    return features

In [None]:
def load_training_data(processor, DATAPATH, filename='train.csv'):
    """Build a training DataLoader from the examples supplied by `processor`.

    NOTE(review): this function reads the notebook-level names `args`,
    `label_list`, `tokenizer` and `logger`. None of them is defined in the
    cells before this one (only `tokenizer` is created, further down), so
    confirm they exist before calling.

    Args:
        processor: Object providing
            `get_train_examples(DATAPATH, filename=..., size=...)`.
        DATAPATH: Location handed through to `processor.get_train_examples`.
        filename: Name of the training file handed through to the processor.

    Returns:
        A `(train_dataloader, train_examples)` tuple.
    """
    # The notebook's import cell only brings in SequentialSampler, so the
    # samplers used here are imported locally.
    from torch.utils.data import RandomSampler

    train_examples = processor.get_train_examples(DATAPATH, filename=filename, size=args['train_size'])

    train_features = convert_examples_to_features(train_examples, label_list, args['max_seq_length'], tokenizer)
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_examples))
    logger.info("  Batch size = %d", args['train_batch_size'])
    all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
    all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
    train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
    if args['local_rank'] == -1:
        # local_rank == -1 selects plain random sampling.
        train_sampler = RandomSampler(train_data)
    else:
        from torch.utils.data.distributed import DistributedSampler
        train_sampler = DistributedSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args['train_batch_size'])

    return train_dataloader, train_examples

In [7]:
def predict_batch(model, label_list, texts, max_seq_length=512, batch_size=32, device='cuda'):
    """Classify `texts` and return per-label probabilities, best label first.

    Uses the notebook-level `tokenizer` to build the model inputs.

    Args:
        model: Callable as `model(input_ids, segment_ids, input_mask)` that
            returns one row of logits per input; must also provide `eval()`.
        label_list: Label names, in the order of the model's output columns.
        texts: Iterable of raw strings to classify.
        max_seq_length: Every input is truncated/padded to this many tokens.
        batch_size: Number of texts sent through the model at once.
        device: Device the input tensors are moved to before the forward pass.

    Returns:
        One list per input text, holding `(label, probability)` tuples sorted
        by descending probability. An empty input yields an empty list.
    """
    examples = [InputExample(index, text) for index, text in enumerate(texts)]
    if not examples:
        return []

    test_features = convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in test_features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in test_features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in test_features], dtype=torch.long)

    test_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)

    # Sequential sampling keeps the output rows aligned with `texts`.
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    model.eval()
    batch_probs = []
    for input_ids, input_mask, segment_ids in test_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)
            # Turn the logits into a probability distribution over labels.
            probs = logits.softmax(dim=1)

        batch_probs.append(probs.detach().cpu().numpy())

    # Concatenate once at the end instead of re-copying after every batch.
    all_probs = np.concatenate(batch_probs, axis=0)

    result_df = pd.DataFrame(all_probs, columns=label_list)
    # 'records' is the full orient name; the abbreviated 'record' is
    # deprecated and rejected by newer pandas releases.
    results = result_df.to_dict('records')

    return [sorted(x.items(), key=lambda kv: kv[1], reverse=True) for x in results]

In [8]:
# NOTE(review): the original statement was truncated to `device = `, which is a
# SyntaxError. Reconstructed as a CUDA-availability check to match the cell's
# recorded `True` output — confirm against the original notebook.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
torch.cuda.is_available()

True

In [44]:
def load_model(device='cuda'):
    """Load the label list and the fine-tuned BERT classifier.

    Args:
        device: Device the model is moved to after loading.

    Returns:
        A `(labels, model)` tuple: the label names read from the first column
        of `LABEL_PATH` (rows with missing values dropped) and the
        `BertForSequenceClassification` model placed on `device`.
    """
    labels = list(pd.read_csv(LABEL_PATH, header=None).dropna()[0].values)

    # Load a trained model that you have fine-tuned.
    # map_location='cpu' lets a checkpoint saved on a GPU load on any host;
    # the model is moved to the requested device below.
    # NOTE(security): torch.load unpickles the file — only load trusted checkpoints.
    model_state_dict = torch.load(MODEL_PATH, map_location='cpu')
    model = BertForSequenceClassification.from_pretrained(
        BERT_PRETRAINED_PATH, num_labels=len(labels), state_dict=model_state_dict)
    # Half precision is only applied on CUDA; fp16 kernels are not reliably
    # available on CPU, where the model stays in fp32.
    if str(device).startswith('cuda'):
        model.half()
    model.to(device)
    return labels, model
    

In [45]:
# Load the label names and the fine-tuned model on load_model()'s default device ('cuda').
labels, model = load_model()

In [46]:
# Tokenizer built from the same pretrained directory as the model;
# predict_batch() reads it as a notebook-level global.
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH, do_lower_case=True)

In [47]:
# Smoke test: classify one utterance. The cell output lists every label with
# its probability, highest first.
text = "I need to submit my meter reads"
predict_batch(model, labels, [text])

[[('report-meter_reading', 0.9907),
  ('enquire-meter_reading', 0.00313),
  ('request-appointment_meter_reading', 0.00211),
  ('report-meter_reading_opening', 0.0004838),
  ('problem-meter_reading', 0.000151),
  ('request-advisor', 0.0001475),
  ('enquire-bill_estimate', 0.0001317),
  ('enquire-meter', 0.000123),
  ('enquire-appointment_today', 8.83e-05),
  ('change-address', 8.506e-05),
  ('report-meter_reading_final', 8.31e-05),
  ('enquire-service', 7.3e-05),
  ('request-smart_meter', 5.704e-05),
  ('enquire-tariff', 5.06e-05),
  ('enquire-nectar', 4.584e-05),
  ('enquire-meter_number', 4.56e-05),
  ('enquire-account_online', 4.36e-05),
  ('cancel-homecare', 4.196e-05),
  ('request-balance', 4.137e-05),
  ('enquire-appointment_when', 4.023e-05),
  ('enquire-annual_service', 3.886e-05),
  ('report-meter_reading_wrong', 3.75e-05),
  ('pay-bill', 3.52e-05),
  ('enquire-customer_number', 3.44e-05),
  ('request-install_meter', 3.2e-05),
  ('cancel-agreement', 3e-05),
  ('enquire-usage', 