In [1]:
from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.modeling import BertForPreTraining, BertConfig, BertForMaskedLM, BertForSequenceClassification
from pathlib import Path
import torch

from fastai.text import Tokenizer, Vocab
import pandas as pd
import collections
import os
from tqdm import tqdm, trange
import sys
import random
import numpy as np
import apex
from sklearn.model_selection import train_test_split

import datetime
    
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.utils.data.distributed import DistributedSampler
from pytorch_pretrained_bert.optimization import BertAdam

from energy_bert.modeling import BertForMultiLabelSequenceClassification
from energy_bert.data import BertDataBunch, InputExample, InputFeatures, MultiLabelTextProcessor, convert_examples_to_features
from energy_bert.learner import BertLearner
from energy_bert.metrics import accuracy

In [2]:
# Release unused memory held by PyTorch's CUDA caching allocator before the run starts.
torch.cuda.empty_cache()

In [3]:
# Show full cell contents when displaying DataFrames (no column-width truncation).
# Newer pandas expects None for "unlimited" and rejects the legacy -1 sentinel,
# while very old releases only accept an int -- so try None first, then fall back.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

# Timestamp identifying this run; used below when naming the log file.
run_start_time = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [4]:
# --- Input locations ---------------------------------------------------------
DATA_PATH = Path('../data/')
AUG_DATA_PATH = Path('../data/data_augmentation/')
LABEL_PATH = Path('../labels/')

# --- Output locations (created if they do not exist yet) ---------------------
MODEL_PATH=Path('../models/')
LOG_PATH=Path('../logs/')
MODEL_PATH.mkdir(exist_ok=True)
LOG_PATH.mkdir(exist_ok=True)

# --- Pretrained BERT weights; alternatives kept for quick switching ----------
# BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/cased_L-12_H-768_A-12/')
BERT_PRETRAINED_PATH = Path('../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/')
# BERT_PRETRAINED_PATH = Path('../../bert_fastai/pretrained-weights/uncased_L-24_H-1024_A-16/')

# --- Checkpoint handed to the learner as `finetuned_wgts_path` ---------------
FINETUNED_PATH = Path('../models/intent_language_model_2019-02-01_00-37-04.bin')
# FINETUNED_PATH = None

# Optional explicit state dict; left unset in this run.
model_state_dict = None
# model_state_dict = torch.load(FINETUNED_PATH)

In [5]:
# Run configuration. Every key appears exactly once: the original literal listed
# "no_cuda" twice, which Python silently collapses (the last value wins) and
# which hides typos when someone edits only one of the two entries.
args = {
    # NOTE(review): the label says "bert large", but BERT_PRETRAINED_PATH points at
    # L-12_H-768 weights -- confirm the label, since it ends up in the log file name.
    "run_text": "Intent classification finetuned model bert large",
    "train_size": -1,
    "val_size": -1,
    "log_path": LOG_PATH,
    "full_data_dir": DATA_PATH,
    "data_dir": DATA_PATH,
    "task_name": "intent_classification_lib",
    "no_cuda": False,
    "bert_model": BERT_PRETRAINED_PATH,
    "output_dir": MODEL_PATH/'output',
    "max_seq_length": 512,
    "do_train": True,
    "do_eval": True,
    "do_lower_case": True,
    "train_batch_size": 32,
    "eval_batch_size": 32,
    "learning_rate": 8e-5,
    "num_train_epochs": 12.0,
    "warmup_proportion": 0.1,
    "local_rank": -1,
    "seed": 42,
    "gradient_accumulation_steps": 1,
    "optimize_on_cpu": False,
    "fp16": False,
    "loss_scale": 128
}

In [6]:
import logging

# One log file per run, named after the start timestamp and the run label.
logfile = str(LOG_PATH.joinpath('log-{}-{}.txt'.format(run_start_time, args["run_text"])))

# Send every INFO-and-above record both to the run's log file and to the
# notebook output (stdout).
logging.basicConfig(
    handlers=[logging.FileHandler(logfile), logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)

# Root logger; handed to the learner further down.
logger = logging.getLogger()

In [7]:
# Record the full run configuration in the log so the run can be traced later.
logger.info(args)

03/11/2019 23:45:01 - INFO - root -   {'run_text': 'Intent classification finetuned model bert large', 'train_size': -1, 'val_size': -1, 'log_path': PosixPath('../logs'), 'full_data_dir': PosixPath('../data'), 'data_dir': PosixPath('../data'), 'task_name': 'intent_classification_lib', 'no_cuda': False, 'bert_model': PosixPath('../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12'), 'output_dir': PosixPath('../models/output'), 'max_seq_length': 512, 'do_train': True, 'do_eval': True, 'do_lower_case': True, 'train_batch_size': 32, 'eval_batch_size': 32, 'learning_rate': 8e-05, 'num_train_epochs': 12.0, 'warmup_proportion': 0.1, 'local_rank': -1, 'seed': 42, 'gradient_accumulation_steps': 1, 'optimize_on_cpu': False, 'fp16': False, 'loss_scale': 128}


In [8]:
# Build the BERT tokenizer from the vocabulary stored with the pretrained weights;
# lower-casing follows args['do_lower_case'].
tokenizer = BertTokenizer.from_pretrained(BERT_PRETRAINED_PATH, do_lower_case=args['do_lower_case'])

03/11/2019 23:45:01 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file ../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12/vocab.txt


In [9]:
# Select the compute device. Use the GPU unless it was disabled through
# args['no_cuda'] or no CUDA device is available; otherwise fall back to the CPU
# instead of failing later on the first tensor transfer.
device = torch.device('cuda' if torch.cuda.is_available() and not args['no_cuda'] else 'cpu')

In [10]:
# True when more than one CUDA device is visible to this process.
multi_gpu = torch.cuda.device_count() > 1

In [11]:
# Build the data bunch from the train/validation CSVs in the data directory,
# using the label definitions under LABEL_PATH (single-label classification).
databunch = BertDataBunch(
    args['data_dir'],
    LABEL_PATH,
    tokenizer,
    train_file='train_4.csv',
    val_file='val_4.csv',
    bs=args['train_batch_size'],
    maxlen=args['max_seq_length'],
    multi_gpu=multi_gpu,
    multi_label=False,
)

In [12]:
# Number of distinct labels known to the data bunch.
num_labels = len(databunch.labels)

In [13]:
# Display the label count as a quick sanity check.
num_labels

250

In [14]:
# Metrics reported during evaluation: each entry pairs a display name with the
# function that computes it.
metrics = [{'name': 'accuracy', 'function': accuracy}]

In [15]:
# Create the learner from the pretrained BERT weights, passing the checkpoint
# at FINETUNED_PATH as the fine-tuned weights (single-label classification).
learner = BertLearner.from_pretrained_model(
    databunch,
    BERT_PRETRAINED_PATH,
    metrics,
    device,
    logger,
    finetuned_wgts_path=FINETUNED_PATH,
    is_fp16=args['fp16'],
    loss_scale=args['loss_scale'],
    multi_gpu=multi_gpu,
    multi_label=False,
)

03/11/2019 23:45:08 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12 from cache at ../../bert_models/pretrained-weights/uncased_L-12_H-768_A-12
03/11/2019 23:45:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

03/11/2019 23:45:13 - INFO - pytorch_pretrained_bert.modeling -   Weights of BertForSequenceClassification not initialized from pretrained model: ['classifier.weight', 'classifier.bias']
03/11/2019 23:45:13 - INFO - pytorch_pretrained_bert.modeling -   Weights from pretrained model not used in BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.

In [16]:
# list(learner.layer_groups[5][0].parameters())[0].requires_grad

In [17]:
# learner.freeze()
# Start with the model unfrozen.
# NOTE(review): presumably this marks every layer group trainable (fastai-style
# naming) -- confirm against energy_bert.learner.
learner.unfreeze()

In [18]:
# Train for one epoch at the configured learning rate; loss and evaluation
# metrics are written to the log.
learner.fit(1, lr=args['learning_rate'])



03/11/2019 23:47:55 - INFO - root -   Loss after epoch 0 - 2.1024376782530885
03/11/2019 23:47:55 - INFO - root -   Running evaluation


03/11/2019 23:48:17 - INFO - root -   Eval results:
03/11/2019 23:48:17 - INFO - root -     eval_loss = 1.2320932297423335
03/11/2019 23:48:17 - INFO - root -     metrics = {'accuracy': 0.7342679127725856}
03/11/2019 23:48:17 - INFO - root -   --------------------------------------------------------------------------------


In [19]:
# Gradual-unfreezing step: restrict training via freeze_to(-2), then train one epoch.
# NOTE(review): assumes the fastai convention that freeze_to(-n) leaves only the
# last n layer groups trainable -- confirm against energy_bert.learner.
learner.freeze_to(-2)
learner.fit(1, lr=args['learning_rate'])



03/11/2019 23:49:42 - INFO - root -   Loss after epoch 0 - 0.9195217793396034
03/11/2019 23:49:42 - INFO - root -   Running evaluation


03/11/2019 23:50:03 - INFO - root -   Eval results:
03/11/2019 23:50:03 - INFO - root -     eval_loss = 0.9282807821389472
03/11/2019 23:50:03 - INFO - root -     metrics = {'accuracy': 0.7847352024922118}
03/11/2019 23:50:03 - INFO - root -   --------------------------------------------------------------------------------


In [20]:
# Gradual-unfreezing step: restrict training via freeze_to(-3), then train one epoch.
# NOTE(review): assumes the fastai convention that freeze_to(-n) leaves only the
# last n layer groups trainable -- confirm against energy_bert.learner.
learner.freeze_to(-3)
learner.fit(1, lr=args['learning_rate'])



03/11/2019 23:51:48 - INFO - root -   Loss after epoch 0 - 0.8139195168210615
03/11/2019 23:51:48 - INFO - root -   Running evaluation


03/11/2019 23:52:10 - INFO - root -   Eval results:
03/11/2019 23:52:10 - INFO - root -     eval_loss = 0.7524081835947415
03/11/2019 23:52:10 - INFO - root -     metrics = {'accuracy': 0.82398753894081}
03/11/2019 23:52:10 - INFO - root -   --------------------------------------------------------------------------------


In [21]:
# Gradual-unfreezing step: restrict training via freeze_to(-4), then train one epoch.
# NOTE(review): assumes the fastai convention that freeze_to(-n) leaves only the
# last n layer groups trainable -- confirm against energy_bert.learner.
learner.freeze_to(-4)
learner.fit(1, lr=args['learning_rate'])



03/11/2019 23:54:16 - INFO - root -   Loss after epoch 0 - 0.749607569785331
03/11/2019 23:54:16 - INFO - root -   Running evaluation


03/11/2019 23:54:37 - INFO - root -   Eval results:
03/11/2019 23:54:37 - INFO - root -     eval_loss = 0.689724515422736
03/11/2019 23:54:37 - INFO - root -     metrics = {'accuracy': 0.8398753894080997}
03/11/2019 23:54:37 - INFO - root -   --------------------------------------------------------------------------------


In [22]:
# learner.freeze_to(-5)
# learner.fit(1, lr=args['learning_rate'])

In [None]:
# Final stage: unfreeze the model again and train for six more epochs.
# NOTE(review): this cell has no execution count and the saved output shows only
# epochs 0-3, so the run looks like it was still in progress or interrupted when
# the notebook was saved -- re-run to completion before relying on the results.
learner.unfreeze()
learner.fit(6, lr=args['learning_rate'])



03/12/2019 00:29:36 - INFO - root -   Loss after epoch 0 - 0.29133536731338977
03/12/2019 00:29:36 - INFO - root -   Running evaluation


03/12/2019 00:29:58 - INFO - root -   Eval results:
03/12/2019 00:29:58 - INFO - root -     eval_loss = 0.717329169190166
03/12/2019 00:29:58 - INFO - root -     metrics = {'accuracy': 0.8398753894080997}
03/12/2019 00:29:58 - INFO - root -   --------------------------------------------------------------------------------
03/12/2019 00:32:28 - INFO - root -   Loss after epoch 1 - 0.21284127509031936
03/12/2019 00:32:28 - INFO - root -   Running evaluation


03/12/2019 00:32:50 - INFO - root -   Eval results:
03/12/2019 00:32:50 - INFO - root -     eval_loss = 0.6175055609518053
03/12/2019 00:32:50 - INFO - root -     metrics = {'accuracy': 0.8610591900311526}
03/12/2019 00:32:50 - INFO - root -   --------------------------------------------------------------------------------
03/12/2019 00:35:20 - INFO - root -   Loss after epoch 2 - 0.09546715827201199
03/12/2019 00:35:20 - INFO - root -   Running evaluation


03/12/2019 00:35:41 - INFO - root -   Eval results:
03/12/2019 00:35:41 - INFO - root -     eval_loss = 0.6180045382387125
03/12/2019 00:35:41 - INFO - root -     metrics = {'accuracy': 0.8672897196261682}
03/12/2019 00:35:41 - INFO - root -   --------------------------------------------------------------------------------
03/12/2019 00:38:11 - INFO - root -   Loss after epoch 3 - 0.04666820316166425
03/12/2019 00:38:11 - INFO - root -   Running evaluation


03/12/2019 00:38:33 - INFO - root -   Eval results:
03/12/2019 00:38:33 - INFO - root -     eval_loss = 0.6320682714259861
03/12/2019 00:38:33 - INFO - root -     metrics = {'accuracy': 0.8735202492211838}
03/12/2019 00:38:33 - INFO - root -   --------------------------------------------------------------------------------


In [24]:
# learner.save_and_reload(MODEL_PATH, 'intent_classification_lib_{}'.format(run_start_time))

In [25]:
# learner.fit(4, args['learning_rate']/10)

In [26]:
# Hand-picked customer utterances for a qualitative check of the trained model.
# NOTE(review): some samples contain what look like real customer identifiers
# (e.g. a policy number) -- consider redacting before sharing the notebook.
texts = [
    "I have a Homecare Agreement - policy number 911000810357. I have an engineer booked between 12-6pm under this policy, as yet I have not heard anything and I wondered if you could check if someone would be coming to my property re my boiler.",
    "You are joke I am finding anouther provider",
    "I am trying to submit a complaint but it keeps saying Opps something went wrong",
    "why Increase in my direct debit when I have a credit balance???",
    "how to get emergency credit on my gas meter",
    "Why has my annual boiler care agreement had a rise 23% from 1st of January.",
    "i cant find it.. i must have saved it.. let me get back to you",
    "I booked a boiler service for mon 26/11/2018 and no-one turned up",
    "I want to add another account holder but i cant seem to do so",
    "My tenant has moved out of my property and I want to transfer the gas and electricity accounts back into my name",
    "I have mis-placed my payment card",
    "why are you keeping my card details without my permission",
    "Adjust direct debit for new property which is smaller than original. You are still charging at the old rate which means we are in permanent credit. Would you please adjust this for our new property and provide us with a refund of the present credit",
    "All I want to do is simply change the account that my direct debit gets taken from. Why can I not find this option?",
    "trying to get a refund",
    "I am paying too much a month and want to lower my monthly payments please",
    "i want to sort out my bill as you have the wrong readings",
    "would like to know when my direct debit payment was increased",
    "calling to pay a bill as online access has been locked declined",
    "need a bill breakdown of gas and electric usage.",
    "I have had a meter change from an imperial to metric - can you tell me the revised conversion formula to change cubic metres into kilowatt hours on bills",
    "I would just like to submit a meter reading please.",
    "I am trying to find my MPRN.",
    "trying to submit my meter reading",
    "I want to know how to switch to British Gas from other supplier",
    "Finding out if British Gas supplies my gas in my new home",
    "I  have received a letter saying my annual service has been booked for January 18th but my last one was 12 june and so it is not due until June, Please advise",
    "Hi there, I'm looking to check when my last gas service was done? The system is telling me that it was October/November time - we had an engineer visit then but don't think we had a service and didn't receive any paperwork? Can you help?",
    "When my last annual service took place.",
    "Change date of service appointment",
    "lost track of account details need engineer appointment please",
]

# Run the model over all sample utterances in a single batch.
results = learner.predict_batch(texts)


In [27]:
# Inspect the (label, score) pairs for the second-to-last entry of `results`.
# NOTE(review): presumably results follow the order of `texts`, making this the
# prediction for "Change date of service appointment" -- confirm in predict_batch.
results[-2]

[('change-appointment', 0.92566013),
 ('change-appointment_service', 0.056371145),
 ('change-appointment_boiler', 0.008903752),
 ('change-payment', 0.0020517344),
 ('cancel-appointment', 0.0012374284),
 ('change-install_smart_meter', 0.0007185378),
 ('enquire-appointment_when', 0.00035999564),
 ('change-direct_debit', 0.00029664714),
 ('change-account_detail', 0.00022843764),
 ('change-address', 0.00017588034),
 ('change-vague', 0.00014699655),
 ('enquire-meter_exchange', 0.00013882756),
 ('pay-bill_energy', 0.0001292206),
 ('request-appointment', 0.0001262744),
 ('report-card_lost', 0.00011987125),
 ('change-phone', 9.9598576e-05),
 ('enquire-install', 9.087401e-05),
 ('enquire-appointment', 7.856356e-05),
 ('setup-direct_debit', 6.90331e-05),
 ('enquire-appointment_smart_meter', 6.5792105e-05),
 ('request-install_meter_payge', 6.190123e-05),
 ('cancel-direct_debit', 6.087178e-05),
 ('report-meter_reading_wrong', 5.8952217e-05),
 ('enquire-meter_number', 5.8566788e-05),
 ('request-car