In [1]:
import pandas as pd
from pathlib import Path
import logging
from box import Box
from datetime import datetime
import sys
import torch

In [2]:
from fast_bert import BertLearner, BertDataBunch, accuracy

In [3]:
# Show full cell contents when DataFrames are displayed. `None` is the
# documented "no truncation" value; the legacy `-1` sentinel was deprecated in
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Timestamp taken once per run; it is embedded in the log file name so that
# each run writes to its own file.
run_start_time = datetime.today().strftime('%Y-%m-%d_%H-%M-%S')

In [4]:
# Directory layout for the IMDB sample. DATA_PATH and LABEL_PATH are the
# inputs handed to the data bunch; everything this notebook generates is kept
# under the hidden `.output` directory.
PATH = Path("../sample_data/imdb_movie_reviews")
DATA_PATH = PATH/'data'
LABEL_PATH = PATH/'label'
OUT_PATH = PATH/'.output'
MODEL_PATH = OUT_PATH/'model'
LOG_PATH = OUT_PATH/'logs/'

# parents=True: create any missing ancestors too, so each mkdir succeeds even
# when the parent directory does not exist yet (plain mkdir would raise
# FileNotFoundError). exist_ok=True keeps the cell safe to re-run.
OUT_PATH.mkdir(parents=True, exist_ok=True)
MODEL_PATH.mkdir(parents=True, exist_ok=True)
LOG_PATH.mkdir(parents=True, exist_ok=True)

In [5]:
# Run configuration. Stored in a Box so values can be read as attributes
# (args.batch_size) or as keys (args["run_text"]).
# NOTE(review): "ibdm_reviews" looks like a typo for "imdb_reviews"; it is only
# used as a label in the log file name, so it is left unchanged here.
args = Box(dict(
    run_text="ibdm_reviews",
    max_seq_length=512,
    batch_size=8,
    learning_rate=5e-5,
    num_train_epochs=6,
    fp16=False,
    model_name='albert-base-v2',
    model_type='albert',
))

# Use CUDA when at least one GPU is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.device_count() else 'cpu')

# Multi-GPU mode is enabled only when more than one CUDA device is present.
args.multi_gpu = torch.cuda.device_count() > 1

In [6]:
# One log file per run, named log-<run start timestamp>-<run label>.txt and
# placed inside LOG_PATH.
logfile = str(
    LOG_PATH / 'log-{}-{}.txt'.format(run_start_time, args["run_text"])
)

# Route INFO-and-above records to both the run's log file and stdout (the
# notebook output area).
# Note: logging.basicConfig is a no-op when the root logger already has
# handlers, so re-running this cell in the same kernel does not add more.
logging.basicConfig(
    handlers=[
        logging.FileHandler(logfile),
        logging.StreamHandler(sys.stdout),
    ],
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO,
)

# Root logger; it is passed to the learner further down.
logger = logging.getLogger()

In [7]:
# Display the torch device chosen in the configuration cell (CUDA if any GPU
# was detected, otherwise CPU).
device

device(type='cpu')

In [8]:
# Build the data bunch from the sample train/validation CSVs in DATA_PATH,
# with labels read from LABEL_PATH. Single-label (multi_label=False)
# classification.
# args.model_name is passed as the third positional argument — presumably the
# tokenizer to load; confirm against the fast-bert BertDataBunch signature.
databunch = BertDataBunch(
    DATA_PATH,
    LABEL_PATH,
    args.model_name,
    train_file="train_sample.csv",
    val_file="val_sample.csv",
    batch_size_per_gpu=args.batch_size,
    max_seq_length=args.max_seq_length,
    multi_gpu=args.multi_gpu,
    multi_label=False,
    model_type=args.model_type,
)

11/28/2019 23:21:02 - INFO - transformers.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v2-spiece.model from cache at /Users/kaushaltrivedi/.cache/torch/transformers/dd1588b85b6fdce1320e224d29ad062e97588e17326b9d05a0b29ee84b8f5f93.c81d4deb77aec08ce575b7a39a989a79dd54f321bfb82c2b54dd35f52f8182cf
11/28/2019 23:21:02 - INFO - root -   Loading features from cached file ../sample_data/imdb_movie_reviews/data/cache/cached_albert_train_multi_class_512_train_sample.csv
11/28/2019 23:21:02 - INFO - root -   Loading features from cached file ../sample_data/imdb_movie_reviews/data/cache/cached_albert_dev_multi_class_512_val_sample.csv


In [None]:
# Metrics handed to the learner: each entry pairs a display name with the
# function that computes it.
metrics = [dict(name="accuracy", function=accuracy)]

In [None]:
# Create the learner from the pretrained checkpoint named in args, wired to the
# data bunch, metrics, device and logger set up above. Output is directed to
# OUT_PATH.
learner = BertLearner.from_pretrained_model(
    databunch,
    args.model_name,
    metrics=metrics,
    device=device,
    multi_gpu=args.multi_gpu,
    is_fp16=args.fp16,
    multi_label=False,
    logging_steps=0,
    output_dir=OUT_PATH,
    logger=logger,
)

In [None]:
# Train with the epoch count and learning rate from the run configuration
# (previously the epoch count was a hard-coded 2 that silently ignored
# args.num_train_epochs). validate=True runs validation during training.
learner.fit(args.num_train_epochs, args.learning_rate, validate=True)

In [None]:
# Persist the fine-tuned model. Presumably this writes to OUT_PATH/'model_out'
# (the directory the predictor cell loads from) — verify against fast-bert.
learner.save_model()

In [11]:
from fast_bert.prediction import BertClassificationPredictor

In [12]:
# Load the saved model from OUT_PATH/'model_out' for single-label inference,
# using the same label directory and model type as training.
predictor = BertClassificationPredictor(
    OUT_PATH/'model_out',
    LABEL_PATH,
    multi_label=False,
    model_type=args.model_type,
)

11/28/2019 23:26:54 - INFO - transformers.tokenization_utils -   Model name '../sample_data/imdb_movie_reviews/.output/model_out' not found in model shortcut name list (albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1, albert-base-v2, albert-large-v2, albert-xlarge-v2, albert-xxlarge-v2). Assuming '../sample_data/imdb_movie_reviews/.output/model_out' is a path or url to a directory containing tokenizer files.
11/28/2019 23:26:54 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/spiece.model
11/28/2019 23:26:54 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/added_tokens.json
11/28/2019 23:26:54 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/model_out/special_tokens_map.json
11/28/2019 23:26:54 - INFO - transformers.tokenization_utils -   loading file ../sample_data/imdb_movie_reviews/.output/mo

In [13]:
# Smoke-test the predictor on two short texts; the cell output is the list of
# (label, score) pairs returned for each input.
predictor.predict_batch(
    [
        "i hate you",
        "i love this move",
    ]
)

11/28/2019 23:26:57 - INFO - root -   Writing example 0 of 2


[[('1', 0.5922839045524597), ('0', 0.40771612524986267)],
 [('1', 0.5783963799476624), ('0', 0.42160359025001526)]]