In [2]:
# Set up the task-specific model and the (non-TPU) Estimator run configuration.
import modeling
import optimization
import run_classifier
import tokenization
import os
import tensorflow as tf
import datetime
import json
import pprint
import random
import string
import sys

# Directory that holds the pre-trained BERT files read below (vocab.txt,
# bert_config.json, bert_model.ckpt). Set the BERT_PRETRAINED_DIR environment
# variable to point somewhere else; the default is the original local path.
# (A larger model directory, .../Desktop/uncased_L-24_H-1024_A-16, was used
# here previously.)
BERT_PRETRAINED_DIR = os.environ.get(
    'BERT_PRETRAINED_DIR', '/Users/vijay/Downloads/uncased_L-12_H-768_A-12')

# Model name. It defaults to the name of the checkpoint directory so the two
# values cannot drift apart; override with the BERT_MODEL environment variable
# if the directory has been renamed.
BERT_MODEL = os.environ.get(
    'BERT_MODEL', os.path.basename(os.path.normpath(BERT_PRETRAINED_DIR)))

# Task data location (override with TASK_DATA_DIR) and the task key used to
# pick a processor.
TASK_DATA_DIR = os.environ.get('TASK_DATA_DIR', '/Users/vijay/bert/imdb_dataset')
TASK = 'imdb'

# Model Hyper Parameters
TRAIN_BATCH_SIZE = 32
EVAL_BATCH_SIZE = 8
LEARNING_RATE = 2e-5
NUM_TRAIN_EPOCHS = 3.0
WARMUP_PROPORTION = 0.1  # fraction of the training steps used as warm-up steps
MAX_SEQ_LENGTH = 128

# Model configs
SAVE_CHECKPOINTS_STEPS = 1000
ITERATIONS_PER_LOOP = 1000
NUM_TPU_CORES = 8

# Files expected inside BERT_PRETRAINED_DIR.
VOCAB_FILE = os.path.join(BERT_PRETRAINED_DIR, 'vocab.txt')
CONFIG_FILE = os.path.join(BERT_PRETRAINED_DIR, 'bert_config.json')
INIT_CHECKPOINT = os.path.join(BERT_PRETRAINED_DIR, 'bert_model.ckpt')

# Lower-casing is enabled exactly when the model name starts with 'uncased'.
DO_LOWER_CASE = BERT_MODEL.startswith('uncased')

# Lookup table from lower-cased task name to the processor class for it.
processors = {
    "cola": run_classifier.ColaProcessor,
    "mnli": run_classifier.MnliProcessor,
    "mrpc": run_classifier.MrpcProcessor,
    "imdb": run_classifier.ImdbProcessor,
}

# Instantiate the processor selected by TASK and ask it for its label set.
processor = processors[TASK.lower()]()
label_list = processor.get_labels()

# Tokenizer built from the pre-trained model's vocabulary file.
tokenizer = tokenization.FullTokenizer(
    vocab_file=VOCAB_FILE,
    do_lower_case=DO_LOWER_CASE,
)

# NOTE(review): the examples used for training are loaded with
# get_dev_examples(), i.e. from the processor's dev split -- confirm this is
# intentional.
train_examples = processor.get_dev_examples(TASK_DATA_DIR)

# Run configuration for a plain tf.estimator.Estimator. The TPU variant
# (TPUClusterResolver + tf.contrib.tpu.RunConfig) is not used in this notebook.
# NOTE(review): SAVE_CHECKPOINTS_STEPS is not passed here, so the logged config
# shows time-based checkpointing (_save_checkpoints_secs=600) -- confirm that
# this is intended.
_run_config_options = {
    'model_dir': '/Users/vijay/bert/model_checkpoints',
    'tf_random_seed': None,
    'save_summary_steps': 100,
    'session_config': None,
    'keep_checkpoint_max': 5,
    'keep_checkpoint_every_n_hours': 10000,
    'log_step_count_steps': 100,
    'train_distribute': None,
    'device_fn': None,
    'protocol': None,
    'eval_distribute': None,
    'experimental_distribute': None,
}
run_config = tf.estimator.RunConfig(**_run_config_options)

# Total optimisation steps: (examples / batch size) * epochs, truncated to int.
_steps_per_epoch = len(train_examples) / TRAIN_BATCH_SIZE
num_train_steps = int(_steps_per_epoch * NUM_TRAIN_EPOCHS)

# Warm-up steps are a fixed proportion of the total, also truncated to int.
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

# Build the Estimator model function from the pre-trained BERT configuration
# and checkpoint, using the step counts computed above for the schedule.
#
# use_one_hot_embeddings is False because this run does not use a TPU
# (use_tpu=False). Upstream BERT documents one-hot embedding lookups as the
# faster path on TPU only, and slower on CPU/GPU.
# NOTE(review): model_fn_builder_cpu is a project-local builder; it is assumed
# to forward this flag the same way upstream model_fn_builder does -- confirm.
model_fn = run_classifier.model_fn_builder_cpu(
    bert_config=modeling.BertConfig.from_json_file(CONFIG_FILE),
    num_labels=len(label_list),
    init_checkpoint=INIT_CHECKPOINT,
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    use_tpu=False,
    use_one_hot_embeddings=False)

# Plain Estimator; the tf.contrib.tpu.TPUEstimator variant is not used in this
# notebook. The training batch size is handed over through `params`
# (presumably read by the input/model functions -- verify in run_classifier).
_estimator_params = {'batch_size': TRAIN_BATCH_SIZE}
estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    config=run_config,
    params=_estimator_params,
)

INFO:tensorflow:Using config: {'_model_dir': '/Users/vijay/bert/model_checkpoints', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x123cc0d30>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Convert the loaded examples into model features of length MAX_SEQ_LENGTH.
train_features = run_classifier.convert_examples_to_features(
    train_examples,
    label_list,
    MAX_SEQ_LENGTH,
    tokenizer,
)

# Report the run parameters before training starts.
print('***** Started training at {} *****'.format(datetime.datetime.now()))
print(
    '  Num examples = {}'.format(len(train_examples)),
    '  Batch size = {}'.format(TRAIN_BATCH_SIZE),
    sep='\n',
)
tf.logging.info("  Num steps = %d", num_train_steps)

# Input function over the in-memory features, configured for training.
train_input_fn = run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    is_training=True,
    drop_remainder=True,
)

# Train until the global step reaches num_train_steps.
estimator.train(train_input_fn, max_steps=num_train_steps)
print('***** Finished training at {} *****'.format(datetime.datetime.now()))

INFO:tensorflow:Writing example 0 of 5000
INFO:tensorflow:*** Example ***
INFO:tensorflow:guid: dev-1
INFO:tensorflow:tokens: [CLS] " i just watched it . a couple of laughs , but nothing to write home about . jason lee looked like he was having fun . the ( long ) dvd gag reel consists almost solely of him having fits of un ##con ##tro ##lla ##ble laughter . selma blair seemed to be punching a time clock , but then again , her character was supposed to be a stick in the mud , so \ well done \ " " i guess ? jim bro ##lin was surprisingly funny . ( being married to ba ##bs can ' t be a picnic . ) the soundtrack was hip , and eclectic . larry miller , who played julia stil ##es father ( hilarious ##ly [SEP]
INFO:tensorflow:input_ids: 101 1000 1045 2074 3427 2009 1012 1037 3232 1997 11680 1010 2021 2498 2000 4339 2188 2055 1012 4463 3389 2246 2066 2002 2001 2383 4569 1012 1996 1006 2146 1007 4966 18201 15934 3774 2471 9578 1997 2032 2383 16142 1997 4895 8663 13181 4571 3468 7239 1012 28112 

INFO:tensorflow:input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
INFO:tensorflow:segment_ids: 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
INFO:tensorflow:label: 0 (id = 0)
***** Started training at 2018-11-14 20:45:35.970617 *****
  Num examples = 5000
  Batch size = 32
INFO:tensorflow:  Num steps = 468
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:*** Features ***
INFO:tensorflow:  name = input_ids, shape = (32, 128)
INFO:tensorflow:  name = input_mask, shape = (32, 128)
INFO:tensorflow:  name = label_ids, shape = (32,)
INFO:tensorflow:  name = segment_ids, 

In [2]:
import pandas

# Read the tab-separated file labeledTrainData.tsv into a DataFrame.
df = pandas.read_csv(
    '/Users/vijay/Downloads/labeledTrainData.tsv',
    sep='\t',
)

ModuleNotFoundError: No module named 'pandas'