In [0]:
# Mount Google Drive at /content/drive; the dataset files read later in this
# notebook live under "/content/drive/My Drive".
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install bert-tensorflow

Collecting bert-tensorflow
[?25l  Downloading https://files.pythonhosted.org/packages/a6/66/7eb4e8b6ea35b7cc54c322c816f976167a43019750279a8473d355800a93/bert_tensorflow-1.0.1-py2.py3-none-any.whl (67kB)
[K     |████▉                           | 10kB 21.7MB/s eta 0:00:01[K     |█████████▊                      | 20kB 1.5MB/s eta 0:00:01[K     |██████████████▋                 | 30kB 1.8MB/s eta 0:00:01[K     |███████████████████▍            | 40kB 1.6MB/s eta 0:00:01[K     |████████████████████████▎       | 51kB 1.8MB/s eta 0:00:01[K     |█████████████████████████████▏  | 61kB 2.1MB/s eta 0:00:01[K     |████████████████████████████████| 71kB 1.9MB/s 
Installing collected packages: bert-tensorflow
Successfully installed bert-tensorflow-1.0.1


In [0]:
%tensorflow_version 1.x
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import re
import nltk
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime

In [0]:
# Set the output directory for saving model file
# Optionally, set a GCP bucket location

OUTPUT_DIR = 'OUTPUT_DIR_NAME'#@param {type:"string"}
#@markdown Whether or not to clear/delete the directory and create a new one
DO_DELETE = True #@param {type:"boolean"}
#@markdown Set USE_BUCKET and BUCKET if you want to (optionally) store model output on GCP bucket.
USE_BUCKET = False #@param {type:"boolean"}
BUCKET = 'BUCKET_NAME' #@param {type:"string"}

if USE_BUCKET:
  OUTPUT_DIR = 'gs://{}/{}'.format(BUCKET, OUTPUT_DIR)
  from google.colab import auth
  auth.authenticate_user()

# WARNING: with DO_DELETE enabled, everything already under OUTPUT_DIR is removed.
# A missing directory is the only case that is safe to skip, so test for it
# explicitly instead of swallowing every exception; any other failure
# (permissions, bucket auth, ...) now surfaces here rather than during training.
if DO_DELETE and tf.gfile.Exists(OUTPUT_DIR):
  tf.gfile.DeleteRecursively(OUTPUT_DIR)
tf.gfile.MakeDirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))


***** Model output directory: OUTPUT_DIR_NAME *****


In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

In [0]:
#Importing datasets#
DATA_DIR = "/content/drive/My Drive/Emoji_prediction"


def read_lines(filename):
    """Return all lines of the UTF-8 text file `filename` located in DATA_DIR.

    Lines are returned exactly as `readlines()` yields them, i.e. each one
    keeps its trailing newline when the file has one.
    """
    # Read-only mode is enough here, and the context manager closes the handle
    # (the previous version opened six files with "r+" and never closed them).
    with open("{}/{}".format(DATA_DIR, filename), "r", encoding="utf-8") as handle:
        return handle.readlines()


#Train set
train_text = read_lines("us_train.text")
train_labels = read_lines("us_train.labels")
#Development set
dev_text = read_lines("us_dev.text")
dev_labels = read_lines("us_dev.labels")
#Test set
test_text = read_lines("us_test.text")
test_labels = read_lines("us_test.labels")

**Data Preprocessing**

In [0]:
#Download the NLTK corpora used for cleaning, then build the lemmatizer and the English stopword set
nltk.download('stopwords')
nltk.download('wordnet')
lemmatizer = nltk.stem.WordNetLemmatizer()
stopwords = set(nltk.corpus.stopwords.words("english"))

#Cleans one text: strips markup, URLs and non-letters, lowercases, drops stopwords and lemmatizes the rest
def clean_reviews(string):
    """Return a normalised copy of `string`.

    Steps, in order: replace `<...>` tags with a space, replace http(s) URLs
    with a space, replace every character that is not an ASCII letter or a
    space with a space, lowercase, split on whitespace, skip words found in
    the module-level `stopwords` set and lemmatize the remaining words with
    the module-level `lemmatizer`.

    Every kept word is followed by a single space, so a non-empty result
    always ends with a space; an input with no kept words yields "".
    """
    text = re.sub(r'<.*?>', " ", string)
    text = re.sub(r"https?://[A-Za-z0-9./]+", " ", text)
    text = re.sub(r"[^a-zA-Z ]", " ", text)

    kept_words = (
        lemmatizer.lemmatize(token)
        for token in text.lower().split()
        if token not in stopwords
    )
    return "".join(lemma + " " for lemma in kept_words)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [0]:
#Train, Dev, Test sets after data preprocessing#
# Text lines go through clean_reviews; label lines only lose their newline characters.
train_processed_text = [clean_reviews(line) for line in train_text]
dev_processed_text = [clean_reviews(line) for line in dev_text]
test_processed_text = [clean_reviews(line) for line in test_text]

train_processed_labels = [raw_label.replace("\n", "") for raw_label in train_labels]
dev_processed_labels = [raw_label.replace("\n", "") for raw_label in dev_labels]
test_processed_labels = [raw_label.replace("\n", "") for raw_label in test_labels]

In [0]:
# One DataFrame per split: cleaned text in 'sentence', the matching label string in 'label'.
train_data = pd.DataFrame(dict(sentence=train_processed_text, label=train_processed_labels))
dev_data = pd.DataFrame(dict(sentence=dev_processed_text, label=dev_processed_labels))
test_data = pd.DataFrame(dict(sentence=test_processed_text, label=test_processed_labels))

In [0]:
# Column names of the split DataFrames and the 20 class labels ('0' .. '19') as strings.
DATA_COLUMN = 'sentence'
LABEL_COLUMN = 'label'
label_list = [str(label_id) for label_id in range(20)]

In [0]:
# Peek at a few preprocessed rows (positions 41-49).
print(train_data.iloc[41:50])

                                             sentence label
41        people work enjoying sunday watching first      0
42  another big happy th birthday partner crime lo...     0
43  day done thank u insta likers rollicking day f...     9
44  snakeshred recording guitar new snakeskin reco...    19
45  sister best friend hamessisters andlewisandclark     13
46  absolute favorite place la jolla cove seal beach      3
47  live awfully big adventure seattle spaceneedle...     8
48  finally met snow white favorite little snowwhi...     0
49  user loved coverage ti always need get sounder...    16


In [0]:
# NOTE(review): create_tokenizer_from_hub_module() is defined in a cell further
# down this notebook. On a fresh kernel that cell must be run before this one,
# otherwise this line raises NameError.
tokenizer = create_tokenizer_from_hub_module()

In [0]:
def build_input_examples(frame):
  """Turn each row of `frame` into a `bert.run_classifier.InputExample`.

  Args:
    frame: DataFrame holding the text in DATA_COLUMN and the label in
      LABEL_COLUMN.

  Returns:
    The result of `frame.apply(..., axis=1)`: one InputExample per row.
  """
  return frame.apply(
      lambda row: bert.run_classifier.InputExample(
          guid=None,  # Globally unique ID for bookkeeping, unused in this example
          text_a=row[DATA_COLUMN],
          text_b=None,
          # The label is passed through unchanged for every split. The train
          # split used to call `.tokenization.convert_to_unicode()` on the
          # label string, which raises AttributeError.
          label=row[LABEL_COLUMN]),
      axis=1)


train_InputExamples = build_input_examples(train_data)
dev_InputExamples = build_input_examples(dev_data)
test_InputExamples = build_input_examples(test_data)

In [0]:
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of BERT_MODEL_HUB.

  The hub module's "tokenization_info" signature is evaluated in a throwaway
  graph and session, and the two fetched values are handed to
  `bert.tokenization.FullTokenizer`.
  """
  with tf.Graph().as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as sess:
      vocab_path, lower_case = sess.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_path, do_lower_case=lower_case)



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore








In [0]:
# We'll set sequences to be at most 128 tokens long.
MAX_SEQ_LENGTH = 128
# Convert the train, dev and test examples (in that order) to InputFeatures that BERT understands.
train_features, dev_features, test_features = (
    bert.run_classifier.convert_examples_to_features(
        examples, label_list, MAX_SEQ_LENGTH, tokenizer)
    for examples in (train_InputExamples, dev_InputExamples, test_InputExamples)
)

In [0]:
# Show the first 2000 converted feature objects (prints their default reprs).
print(train_features[:2000])

[<bert.run_classifier.InputFeatures object at 0x7ff463bdd710>, <bert.run_classifier.InputFeatures object at 0x7ff40456de10>, <bert.run_classifier.InputFeatures object at 0x7ff461574f98>, <bert.run_classifier.InputFeatures object at 0x7ff461574e10>, <bert.run_classifier.InputFeatures object at 0x7ff474074a20>, <bert.run_classifier.InputFeatures object at 0x7ff47c98ef60>, <bert.run_classifier.InputFeatures object at 0x7ff463bddb38>, <bert.run_classifier.InputFeatures object at 0x7ff40456deb8>, <bert.run_classifier.InputFeatures object at 0x7ff463bddfd0>, <bert.run_classifier.InputFeatures object at 0x7ff463bdda20>, <bert.run_classifier.InputFeatures object at 0x7ff40456dfd0>, <bert.run_classifier.InputFeatures object at 0x7ff40456d5f8>, <bert.run_classifier.InputFeatures object at 0x7ff40456df60>, <bert.run_classifier.InputFeatures object at 0x7ff40456dd30>, <bert.run_classifier.InputFeatures object at 0x7ff40456dda0>, <bert.run_classifier.InputFeatures object at 0x7ff462eaf2e8>, <bert.r

**Creating a Model**

In [0]:
def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels, is_training=None):
  """Creates a classification model on top of the BERT hub module.

  Args:
    is_predicting: when true, no loss is built and only predictions are
      returned.
    input_ids, input_mask, segment_ids: tensors fed to the hub module's
      "tokens" signature.
    labels: integer class ids, one-hot encoded with depth `num_labels` for the
      loss. Unused when `is_predicting` is true.
    num_labels: number of output classes.
    is_training: whether to apply dropout to the pooled output. Defaults to
      `not is_predicting`, so existing callers keep dropout in train/eval but
      no longer get randomly perturbed outputs when predicting. Pass False
      explicitly to also disable dropout during evaluation.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is true, otherwise
    `(loss, predicted_labels, log_probs)`.
  """
  if is_training is None:
    is_training = not is_predicting

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # Use "pooled_output" for classification tasks on an entire sentence.
  # Use "sequence_outputs" for token-level output.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Classification head trained on top of BERT: one weight row per label.
  # Variable names are kept as-is so existing checkpoints still restore.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting; it must not run at inference time,
    # where it would make the outputs non-deterministic.
    if is_training:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the last axis already drops the class dimension. The former
    # tf.squeeze() on top of it also removed the batch dimension whenever a
    # batch held a single example.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the (log) probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute loss between predicted and actual label
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)


In [0]:
# model_fn_builder actually creates our model function
# using the passed parameters for num_labels, learning_rate, etc.
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure usable with `tf.estimator.Estimator`.

  Args:
    num_labels: number of output classes, forwarded to `create_model`.
    learning_rate: forwarded to `bert.optimization.create_optimizer`.
    num_train_steps: total number of training steps, forwarded to the optimizer.
    num_warmup_steps: number of warmup steps, forwarded to the optimizer.
  """
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the EstimatorSpec for TRAIN, EVAL or PREDICT.

    All inputs, including the label ids, are read from `features`; the
    `labels` and `params` arguments are not used.
    """

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    label_ids = features["label_ids"]

    if mode == tf.estimator.ModeKeys.PREDICT:
      (predicted_labels, log_probs) = create_model(
        True, input_ids, input_mask, segment_ids, label_ids, num_labels)

      predictions = {
          # NOTE: despite the key name these are log-probabilities (log-softmax
          # output); the key is kept so existing consumers keep working.
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL share the loss graph.
    (loss, predicted_labels, log_probs) = create_model(
      False, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # Build the optimizer (and with it the gradient ops) only when training.
      # Previously this also ran in EVAL mode, where the train op is never used.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: report accuracy of the argmax prediction against the label ids.
    eval_metrics = {
        "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels),
    }
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn

In [0]:
# Training hyperparameters and checkpoint/summary settings.
# These hyperparameters are copied from this colab notebook (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
LEARNING_RATE = 2e-5
BATCH_SIZE = 32
NUM_TRAIN_EPOCHS = 3.0
# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs: how often (in steps) summaries and checkpoints are written
SAVE_SUMMARY_STEPS = 100
SAVE_CHECKPOINTS_STEPS = 500

In [0]:
# Total training steps = (examples / batch size) * epochs, truncated to an int;
# the warmup covers the WARMUP_PROPORTION share of those steps.
num_train_steps = int((len(train_features) / BATCH_SIZE) * NUM_TRAIN_EPOCHS)
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

In [0]:
# Specify the output directory and how often (in steps) summaries and checkpoints are saved
run_config = tf.estimator.RunConfig(
    save_checkpoints_steps=SAVE_CHECKPOINTS_STEPS,
    save_summary_steps=SAVE_SUMMARY_STEPS,
    model_dir=OUTPUT_DIR,
)

In [0]:
# Build the model function for this label set and schedule, then wrap it in an Estimator.
model_fn = model_fn_builder(
    num_labels=len(label_list),
    learning_rate=LEARNING_RATE,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
)

estimator = tf.estimator.Estimator(
    params={"batch_size": BATCH_SIZE},
    config=run_config,
    model_fn=model_fn,
)

INFO:tensorflow:Using config: {'_model_dir': 'OUTPUT_DIR_NAME', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f505bb0bbe0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


INFO:tensorflow:Using config: {'_model_dir': 'OUTPUT_DIR_NAME', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 500, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f505bb0bbe0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [0]:
# Create an input function for training. drop_remainder is left False here;
# it would be set to True when using TPUs.
train_input_fn = bert.run_classifier.input_fn_builder(
    features=train_features,
    seq_length=MAX_SEQ_LENGTH,
    drop_remainder=False,
    is_training=True,
)

In [0]:
# Train up to num_train_steps and report the wall-clock duration.
print('Beginning Training!')
current_time = datetime.now()
estimator.train(max_steps=num_train_steps, input_fn=train_input_fn)
print("Training took time ", datetime.now() - current_time)

INFO:tensorflow:Saving checkpoints for 46500 into OUTPUT_DIR_NAME/model.ckpt.


INFO:tensorflow:Saving checkpoints for 46500 into OUTPUT_DIR_NAME/model.ckpt.


INFO:tensorflow:global_step/sec: 1.58668


INFO:tensorflow:global_step/sec: 1.58668


INFO:tensorflow:loss = 1.5640657, step = 46500 (63.025 sec)


INFO:tensorflow:loss = 1.5640657, step = 46500 (63.025 sec)


INFO:tensorflow:global_step/sec: 2.0958


INFO:tensorflow:global_step/sec: 2.0958


INFO:tensorflow:loss = 2.1084204, step = 46600 (47.714 sec)


INFO:tensorflow:loss = 2.1084204, step = 46600 (47.714 sec)


INFO:tensorflow:global_step/sec: 2.09658


INFO:tensorflow:global_step/sec: 2.09658


INFO:tensorflow:loss = 1.3009549, step = 46700 (47.697 sec)


INFO:tensorflow:loss = 1.3009549, step = 46700 (47.697 sec)


INFO:tensorflow:global_step/sec: 2.09588


INFO:tensorflow:global_step/sec: 2.09588


INFO:tensorflow:loss = 1.335799, step = 46800 (47.713 sec)


INFO:tensorflow:loss = 1.335799, step = 46800 (47.713 sec)


INFO:tensorflow:Saving checkpoints for 46875 into OUTPUT_DIR_NAME/model.ckpt.


INFO:tensorflow:Saving checkpoints for 46875 into OUTPUT_DIR_NAME/model.ckpt.


INFO:tensorflow:Loss for final step: 1.7953342.


INFO:tensorflow:Loss for final step: 1.7953342.


Training took time  6:48:34.661615


In [0]:
# Input function over the test features for evaluation (is_training=False).
test_input_fn = bert.run_classifier.input_fn_builder(
    features=test_features,
    seq_length=MAX_SEQ_LENGTH,
    drop_remainder=False,
    is_training=False,
)

In [0]:
# Evaluate on the test input function; steps=None is passed so no step limit is set here.
estimator.evaluate(steps=None, input_fn=test_input_fn)

INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Calling model_fn.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Done calling model_fn.


INFO:tensorflow:Starting evaluation at 2020-04-06T16:15:42Z


INFO:tensorflow:Starting evaluation at 2020-04-06T16:15:42Z


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Restoring parameters from OUTPUT_DIR_NAME/model.ckpt-46875


INFO:tensorflow:Restoring parameters from OUTPUT_DIR_NAME/model.ckpt-46875


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


InvalidArgumentError: ignored