#### Reference code
1. https://github.com/cbaziotis/ekphrasis
2. https://github.com/google-research/bert/blob/master/run_classifier.py

In [0]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
# Switch to the project folder; the relative dataset paths used later are
# presumably resolved against this directory (confirm the folder layout).
%cd /content/drive/My Drive/5922NeuralNetworkAndDeepLearning/FinalProject

In [0]:
pip install emoji --upgrade  # install emoji package

In [0]:
pip install ekphrasis        # install package to process emoji

In [0]:
%tensorflow_version 1.x
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from datetime import datetime
from sklearn.model_selection import train_test_split
import os
import emoji
import numpy as np


print("tensorflow version : ", tf.__version__)
print("tensorflow_hub version : ", hub.__version__)

In [0]:
!pip install bert-tensorflow

In [0]:
import bert
from bert import run_classifier
from bert import optimization
from bert import tokenization

### Loading and processing the data

In [0]:
# EI-oc English split files. Only the "anger" file of each split is active;
# the remaining emotions are kept as comments so they can be re-enabled.
ei_oc_train_file = [
    '2-EI-oc-En-train/EI-oc-En-anger-train.txt',
    # '2-EI-oc-En-train/EI-oc-En-fear-train.txt',
    # '2-EI-oc-En-train/EI-oc-En-joy-train.txt',
    # '2-EI-oc-En-train/EI-oc-En-sadness-train.txt',
]
ei_oc_dev_file = [
    '2-EI-oc-En-dev/EI-oc-En-anger-dev.txt',
    # '2-EI-oc-En-dev/EI-oc-En-fear-dev.txt',
    # '2-EI-oc-En-dev/EI-oc-En-joy-dev.txt',
    # '2-EI-oc-En-dev/EI-oc-En-sadness-dev.txt',
]
ei_oc_test_file = [
    '2-EI-oc-En-test/EI-oc-En-anger-test.txt',
    # '2-EI-oc-En-test/EI-oc-En-fear-test.txt',
    # '2-EI-oc-En-test/EI-oc-En-joy-test.txt',
    # '2-EI-oc-En-test/EI-oc-En-sadness-test.txt',
]


In [0]:
class read_file():
  """Loads one dataset's tab-separated train, dev and test files."""

  def __init__(self, train_file, dev_file, test_file):
    # Paths (or anything pandas.read_csv accepts) of the three splits.
    self.train_file, self.dev_file, self.test_file = train_file, dev_file, test_file

  def read_data(self):
    """Return the (train, dev, test) DataFrames parsed with a tab separator."""
    split_paths = (self.train_file, self.dev_file, self.test_file)
    train_df, dev_df, test_df = (pd.read_csv(path, sep='\t') for path in split_paths)
    return train_df, dev_df, test_df

  def array_format_data(self):
    """Return the splits as numpy arrays.

    The column layout is taken from the training file: every column except
    the last is a feature column, the last column is the target. The test
    split contributes features only.

    Returns:
      (train_features, dev_features, test_features, train_targets, dev_targets)
    """
    train_df, dev_df, test_df = self.read_data()
    columns = train_df.columns
    feature_cols, target_col = columns[:-1], columns[-1]

    feature_arrays = [frame[feature_cols].values for frame in (train_df, dev_df, test_df)]
    target_arrays = [frame[target_col].values for frame in (train_df, dev_df)]
    return (*feature_arrays, *target_arrays)

In [0]:
# Each entry of `files` is one dataset given as [train_list, dev_list, test_list].
ei_oc_files = [ei_oc_train_file, ei_oc_dev_file, ei_oc_test_file]
files = [ei_oc_files]
files_len = len(files)

# Containers keyed by the dataset's position in `files`.
train_data, dev_data, test_data = {}, {}, {}
train_data_array, dev_data_array, test_data_array = {}, {}, {}
train_target_array, dev_target_array = {}, {}
train_target_array_first_col, dev_target_array_first_col = {}, {}

for i, (train_paths, dev_paths, test_paths) in enumerate(files):
  # Only the first file of every split list is loaded.
  read_file_obj = read_file(train_paths[0], dev_paths[0], test_paths[0])
  train_data[i], dev_data[i], test_data[i] = read_file_obj.read_data()
  (train_data_array[i], dev_data_array[i], test_data_array[i],
   train_target_array[i], dev_target_array[i]) = read_file_obj.array_format_data()

  # Each raw target is split on ":" and the leading part is parsed as an int.
  train_target_num = [int(raw.split(":")[0]) for raw in train_target_array[i]]
  dev_target_num = [int(raw.split(":")[0]) for raw in dev_target_array[i]]
  train_target_array_first_col[i] = train_target_num
  dev_target_array_first_col[i] = dev_target_num

In [0]:
class process_data():
  """Cleans the tweet text of every split and pairs it with its integer score.

  The three constructor arguments are dicts keyed by dataset position
  (0, 1, ...). Each value is a 2-D array whose column 1 is treated as the
  tweet text. The scores are read from the module-level dicts
  `train_target_array_first_col` and `dev_target_array_first_col`.
  """

  def __init__(self, train_data_array, dev_data_array, test_data_array):
    self.train_data_array = train_data_array
    self.dev_data_array = dev_data_array
    self.test_data_array = test_data_array
    # Created on first use by _get_text_processor and reused afterwards.
    self._text_processor = None

  def _get_text_processor(self):
    """Return the ekphrasis TextPreProcessor, building it only once.

    Building the processor is the costly part of preprocessing, so it is
    cached on the instance instead of being rebuilt for every split.
    """
    cached = getattr(self, '_text_processor', None)
    if cached is not None:
      return cached

    from ekphrasis.classes.preprocessor import TextPreProcessor
    from ekphrasis.classes.tokenizer import SocialTokenizer
    from ekphrasis.dicts.emoticons import emoticons

    self._text_processor = TextPreProcessor(
        # terms that will be normalized
        normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
            'time', 'date', 'number'],
        # terms that will be annotated
        annotate={"hashtag", "allcaps", "elongated", "repeated",
            'emphasis', 'censored'},
        fix_html=True,  # fix HTML tokens

        # corpus from which the word statistics are going to be used
        # for word segmentation
        segmenter="twitter",

        # corpus from which the word statistics are going to be used
        # for spell correction
        corrector="twitter",

        unpack_hashtags=True,  # perform word segmentation on hashtags
        unpack_contractions=True,  # unpack contractions (can't -> can not)
        spell_correct_elong=False,  # spell correction for elongated words

        # select a tokenizer
        tokenizer=SocialTokenizer(lowercase=True).tokenize,

        # list of dictionaries, for replacing tokens extracted from the text
        # with other expressions; more than one dictionary can be passed
        dicts=[emoticons]
    )
    return self._text_processor

  def preprocessing_text(self, file_name):
    """Clean a sequence of tweets.

    Args:
      file_name: iterable of tweet strings (despite the name, not a path).

    Returns:
      numpy array with one cleaned, space-joined token string per tweet.
    """
    text_processor = self._get_text_processor()
    # Emoji are replaced by their text names before ekphrasis tokenizes.
    result = [
        " ".join(text_processor.pre_process_doc(emoji.demojize(tweet)))
        for tweet in file_name
    ]
    return np.array(result)

  def transfor_tweet_data(self):
    """Return three dicts (train, dev, test) of cleaned tweets per dataset."""
    preprocessed_tweet_train = dict()
    preprocessed_tweet_dev = dict()
    preprocessed_tweet_test = dict()
    # Sized from the instance's own data rather than a module-level counter.
    for i in range(len(self.train_data_array)):
      # Column 1 of each feature array is what gets preprocessed as tweet text.
      preprocessed_tweet_train[i] = self.preprocessing_text(self.train_data_array[i][:,1])
      preprocessed_tweet_dev[i] = self.preprocessing_text(self.dev_data_array[i][:,1])
      preprocessed_tweet_test[i] = self.preprocessing_text(self.test_data_array[i][:,1])
    return preprocessed_tweet_train, preprocessed_tweet_dev, preprocessed_tweet_test

  def transfor_array_to_dataFrame(self):
    """Wrap cleaned tweets and scores in single-column DataFrames.

    Returns:
      (train_X, dev_X, test_X, train_y, dev_y): dicts keyed by dataset
      position. The X frames have one 'Tweet' column, the y frames one
      'Score' column taken from the module-level
      `train_target_array_first_col` / `dev_target_array_first_col`.
    """
    train_X = dict()
    dev_X = dict()
    test_X = dict()

    train_y = dict()
    dev_y = dict()
    preprocessed_tweet_train, preprocessed_tweet_dev, preprocessed_tweet_test = self.transfor_tweet_data()
    for i in range(len(preprocessed_tweet_train)):
      # Naming the column at construction also works for empty inputs.
      train_X[i] = pd.DataFrame({'Tweet': preprocessed_tweet_train[i]})
      dev_X[i] = pd.DataFrame({'Tweet': preprocessed_tweet_dev[i]})
      test_X[i] = pd.DataFrame({'Tweet': preprocessed_tweet_test[i]})

      train_y[i] = pd.DataFrame({'Score': train_target_array_first_col[i]})
      dev_y[i] = pd.DataFrame({'Score': dev_target_array_first_col[i]})

    return train_X, dev_X, test_X, train_y, dev_y

  def combine_dataFrame_Xy(self):
    """Return (train, dev) dicts of DataFrames with 'Tweet' and 'Score' columns."""
    bert_train_Xy = dict()
    bert_dev_Xy = dict()
    train_X, dev_X, test_X, train_y, dev_y = self.transfor_array_to_dataFrame()
    for i in range(len(train_X)):
      bert_train_Xy[i] = pd.concat([train_X[i],train_y[i]], axis=1)
      bert_dev_Xy[i] = pd.concat([dev_X[i], dev_y[i]], axis=1)
    return bert_train_Xy, bert_dev_Xy

In [0]:
# Clean the tweets of every split and build the per-dataset DataFrames
# ('Tweet' and 'Score' columns) for the train and dev sets.
process_data_obj = process_data(train_data_array, dev_data_array, test_data_array)
bert_train_Xy, bert_dev_Xy = process_data_obj.combine_dataFrame_Xy()

In [0]:
def transclass(dframe):
  """Binarize the 'Score' column in place.

  Scores greater than 1 become 1; every other score becomes 0.

  The comparison is vectorized, so it does not depend on the frame having
  the default 0..n-1 index (a row-by-row `.loc[i, ...]` loop raises KeyError
  on any other index) and it runs in a single pass.

  Args:
    dframe: DataFrame with a numeric 'Score' column.

  Returns:
    The same DataFrame object, with 'Score' replaced by 0/1 integers.
  """
  dframe['Score'] = (dframe['Score'] > 1).astype(int)
  return dframe

In [0]:
# Collapse the scores of dataset 0 to two classes (score > 1 -> 1, else 0)
# for both the training and the dev frame.
bert_train_Xy[0] = transclass(bert_train_Xy[0])
bert_dev_Xy[0] = transclass(bert_dev_Xy[0])

In [0]:
# Column names of the frames fed to BERT.
DATA_COLUMN = 'Tweet'
LABEL_COLUMN = 'Score'
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

# Warmup is a period of time where the learning rate
# is small and gradually increases--usually helps training.
WARMUP_PROPORTION = 0.1
# Model configs
SAVE_CHECKPOINTS_STEPS = 500
SAVE_SUMMARY_STEPS = 100

# label_list holds the candidate label sets, i.e. True, False or 0, 1 or 'dog', 'cat'.
# Index 0 is the binary set, index 1 the four-level set, index 2 the signed set.
# np.arange excludes its stop value, so the stop must be 4 for the signed set
# to be the symmetric range -3..3 (a stop of 3 silently drops the label 3).
label_list = [[0,1],[0,1,2,3],np.arange(-3,4)]

### Data processing

In [0]:
def input_examples(data, DATA_COLUMN, LABEL_COLUMN):
  """Convert every row of `data` into a BERT `InputExample`.

  Args:
    data: DataFrame holding the text and label columns.
    DATA_COLUMN: name of the column used as `text_a`.
    LABEL_COLUMN: name of the column used as `label`.

  Returns:
    The result of the row-wise `apply`: one InputExample per row.
  """
  def _row_to_example(row):
    # guid is a bookkeeping id that is unused here; there is no second text.
    return bert.run_classifier.InputExample(
        guid=None,
        text_a=row[DATA_COLUMN],
        text_b=None,
        label=row[LABEL_COLUMN])

  return data.apply(_row_to_example, axis=1)


In [0]:
def input_examples_result(bert_train_Xy, bert_dev_Xy, DATA_COLUMN, LABEL_COLUMN):
  """Build InputExamples for the train and dev frame of every dataset.

  Datasets are addressed by position 0..files_len-1 (module-level counter).

  Returns:
    (train_examples, dev_examples): dicts keyed by dataset position.
  """
  dataset_ids = range(files_len)
  train_InputExamples = {
      idx: input_examples(bert_train_Xy[idx], DATA_COLUMN, LABEL_COLUMN)
      for idx in dataset_ids
  }
  dev_InputExamples = {
      idx: input_examples(bert_dev_Xy[idx], DATA_COLUMN, LABEL_COLUMN)
      for idx in dataset_ids
  }
  return train_InputExamples, dev_InputExamples

In [0]:
def create_tokenizer_from_hub_module():
  """Build a FullTokenizer from the vocab file and casing flag of the Hub module."""
  # A throwaway graph keeps the lookup out of the default graph.
  lookup_graph = tf.Graph()
  with lookup_graph.as_default():
    module = hub.Module(BERT_MODEL_HUB)
    info = module(signature="tokenization_info", as_dict=True)
    fetches = [info["vocab_file"], info["do_lower_case"]]
    with tf.Session() as sess:
      vocab_file, do_lower_case = sess.run(fetches)

  return bert.tokenization.FullTokenizer(
      vocab_file=vocab_file,
      do_lower_case=do_lower_case)

In [0]:
# Convert the train/dev frames into BERT InputExamples and create the
# tokenizer that matches the Hub module named by BERT_MODEL_HUB.
train_InputExamples, dev_InputExamples = input_examples_result(bert_train_Xy, bert_dev_Xy, DATA_COLUMN, LABEL_COLUMN)
tokenizer = create_tokenizer_from_hub_module()

In [0]:
# Inspect one converted training example of dataset 0. The row index lives in
# a single variable so the printed labels always name the row that is shown
# (the labels used to say "Row 0" while row 2 was printed).
SAMPLE_ROW = 2
sample_example = train_InputExamples[0].iloc[SAMPLE_ROW]
print(f"Row {SAMPLE_ROW} - guid of training set : ", sample_example.guid)
print(f"\n__________\nRow {SAMPLE_ROW} - text_a of training set : ", sample_example.text_a)
print(f"\n__________\nRow {SAMPLE_ROW} - text_b of training set : ", sample_example.text_b)
print(f"\n__________\nRow {SAMPLE_ROW} - label of training set : ", sample_example.label)
print(tokenizer.tokenize(sample_example.text_a))

In [0]:
# Features of dataset 0, keyed by the maximum sequence length used to build them.
train_features = {}
dev_features = {}
MAX_SEQ_LENGTH_List = [64]  # other lengths that were tried: 8, 32

for seq_length in MAX_SEQ_LENGTH_List:
  train_features[seq_length] = bert.run_classifier.convert_examples_to_features(
      train_InputExamples[0], label_list[0], seq_length, tokenizer)
  dev_features[seq_length] = bert.run_classifier.convert_examples_to_features(
      dev_InputExamples[0], label_list[0], seq_length, tokenizer)

In [0]:
# Show the first training example next to the features built from it
# with a maximum sequence length of 64.
first_example = train_InputExamples[0].iloc[0]
first_feature = train_features[64][0]
separator = "-"*30

print("Sentence : ", first_example.text_a)
print(separator)
print("Tokens : ", tokenizer.tokenize(first_example.text_a))
print(separator)
print("Input IDs : ", first_feature.input_ids)
print(separator)
print("Input Masks : ", first_feature.input_mask)
print(separator)
print("Segment IDs : ", first_feature.segment_ids)

### Creating a multi-class classifier

In [0]:
# Adapted from run_classifier.py in https://github.com/google-research/bert

def create_model(is_predicting, input_ids, input_mask, segment_ids, labels,
                 num_labels):
  """Creates a classification model on top of the BERT Hub module.

  Args:
    is_predicting: True when only predictions are needed (no loss is built).
    input_ids, input_mask, segment_ids: BERT input tensors.
    labels: integer class ids; only used when `is_predicting` is False.
    num_labels: number of output classes.

  Returns:
    `(predicted_labels, log_probs)` when `is_predicting` is True, otherwise
    `(loss, predicted_labels, log_probs)`.
  """

  bert_module = hub.Module(
      BERT_MODEL_HUB,
      trainable=True)
  bert_inputs = dict(
      input_ids=input_ids,
      input_mask=input_mask,
      segment_ids=segment_ids)
  bert_outputs = bert_module(
      inputs=bert_inputs,
      signature="tokens",
      as_dict=True)

  # The module's pooled output feeds the classification layer.
  output_layer = bert_outputs["pooled_output"]

  hidden_size = output_layer.shape[-1].value

  # Classification layer trained on top of BERT for this task.
  output_weights = tf.get_variable(
      "output_weights", [num_labels, hidden_size],
      initializer=tf.truncated_normal_initializer(stddev=0.02))

  output_bias = tf.get_variable(
      "output_bias", [num_labels], initializer=tf.zeros_initializer())

  with tf.variable_scope("loss"):

    # Dropout helps prevent overfitting, but it must not randomize the
    # outputs at prediction time. This function cannot tell TRAIN from EVAL
    # (both arrive with is_predicting=False), so dropout stays on for both.
    if not is_predicting:
      output_layer = tf.nn.dropout(output_layer, keep_prob=0.9)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    log_probs = tf.nn.log_softmax(logits, axis=-1)

    # argmax over the class axis already yields one label per example; no
    # squeeze is applied because it would turn a batch of one into a scalar.
    predicted_labels = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
    # If we're predicting, we want predicted labels and the log-probabilities.
    if is_predicting:
      return (predicted_labels, log_probs)

    # Convert labels into one-hot encoding
    one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32)

    # If we're train/eval, compute the cross-entropy between predicted and
    # actual label and average it over the batch.
    per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1)
    loss = tf.reduce_mean(per_example_loss)
    return (loss, predicted_labels, log_probs)

In [0]:
def model_fn_builder(num_labels, learning_rate, num_train_steps,
                     num_warmup_steps):
  """Returns a `model_fn` closure for `tf.estimator.Estimator`.

  Args:
    num_labels: number of output classes.
    learning_rate: learning rate passed to the BERT optimizer.
    num_train_steps: total number of training steps.
    num_warmup_steps: number of learning-rate warmup steps.
  """
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """Builds the EstimatorSpec for TRAIN, EVAL or PREDICT."""

    input_ids = features["input_ids"]
    input_mask = features["input_mask"]
    segment_ids = features["segment_ids"]
    # Labels travel inside `features`; the `labels` argument is unused.
    label_ids = features["label_ids"]

    is_predicting = (mode == tf.estimator.ModeKeys.PREDICT)

    if is_predicting:
      (predicted_labels, log_probs) = create_model(
        is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

      # Note: the 'probabilities' entry carries log-probabilities.
      predictions = {
          'probabilities': log_probs,
          'labels': predicted_labels
      }
      return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    # TRAIN and EVAL
    (loss, predicted_labels, log_probs) = create_model(
      is_predicting, input_ids, input_mask, segment_ids, label_ids, num_labels)

    if mode == tf.estimator.ModeKeys.TRAIN:
      # The optimizer is only built when it is actually used.
      train_op = bert.optimization.create_optimizer(
          loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu=False)
      return tf.estimator.EstimatorSpec(mode=mode,
        loss=loss,
        train_op=train_op)

    # EVAL: accuracy is the only reported metric.
    eval_metrics = {
        "eval_accuracy": tf.metrics.accuracy(label_ids, predicted_labels)
    }
    return tf.estimator.EstimatorSpec(mode=mode,
      loss=loss,
      eval_metric_ops=eval_metrics)

  # Return the actual model function in the closure
  return model_fn

In [0]:
# Hyperparameter grids. The values are copied from this colab notebook
# (https://colab.sandbox.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb)
BATCH_SIZE_List = [8, 50, 100]
LEARNING_RATE_List = [1e-5]  # other candidates: 2e-5, 5e-5
NUM_TRAIN_EPOCHS_List = [5]  # other candidate: 10

# Number of training and warmup steps for every (batch size, epochs) pair,
# based on the feature count of the first sequence length.
num_train_steps = {}
num_warmup_steps = {}
train_example_count = len(train_features[MAX_SEQ_LENGTH_List[0]])
for BATCH_SIZE in BATCH_SIZE_List:
  for NUM_TRAIN_EPOCHS in NUM_TRAIN_EPOCHS_List:
    grid_key = (BATCH_SIZE, NUM_TRAIN_EPOCHS)
    total_steps = int(train_example_count / BATCH_SIZE * NUM_TRAIN_EPOCHS)
    num_train_steps[grid_key] = total_steps
    num_warmup_steps[grid_key] = int(total_steps * WARMUP_PROPORTION)

In [0]:
# Print the training and warmup step counts for every (batch size, epochs) pair.
for k, steps in num_train_steps.items():
  print(k, steps)
  print('warm up', num_warmup_steps[k])

In [0]:

# Run configuration shared by all Estimators. RunConfig() is called without
# arguments, so no output directory or checkpoint interval is set here and
# SAVE_CHECKPOINTS_STEPS / SAVE_SUMMARY_STEPS are not applied.
# NOTE(review): presumably TensorFlow's defaults are relied on on purpose — confirm.
run_config = tf.estimator.RunConfig()


In [0]:
# Registries for the runs: model functions, estimators, input functions
# and evaluation results, each starting out empty.
model_fn, estimator = {}, {}
train_input_fn, dev_input_fn = {}, {}
evaluate_result = {}


### Train and evaluate

In [0]:
for i in range(1):  # first learning rate only; range(len(LEARNING_RATE_List)) runs all
  # learning-rate candidates: 1e-5, 2e-5, 5e-5
  for k in num_train_steps:
    # k is a (batch size, epochs) pair: (8,5),(8,10),(50,5),(50,10),(100,5),(100,10)
    total_steps = num_train_steps[k]
    model_fn[i,k] = model_fn_builder(
        num_labels=len(label_list[0]),
        learning_rate=LEARNING_RATE_List[i],
        num_train_steps=total_steps,
        num_warmup_steps=num_warmup_steps[k])
    # Because of disk and RAM limits only a slice of the sequence lengths
    # is run per session (currently just the first one).
    for MAX_SEQ_LENGTH in MAX_SEQ_LENGTH_List[:1]:
      # sequence-length candidates: 64, 8, 32, 128
      run_key = (i, MAX_SEQ_LENGTH, k)
      estimator[run_key] = tf.estimator.Estimator(
          model_fn=model_fn[i,k],
          config=run_config,
          params={"batch_size": k[0]})
      train_input_fn[MAX_SEQ_LENGTH] = bert.run_classifier.input_fn_builder(
          features=train_features[MAX_SEQ_LENGTH],
          seq_length=MAX_SEQ_LENGTH,
          is_training=True,
          drop_remainder=False)
      print('Beginning Training!')
      started_at = datetime.now()
      estimator[run_key].train(
          input_fn=train_input_fn[MAX_SEQ_LENGTH],
          max_steps=total_steps)
      print("Training took time ", datetime.now() - started_at)


In [0]:
# Show which (learning-rate index, sequence length, (batch size, epochs)) runs exist.
print(estimator)

In [0]:
for i in range(1):  # first learning rate only; range(len(LEARNING_RATE_List)) runs all
  for k in num_train_steps:
    for MAX_SEQ_LENGTH in MAX_SEQ_LENGTH_List[:1]:
      run_key = (i, MAX_SEQ_LENGTH, k)
      # Evaluation input: dev features, no training-time behaviour.
      dev_input_fn[MAX_SEQ_LENGTH] = bert.run_classifier.input_fn_builder(
          features=dev_features[MAX_SEQ_LENGTH],
          seq_length=MAX_SEQ_LENGTH,
          is_training=False,
          drop_remainder=False)
      # steps=None evaluates until the input function is exhausted.
      evaluate_result[run_key] = estimator[run_key].evaluate(
          input_fn=dev_input_fn[MAX_SEQ_LENGTH],
          steps=None)

In [0]:
# Print the evaluation metrics of every run, keyed by
# (learning-rate index, sequence length, (batch size, epochs)).
for k,v in evaluate_result.items():
  print(k,v)   # The result of evaluate