# BERT Chunks Classification pipeline
**X** – code chunks (the `code` column, used as the model input)

**y** – chunk labels (tag columns generated by regex)

## Modules

In [1]:
# Install the libraries this notebook needs into the runtime.
# tensorflow_hub, tensorflow and tensorflow-gpu are pinned to exact versions.
# NOTE(review): `%pip install` targets the kernel's own environment and is
# generally preferred over `!pip` — confirm the runtime supports it before switching.
!pip install tensorflow_hub==0.8
!pip install tensorflow==2.1.0
!pip install tensorflow-gpu==2.1.0
# NOTE(review): the two packages below are unpinned, so a fresh run may pull
# newer releases than the ones this notebook was written against.
!pip install bert-for-tf2
!pip install sentencepiece

Collecting tensorflow==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/85/d4/c0cd1057b331bc38b65478302114194bd8e1b9c2bbc06e300935c0e93d90/tensorflow-2.1.0-cp36-cp36m-manylinux2010_x86_64.whl (421.8MB)
[K     |████████████████████████████████| 421.8MB 33kB/s 
Collecting gast==0.2.2
  Downloading https://files.pythonhosted.org/packages/4e/35/11749bf99b2d4e3cceb4d55ca22590b0d7c2c62b9de38ac4a4a7f4687421/gast-0.2.2.tar.gz
Collecting tensorflow-estimator<2.2.0,>=2.1.0rc0
[?25l  Downloading https://files.pythonhosted.org/packages/18/90/b77c328a1304437ab1310b463e533fa7689f4bfc41549593056d812fab8e/tensorflow_estimator-2.1.0-py2.py3-none-any.whl (448kB)
[K     |████████████████████████████████| 450kB 46.8MB/s 
[?25hCollecting tensorboard<2.2.0,>=2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/d9/41/bbf49b61370e4f4d245d4c6051dfb6db80cec672605c91b1652ac8cc3d38/tensorboard-2.1.1-py3-none-any.whl (3.8MB)
[K     |████████████████████████████████| 3.9MB 40.4MB/

In [2]:
import tensorflow as tf
import tensorflow_hub as hub

# Print the installed versions so the run can be matched to the pins above.
for label, library in (("TF version: ", tf), ("Hub version: ", hub)):
    print(label, library.__version__)

TF version:  2.1.0
Hub version:  0.8.0


In [0]:
import tensorflow_hub as hub
import tensorflow as tf
import bert
from tensorflow.keras.models import Model       # Keras is the new high level API for TensorFlow
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import wasserstein_distance

## Functions

In [0]:
def get_masks(tokens, max_seq_length):
    """Build the padding mask for one token sequence.

    Args:
        tokens: sequence of tokens (only its length is used).
        max_seq_length: total length of the returned mask.

    Returns:
        A list of ``max_seq_length`` ints: 1 for each token position,
        followed by 0 for each padding position.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    n_tokens = len(tokens)
    n_padding = max_seq_length - n_tokens
    if n_padding < 0:
        raise IndexError("Token length more than max seq length!")
    mask = [1] * n_tokens
    mask.extend([0] * n_padding)
    return mask


def get_segments(tokens, max_seq_length):
    """Build the segment-id list for one token sequence.

    Every token up to and including the first ``"[SEP]"`` gets id 0; every
    token after it gets id 1. The result is right-padded with 0 up to
    ``max_seq_length``.

    Raises:
        IndexError: if ``tokens`` is longer than ``max_seq_length``.
    """
    if max_seq_length < len(tokens):
        raise IndexError("Token length more than max seq length!")

    segment_ids = []
    seen_separator = False
    for tok in tokens:
        # The separator itself still belongs to the segment it closes.
        segment_ids.append(1 if seen_separator else 0)
        if tok == "[SEP]":
            seen_separator = True

    padding = [0] * (max_seq_length - len(tokens))
    return segment_ids + padding


def get_ids(tokens, tokenizer, max_seq_length):
    """Convert tokens to vocabulary ids, right-padded with 0.

    Args:
        tokens: list of token strings.
        tokenizer: object exposing ``convert_tokens_to_ids(tokens)``.
        max_seq_length: total length of the returned id list.

    Returns:
        A list of ``max_seq_length`` ids: the token ids followed by 0 padding.

    Raises:
        IndexError: if there are more token ids than ``max_seq_length``.
    """
    token_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Same guard as get_masks/get_segments: without it a negative padding
    # count silently yields an id list longer than max_seq_length.
    if len(token_ids) > max_seq_length:
        raise IndexError("Token length more than max seq length!")
    input_ids = token_ids + [0] * (max_seq_length - len(token_ids))
    return input_ids
def tokenize_sentence(sentence):
    """Turn one sentence into the three padded BERT input lists.

    Uses the notebook-level ``tokenizer`` and ``max_seq_length``. The
    sentence is tokenized, wrapped in ``[CLS]`` ... ``[SEP]`` and then
    converted; nothing is truncated here, so a sentence with too many
    tokens raises ``IndexError`` from the padding helpers.

    Returns:
        Tuple ``(input_ids, input_masks, input_segments)``.
    """
    wordpieces = ["[CLS]"] + tokenizer.tokenize(sentence) + ["[SEP]"]
    return (
        get_ids(wordpieces, tokenizer, max_seq_length),
        get_masks(wordpieces, max_seq_length),
        get_segments(wordpieces, max_seq_length),
    )

def compare_sentences(sentence_1, sentence_2, distance_metric):
    """Distance between the pooled embeddings of two sentences.

    Both sentences are tokenized, run through the notebook-level ``model``
    one at a time, and the first (only) row of each pooled output is passed
    to ``distance_metric``.

    NOTE(review): ``model`` must return a ``(pooled, sequence)`` pair here;
    the single-output classifier bound to ``model`` in the
    "Compiling BERT" cell does not.

    Returns:
        Whatever ``distance_metric(pooled_1, pooled_2)`` returns.
    """
    encoded_pair = [tokenize_sentence(text) for text in (sentence_1, sentence_2)]

    pooled_vectors = []
    for ids, mask, segments in encoded_pair:
        pooled_batch, _sequence_batch = model.predict([[ids], [mask], [segments]])
        pooled_vectors.append(pooled_batch[0])

    return distance_metric(pooled_vectors[0], pooled_vectors[1])

def distance_between_sentences(sentence_1, sentence_2, distance_metric):
    """Mean per-position distance between two sentences' token embeddings.

    Each sentence is tokenized and run through the notebook-level ``model``;
    ``distance_metric`` is applied to the two sequence outputs position by
    position and the results are averaged.

    Note: all ``max_seq_length`` positions are compared, so padding
    positions contribute to the mean as well.
    """
    encoded_1 = tokenize_sentence(sentence_1)
    encoded_2 = tokenize_sentence(sentence_2)

    # Wrap each of ids / masks / segments in a list to form a batch of one.
    _pooled_1, token_embs_1 = model.predict([[part] for part in encoded_1])
    _pooled_2, token_embs_2 = model.predict([[part] for part in encoded_2])

    per_position = [
        distance_metric(token_embs_1[0][pos], token_embs_2[0][pos])
        for pos in range(max_seq_length)
    ]
    return np.mean(per_position)

def get_embs(sentence):
    """Run one sentence through the notebook-level ``model``.

    Returns:
        Tuple ``(pool_embs, all_embs)``: the two outputs of
        ``model.predict`` for a batch containing only this sentence.
    """
    ids, mask, segments = tokenize_sentence(sentence)
    outputs = model.predict([[ids], [mask], [segments]])
    pooled, per_token = outputs
    return pooled, per_token

def get_all_embs(sentence):
    """Return the pooled output of ``model`` for one sentence.

    Despite the name, only the first (pooled) output of ``model.predict`` is
    returned; the second output is discarded.
    """
    encoded = tokenize_sentence(sentence)
    # Wrap each of ids / masks / segments in a list to form a batch of one.
    pooled, _discarded = model.predict([[field] for field in encoded])
    return pooled

def square_rooted(x):
    """Return the Euclidean (L2) norm of the iterable ``x``."""
    squares_total = sum(component * component for component in x)
    return math.sqrt(squares_total)

def cosine_similarity(x,y):
    """Return ``1 - cos(x, y)`` for two equal-length vectors.

    Despite the name this is a distance: 0 for vectors pointing the same
    way, larger as they diverge. Norms come from ``square_rooted``.
    """
    dot_product = 0
    for left, right in zip(x, y):
        dot_product = dot_product + left * right
    norm_product = square_rooted(x) * square_rooted(y)
    return 1 - dot_product / float(norm_product)

def dummy_metric(x,y):
    """Placeholder metric: ignores both arguments and always returns 42."""
    constant_distance = 42
    return constant_distance

def create_single_input(sentence, MAX_LEN):
    """Encode one sentence as padded BERT inputs, truncating long ones.

    The sentence is tokenized with the notebook-level ``tokenizer``, cut to
    at most ``MAX_LEN`` tokens and wrapped in ``[CLS]`` ... ``[SEP]``.
    Padding is done to the notebook-level ``max_seq_length`` (not to
    ``MAX_LEN``), so callers should pass ``MAX_LEN <= max_seq_length - 2``.

    Returns:
        Tuple ``(ids, masks, segments)``.
    """
    truncated = tokenizer.tokenize(sentence)[:MAX_LEN]
    framed = ["[CLS]"] + truncated + ["[SEP]"]

    return (
        get_ids(framed, tokenizer, max_seq_length),
        get_masks(framed, max_seq_length),
        get_segments(framed, max_seq_length),
    )

def create_input_array(sentences):
    """Encode many sentences into the three int32 arrays the model takes.

    Each sentence goes through ``create_single_input`` with room left for
    the ``[CLS]`` and ``[SEP]`` tokens; progress is shown with ``tqdm``.

    Returns:
        List ``[input_ids, input_masks, input_segments]`` of ``np.int32``
        arrays with one row per sentence.
    """
    encoded_rows = [
        create_single_input(sentence, max_seq_length-2)
        for sentence in tqdm(sentences, position=0, leave=True)
    ]

    # Regroup the per-sentence (ids, masks, segments) triples column-wise.
    return [
        np.asarray([row[field] for row in encoded_rows], dtype=np.int32)
        for field in range(3)
    ]

## Data Preprocessing

In [5]:
from google.colab import drive

# Mount Google Drive so the dataset under "My Drive" can be read from disk.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
# Load the tagged code-chunk table from the mounted Drive.
CHUNKS_CSV_PATH = '/content/drive/My Drive/NL2ML/chunks_30_tags.csv'
df = pd.read_csv(CHUNKS_CSV_PATH)

In [0]:
# The two tag columns are the targets; each column is one output class.
tag_frame = df[['tag_import_output', 'tag_visualization']]

X = df['code'].values                      # raw code chunks
class_names = tag_frame.columns.values     # array of the tag column names
class_ids = list(range(len(class_names)))  # 0..n_classes-1, aligned with class_names
y = tag_frame.values                       # one row per chunk, one column per tag

In [8]:
# Imported from tf.keras (not standalone keras) so only one Keras
# implementation is loaded; the name is kept available for other cells.
from tensorflow.keras.utils import to_categorical

# `y` has one column per tag, so it is already the (n_samples, n_tags)
# target matrix that lines up with a `len(class_names)`-unit output layer.
# Passing it through `to_categorical` would one-hot every entry and append
# an extra axis, giving (n_samples, n_tags, n_values) targets instead.
# assumes the tag columns hold 0/1 flags — TODO confirm against the CSV
labels = np.asarray(y, dtype=np.float32)

Using TensorFlow backend.


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the chunks for testing; the fixed seed keeps the split reproducible.
TEST_FRACTION = 0.1
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    X,
    labels,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

## Compiling BERT


In [10]:
%time
max_seq_length = 128

input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="segment_ids")

BERT_HUB_URL = 'https://tfhub.dev/tensorflow/bert_en_cased_L-24_H-1024_A-16/2'
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=True)
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
model = Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=[pooled_output, sequence_output])

x = tf.keras.layers.GlobalAveragePooling1D()(sequence_output)
x = tf.keras.layers.Dropout(0.2)(x)
out = tf.keras.layers.Dense(len(class_names), activation="sigmoid", name="dense_output")(x)
pred = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.models.Model(
      inputs=[input_word_ids, input_mask, segment_ids],
      outputs=out)

model.compile(loss='sparse_categorical_crossentropy', #'binary_crossentropy'
                  optimizer='adam',
                  metrics=['accuracy'])

model.summary()

vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
FullTokenizer = bert.bert_tokenization.FullTokenizer
tokenizer = FullTokenizer(vocab_file, do_lower_case)

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.01 µs
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 128)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 333579265   input_word_ids[0][0]             
                        

In [0]:
# Tokenize and pad the training chunks into [ids, masks, segments] arrays.
train_inputs = create_input_array(sentences=X_train)

 59%|█████▉    | 21973/36949 [01:58<01:10, 212.62it/s]

## BERT Training (fine-tuning)

In [0]:
# Training settings: 10 epochs, batches of 50, 10% of the training data held
# out for validation.
fit_options = dict(
    epochs=10,
    batch_size=50,
    validation_split=0.1,
    verbose=1,
    shuffle=True,
)
model.fit(train_inputs, y_train, **fit_options)

## BERT Predicting

In [0]:
# Tokenize and pad the held-out chunks the same way as the training set.
test_inputs = create_input_array(sentences=X_test)

In [0]:
# Run the model on the encoded test chunks and display the raw outputs.
predictions = model.predict(x=test_inputs)
predictions

## Layout: sample predictions

In [0]:
print(predictions.shape)
# Print the highest-scoring class name for the first 20 predictions.
# Slicing (rather than indexing 0..19) avoids an IndexError when there are
# fewer than 20 rows.
for scores in predictions[:20]:
    print(class_names[np.argmax(scores)])

In [0]:
from sklearn import metrics

# f1_score compares discrete labels, so the continuous sigmoid scores are
# thresholded at 0.5 into 0/1 decisions first.
# assumes y_test is a 0/1 indicator matrix with the same shape as
# `predictions` — TODO confirm
predicted_labels = (np.asarray(predictions) >= 0.5).astype(int)
metrics.f1_score(y_test, predicted_labels, average='macro')