# BERT GitHub Issue Classification

Source: https://www.analyticsvidhya.com/blog/2020/10/simple-text-multi-classification-task-using-keras-bert/

# Setup

In [38]:
!pip install sentencepiece
!pip install gradio
!pip install wandb



In [39]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import logging
logging.basicConfig(level=logging.INFO)

In [40]:
# Flexible integration for any Python script
import wandb
from wandb.keras import WandbCallback

# 1. Start a W&B run
wandb.init(project='github_bug_feature_question', entity='wasaequreshi')

# 2. Save model inputs and hyperparameters
# Record the learning rate the optimizer is actually compiled with in
# build_model (1e-5); the previously logged 0.01 did not match training.
config = wandb.config
config.learning_rate = 1e-5

In [4]:
# Download the BERT `tokenization.py` helper into the working directory so the
# later `import tokenization` resolves.
# NOTE(review): the URL tracks the `master` branch and is not pinned to a
# commit — confirm the file still exists at this path before re-running.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [5]:
# Mount Google Drive inside the Colab VM; the dataset files are copied from
# this mount point in a later cell.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


# BERT Layer

In [6]:
import tensorflow_hub as hub
import tokenization

# TF-Hub handle of the uncased BERT module used as the encoder.
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'

# trainable=True so the BERT weights are updated together with the
# classification head during model.fit.
bert_layer = hub.KerasLayer(
    module_url,
    trainable=True,
)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


# Load Datasets

In [7]:
# Copy the train/test JSON files from the mounted Drive folder into the
# working directory so the following cells can read them by bare filename.
!cp "/content/drive/My Drive/297 2021/hw_3/hw_3_bert_github_bug_feature_question/embold_train.json" .
!cp "/content/drive/My Drive/297 2021/hw_3/hw_3_bert_github_bug_feature_question/embold_test.json" .

In [8]:
# Load the training set and replace whatever index the JSON carried with a
# fresh positional one.
train = (
    pd.read_json("embold_train.json")
    .reset_index(drop=True)
)

INFO:numexpr.utils:NumExpr defaulting to 4 threads.


In [9]:
# Load the test set and replace whatever index the JSON carried with a
# fresh positional one.
test = (
    pd.read_json("embold_test.json")
    .reset_index(drop=True)
)

# Preprocessing

In [10]:
# Build the single text field fed to BERT from the issue title and body.
# Missing values become empty strings first: without this a missing body
# turns the concatenation into NaN and calling `.strip()` on it fails.
train['Review'] = (
    train['title'].fillna('').astype(str)
    + ' '
    + train['body'].fillna('').astype(str)
).str.strip()

In [11]:
# Build the single text field fed to BERT from the issue title and body.
# Missing values become empty strings first: without this a missing body
# turns the concatenation into NaN and calling `.strip()` on it fails.
test['Review'] = (
    test['title'].fillna('').astype(str)
    + ' '
    + test['body'].fillna('').astype(str)
).str.strip()

In [12]:
# Read the vocabulary path and the lower-casing flag from the loaded hub
# module and build the tokenizer from those two values.
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Encode strings into the three fixed-length integer inputs of the BERT layer.

    Each text is tokenized, truncated to ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` / ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.
            Must be at least 2.

    Returns:
        A tuple ``(token_ids, masks, segment_ids)`` of integer arrays, each of
        shape ``(number_of_texts, max_len)``. ``masks`` is 1 on real tokens
        and 0 on padding; ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """
    # Below 2 there is no room for [CLS] and [SEP]; the rows would come out
    # longer than max_len and disagree with segment_ids.
    if max_len < 2:
        raise ValueError(
            f"max_len must be at least 2 to fit [CLS] and [SEP], got {max_len}"
        )

    all_tokens = []
    all_masks = []
    all_segments = []

    for text in texts:
        # Reserve two positions for the special tokens.
        pieces = tokenizer.tokenize(text)[:max_len - 2]
        input_sequence = ["[CLS]"] + pieces + ["[SEP]"]
        pad_len = max_len - len(input_sequence)

        all_tokens.append(tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len)
        all_masks.append([1] * len(input_sequence) + [0] * pad_len)
        all_segments.append([0] * max_len)

    def _as_matrix(rows):
        # Explicit integer dtype and reshape keep the (n, max_len) shape even
        # when `texts` is empty.
        return np.array(rows, dtype=int).reshape(-1, max_len)

    return _as_matrix(all_tokens), _as_matrix(all_masks), _as_matrix(all_segments)

# Model

In [13]:
def build_model(bert_layer, max_len=512, learning_rate=1e-5, num_classes=3):
    """Build and compile a softmax classifier on top of a BERT Keras layer.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns
            ``(pooled_output, sequence_output)``.
        max_len: Sequence length of each of the three integer inputs.
        learning_rate: Learning rate passed to the Adam optimizer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``tf.keras`` model using categorical cross-entropy loss
        and reporting accuracy.
    """
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    # Only the per-token sequence output is used; the pooled output is discarded.
    _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    # Take the vector at position 0 of every sequence as the classifier input.
    clf_output = sequence_output[:, 0, :]

    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(num_classes, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    # `learning_rate` replaces the deprecated `lr` keyword.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model


In [14]:
# Sequence length fed to BERT; bert_encode truncates longer texts and pads
# shorter ones to this length.
max_len = 150

train_input = bert_encode(train["Review"].values, tokenizer, max_len=max_len)
test_input = bert_encode(test["Review"].values, tokenizer, max_len=max_len)
# The labels are one-hot encoded in a separate cell below.

In [15]:
# Quick check of the container type holding the raw labels.
type(train["label"].values)

numpy.ndarray

In [16]:
# One-hot encode the labels into 3 columns to match the model's 3-unit
# softmax output.
train_labels = tf.keras.utils.to_categorical(
    train["label"].values,
    num_classes=3,
)

In [17]:
# Build the classifier around the shared BERT layer, then print its layers.
model = build_model(bert_layer=bert_layer, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 150)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 150)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 150)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


# Run model

In [None]:
# Keep only the weights with the best validation accuracy in model.h5.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    'model.h5',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Stop when validation accuracy has not improved for 5 epochs.
earlystopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    verbose=1,
)

# Stream training metrics to the active W&B run.
wandb_callback = WandbCallback(
    monitor="val_loss",
    verbose=0,
    mode="auto",
    save_weights_only=False,
    log_weights=False,
    log_gradients=False,
    save_model=True,
    training_data=None,
    validation_data=None,
    labels=[],
    data_type=None,
    predictions=36,
    generator=None,
    input_type=None,
    output_type=None,
    log_evaluation=False,
    validation_steps=None,
    class_colors=None,
    log_batch_frequency=None,
    log_best_prefix="best_",
    save_graph=True,
    validation_indexes=None,
    validation_row_processor=None,
    prediction_row_processor=None,
    infer_missing_processors=True,
    log_evaluation_frequency=0,
)

# Hold out the last 20% of the training data for validation.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=1,
    batch_size=32,
    verbose=1,
    callbacks=[checkpoint, earlystopping, wandb_callback],
)



# Interactive

In [48]:
import gradio as gr

def greet(input_sentence):
    """Classify a single text as 'bug', 'feature' or 'question'.

    Args:
        input_sentence: The raw text to classify.

    Returns:
        The name of the class with the highest predicted score.
    """
    new_tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

    # Encode with the same max_len the model was built with.
    sentence_input = bert_encode([input_sentence], new_tokenizer, max_len=max_len)

    test_pred = model.predict(sentence_input)
    # NOTE(review): assumes class ids 0/1/2 mean bug/feature/question —
    # confirm against the dataset's label definition.
    result = ['bug', 'feature', 'question']
    # numpy is imported as `np` in this notebook; the bare name `numpy` is
    # undefined and raised NameError here.
    return result[int(np.argmax(test_pred, axis=-1)[0])]

# Serve `greet` through a text-in / text-out Gradio interface.
iface = gr.Interface(
    fn=greet,
    inputs="text",
    outputs="text",
)
iface.launch()

Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
This share link will expire in 72 hours. If you need a permanent link, visit: https://gradio.app/introducing-hosted


INFO:paramiko.transport:Connected (version 2.0, client OpenSSH_7.6p1)
INFO:paramiko.transport:Authentication (publickey) successful!


Running on External URL: https://56622.gradio.app


(<Flask 'gradio.networking'>,
 'http://127.0.0.1:7862/',
 'https://56622.gradio.app')