# Data Setup

In [None]:
#pip install protobuf==3.20.*

In [1]:
import sys
sys.path.append('helpermodule')

from helpermodule import data
from data import get_chapter_data, get_excerpt_data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [2]:
# List the public functions exposed by the local helper module before using them.
help(data)

Help on module helpermodule.data in helpermodule:

NAME
    helpermodule.data

FUNCTIONS
    get_chapter_data()
    
    get_excerpt_data(n_words: int = 100)
        #n_words: # words in an excerpt
    
    import_data()

FILE
    /home/vmgu/DATASCI266-Final-Project-Gu/helpermodule/data.py




In [3]:
# Load the chapter-level corpus, then peek at the first label and the
# opening 199 characters of the matching chapter text.
chapter_labels, chapter_examples = get_chapter_data()
print(chapter_labels[0], chapter_examples[0][:199], sep='\n')

Virginia
“Citizens of the Solar Republic, this is your Sovereign.” I stare half blind into a ﬁring squad of ﬂy-eyed cameras. Out the viewport behind my stage, battle stations and ships of war ﬂoat beyond the 


In [4]:
# Load the corpus as 100-word excerpts and show one (label, text) pair
# as a sanity check.
excerpt_labels, excerpt_examples = get_excerpt_data(n_words=100)
print(excerpt_labels[100], excerpt_examples[100], sep='\n')

Darrow
mottled with the resFlesh that has replaced the chunks Atlas took out. New metal ﬁngers extend from her knuckles. “Trouble?” she asks. “Pushy relations.” Without a smile, she turns back to watch the polar sky. Beyond the atmosphere of the planet, Atalantia’s warships rove, waiting for us to just nip our heads outside the great shield chains so they can drop mass drivers down and make craters of us. “Cold back here,” I say over the whistling wind. Our ship passes over the edge of an ice shelf. “Why don’t you head to mess? Colloway says it’s bad to sync


In [5]:
# Hold out 20% of the excerpts for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    excerpt_examples,
    excerpt_labels,
    test_size=0.2,
    random_state=2457,
)
# One size per line, in the order: x_train, x_test, y_train, y_test.
print(*map(len, (x_train, x_test, y_train, y_test)), sep='\n')

2257
565
2257
565


# BERT Setup

In [12]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from keras.utils import to_categorical


import sklearn as sk
import os
import nltk
from nltk.data import find

import matplotlib.pyplot as plt

import re

In [7]:
from transformers import BertTokenizer, TFBertModel

# NOTE: this `logging` is transformers' own logging utility, not the stdlib module.
from transformers import logging
# Raise the transformers log threshold to ERROR so lower-severity messages are hidden.
logging.set_verbosity_error()

In [8]:
# Tokenizer and encoder must come from the same checkpoint, so name it once.
BERT_CHECKPOINT = 'bert-base-cased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

2023-06-04 09:35:21.514890: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:966] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-06-04 09:35:21.549966: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2023-06-04 09:35:21.549982: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1934] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2023-06-04 09:35:21.550721: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (o

## Basic BERT Text Classification

In [14]:
print(y_train[:5])
# Map the string labels to integer ids, then expand the ids to one-hot rows.
# `label` is kept so the same mapping can be reused or inverted later.
label = preprocessing.LabelEncoder()
y = to_categorical(label.fit_transform(y_train))
y[:5]

['Lyria', 'Darrow', 'Ephraim', 'Ephraim', 'Lysander']


array([[0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0.]], dtype=float32)

In [None]:
# Raw excerpt strings as string tensors.
x_train_tf = tf.convert_to_tensor(x_train)
x_test_tf = tf.convert_to_tensor(x_test)

# Keras losses need numeric targets: the original code converted the raw string
# labels, which cannot be used as `y` in fit(). Encode both splits with the
# LabelEncoder fitted on y_train so train and test share one class-id mapping,
# and pin the one-hot width so the test matrix has the same number of columns
# even if some class is missing from the held-out split.
num_label_classes = len(label.classes_)
y_train_tf = tf.convert_to_tensor(
    to_categorical(label.transform(y_train), num_classes=num_label_classes)
)
y_test_tf = tf.convert_to_tensor(
    to_categorical(label.transform(y_test), num_classes=num_label_classes)
)

In [None]:
num_train_examples = 2000      # set number of train examples
num_test_examples = 500        # set number of test examples

MAX_SEQUENCE_LENGTH = 128                 # set max_length of the input sequence


def tokenize_excerpts(texts):
    """Tokenize a list of strings into fixed-length TensorFlow tensors.

    Every sequence is truncated or padded to MAX_SEQUENCE_LENGTH tokens so
    the result can be fed to Keras inputs of that static length.
    """
    return bert_tokenizer(texts,
                          max_length=MAX_SEQUENCE_LENGTH,
                          truncation=True,
                          padding='max_length',
                          return_tensors='tf')


# Read the texts from the original split (x_train / x_test) instead of from
# x_train_tf: this cell rebinds x_train_tf to the tokenizer output, so reading
# `x_train_tf.numpy()` here made the cell fail when it was run a second time.
all_train_examples = [str(text) for text in x_train]
all_test_examples = [str(text) for text in x_test]

# NOTE(review): excerpts are built with n_words=100, so some may exceed 128
# word pieces and be truncated -- confirm that is acceptable.
x_train_tf = tokenize_excerpts(all_train_examples[:num_train_examples])
y_train_tf = y_train_tf[:num_train_examples]

x_test_tf = tokenize_excerpts(all_test_examples[:num_test_examples])
y_test_tf = y_test_tf[:num_test_examples]

In [None]:
# Display the training label tensor after it was cut to num_train_examples in the previous cell.
y_train_tf

In [None]:
def create_bert_classification_model(bert_model,
                                     num_train_layers=0,
                                     hidden_size = 200, 
                                     dropout=0.3,
                                     learning_rate=0.00005,
                                     num_classes=5):
    """
    Build and compile a simple classification model on top of BERT, using the
    pooler output (index 1 of the BERT outputs) as the sequence representation.

    Args:
        bert_model: BERT encoder that is called with a dict holding
            'input_ids', 'token_type_ids' and 'attention_mask'.
            NOTE: its trainable flags are modified in place.
        num_train_layers: 0 freezes the whole encoder, 12 trains the whole
            encoder, any other value keeps only weights whose name contains
            '_11', '_10', ... (counting down from 11) trainable.
        hidden_size: units in the ReLU hidden layer after the pooler output.
        dropout: dropout rate applied after the hidden layer.
        learning_rate: learning rate for the Adam optimizer.
        num_classes: number of output classes. The default of 5 matches the
            five label classes one-hot encoded in this notebook. With
            num_classes > 1 the head is softmax + categorical cross-entropy
            and expects one-hot targets of shape (batch, num_classes). With
            num_classes == 1 the head is the single sigmoid unit with binary
            cross-entropy.

    Returns:
        A compiled tf.keras.Model taking [input_ids, token_type_ids,
        attention_mask], each of length MAX_SEQUENCE_LENGTH.

    Raises:
        ValueError: if num_classes is smaller than 1.
    """
    if num_classes < 1:
        raise ValueError('num_classes must be >= 1, got {}'.format(num_classes))

    if num_train_layers == 0:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False

    elif num_train_layers == 12: 
        # Train all layers of the BERT model
        bert_model.trainable = True

    else:
        # Restrict training to the num_train_layers outer transformer layers.
        # The encoder object is shared between calls, so undo a possible
        # earlier freeze-all (num_train_layers=0) before freezing selectively.
        bert_model.trainable = True

        retrain_layers = ['_' + str(11 - layer_offset)
                          for layer_offset in range(num_train_layers)]

        print('retrain layers: ', retrain_layers)

        # NOTE(review): this is a substring match on weight names and relies on
        # the private `_trainable` attribute; weights outside the listed layers
        # (presumably including embeddings and pooler) end up frozen -- confirm
        # against the installed transformers/TensorFlow versions.
        for w in bert_model.weights:
            if not any(layer_code in w.name for layer_code in retrain_layers):
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Index 1 is the pooler output; bert_out[0][:, 0, :] would be the raw [CLS] vector.
    pooler_token = bert_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooler_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # A single sigmoid unit can only separate two classes; for a multi-class
    # task the output needs one unit per class and a matching loss.
    if num_classes == 1:
        output_activation = 'sigmoid'
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=False)
    else:
        output_activation = 'softmax'
        loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)

    classification = tf.keras.layers.Dense(num_classes,
                                           activation=output_activation,
                                           name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=loss,
                                 metrics=['accuracy'])

    return classification_model

In [None]:
# num_train_layers=12 takes the branch that sets bert_model.trainable = True,
# so the whole BERT encoder is fine-tuned (nothing is frozen).
bert_classification_model = create_bert_classification_model(bert_model, num_train_layers=12)
# Check the trainable / non-trainable parameter counts in the summary against that setting.
bert_classification_model.summary()

In [None]:
# Order must match the model's inputs: input_ids, token_type_ids, attention_mask.
train_inputs = [
    x_train_tf.input_ids,
    x_train_tf.token_type_ids,
    x_train_tf.attention_mask,
]
validation_inputs = [
    x_test_tf.input_ids,
    x_test_tf.token_type_ids,
    x_test_tf.attention_mask,
]

# Train for two epochs and evaluate on the held-out split after each epoch.
bert_classification_model_history = bert_classification_model.fit(
    train_inputs,
    y_train_tf,
    validation_data=(validation_inputs, y_test_tf),
    batch_size=32,
    epochs=2,
)

In [None]:
# Display the label tensor that was passed to fit() as the training targets.
y_train_tf