In [1]:
from zipfile import ZipFile 
import os.path
from os import path
import pandas as pd
import numpy as np
import base64
import re
from itertools import groupby
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
import tensorflow as tf

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
file_name = "poems_with_new_tags.zip"
poem_df = pd.read_csv('poems_with_new_tags.zip', compression='zip', header=0, quotechar='"')


In [115]:
poems_info = poem_df[['content','dynasty','author','title']]
labels = poem_df.drop(['content','dynasty','author','title','star',
                       'author_stars','tags','tags_list','new_tags','new_first_tag'], axis=1)

In [116]:
poems_info.head()

Unnamed: 0,content,dynasty,author,title
0,《吴都赋》云：“户藏烟浦，家具画船。”唯吴兴为然。春游之盛，西湖未能过也。己酉岁，予与萧时父...,宋代,姜夔,琵琶仙·《吴都赋》云：「户藏烟浦
1,《廿一史弹词》第三段说秦汉开场词滚滚长江东逝水，浪花淘尽英雄。是非成败转头空。青山依旧在，几...,明代,杨慎,临江仙·滚滚长江东逝水
2,《水经》云：“彭蠡之口有石钟山焉。”郦元以为下临深潭，微风鼓浪，水石相搏，声如洪钟。是说也，...,宋代,苏轼,石钟山记
3,【序】辛亥之冬，予载雪诣石湖。止既月，授简索句，且征新声，作此两曲。石湖把玩不已，使工妓隶习...,宋代,姜夔,暗香疏影
4,〔一枝花〕 攀出墙朵朵花，折临路枝枝柳。花攀红蕊嫩，柳折翠条柔，浪子风流。凭着我折柳攀花手...,元代,关汉卿,【南吕】一枝花不伏老


In [117]:
labels.head()

Unnamed: 0,写物,劝勉,家庭,快乐,悲苦,政治,朋友,游玩
0,0,0,0,0,0,1,0,1
1,1,1,0,0,1,1,0,0
2,1,0,0,0,0,1,0,1
3,0,1,0,0,0,0,0,0
4,1,0,0,1,1,0,0,0


In [118]:
labels_list = labels.to_numpy()
print(labels_list.shape)

(6107, 8)


In [13]:
from transformers import BertTokenizer, RobertaTokenizer, DistilBertTokenizer,TFBertModel


In [17]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
MAX_TEXT_LENGTH=100

In [18]:
# initialize numpy arrays for Token-Ids and Attention Masks
Xids = np.zeros((len(poems_info), MAX_TEXT_LENGTH), dtype=int)
Xmask = np.zeros((len(poems_info), MAX_TEXT_LENGTH), dtype=int)

In [19]:
Xids.shape


(6107, 100)

In [22]:
# Use a simple for loop to encode all data points
for i, sequence in enumerate(poems_info['content']):
  tokens = bert_tokenizer.encode_plus(sequence
                               ,max_length = MAX_TEXT_LENGTH          # Using text with this max length
                               ,truncation=True               # truncate any text longer than max_length
                               ,padding='max_length'          # padd text that is smaller than max_length
                               ,add_special_tokens=True       # add special tokens for start, end of sentence, unknown, and mask tokens
                               ,return_token_type_ids = False # do not return ids for types of tokens
                               ,return_attention_mask = True
                               ,return_tensors='tf')
  
  Xids[i, :], Xmask[i, :] = tokens['input_ids'], tokens['attention_mask']

In [23]:

# Array of tokenized Ids
Xids


array([[ 101,  517, 1426, ...,  749, 1126,  102],
       [ 101,  517, 2457, ...,    0,    0,    0],
       [ 101,  517, 3717, ..., 1898, 1141,  102],
       ...,
       [ 101, 7987, 7716, ..., 2495, 3341,  102],
       [ 101, 8020, 1923, ..., 3344, 6983,  102],
       [ 101, 8020, 3448, ..., 3308, 1911,  102]])

In [24]:
# Array of attaention masks
Xmask

array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 1, 1, 1],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 1, 1, 1]])

In [25]:
with open ('xids.npy', 'wb') as f:
  np.save(f, Xids)
with open ('xmask.npy', 'wb') as f:
  np.save(f, Xmask)
with open ('labels.npy', 'wb') as f:
  np.save(f, labels_list)

# del df, Xids, Xmask, labels
# df = None

## Ininitializing Hugging Face Tokenizer and Model


In [122]:
with open ('xids.npy', 'rb') as f:
  Xids = np.load(f)
with open ('xmask.npy', 'rb') as f:
  Xmask = np.load(f)
with open ('labels.npy', 'rb') as f:
  labels = np.load(f)

In [123]:
Xids


array([[ 101,  517, 1426, ...,  749, 1126,  102],
       [ 101,  517, 2457, ...,    0,    0,    0],
       [ 101,  517, 3717, ..., 1898, 1141,  102],
       ...,
       [ 101, 7987, 7716, ..., 2495, 3341,  102],
       [ 101, 8020, 1923, ..., 3344, 6983,  102],
       [ 101, 8020, 3448, ..., 3308, 1911,  102]])

In [124]:
for i in range(0, 20):
  print(labels[i])

[0 0 0 0 0 1 0 1]
[1 1 0 0 1 1 0 0]
[1 0 0 0 0 1 0 1]
[0 1 0 0 0 0 0 0]
[1 0 0 1 1 0 0 0]
[0 0 0 0 1 1 0 0]
[0 0 0 0 0 1 0 0]
[1 0 0 1 0 1 0 0]
[0 0 0 0 1 1 0 0]
[0 1 0 0 0 1 0 0]
[0 0 0 0 1 1 0 0]
[1 0 0 0 0 1 0 0]
[1 0 1 0 0 1 0 0]
[1 0 0 0 0 0 0 0]
[1 0 0 0 1 0 1 0]
[1 1 0 0 0 1 1 0]
[1 0 0 0 0 1 0 0]
[1 0 0 0 0 1 0 0]
[1 1 0 0 0 0 0 0]
[0 0 0 0 0 1 0 1]


In [29]:
import tensorflow as tf


In [30]:
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))


In [31]:
print (dataset.take(1))
print ()
for i in dataset.take(1):
  print (i)

<TakeDataset shapes: ((100,), (100,), (8,)), types: (tf.int64, tf.int64, tf.int64)>

(<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 101,  517, 1426, 6963, 6602,  518,  756, 8038,  100, 2787, 5966,
       4170, 3855, 8024, 2157, 1072, 4514, 5670,  511,  100, 1546, 1426,
       1069,  711, 4197,  511, 3217, 3952,  722, 4670, 8024, 6205, 3959,
       3313, 5543, 6814,  738,  511, 2346, 6977, 2259, 8024,  750,  680,
       5854, 3198, 4266, 6770, 6983, 1298, 6958, 8024, 2697, 6878, 2768,
       3625,  511, 1352, 3444, 3341, 3198, 8024, 3300,  782,  849,  510,
       3191, 3289, 3425, 3418, 3425, 1383,  511, 3625, 2794, 6768, 5276,
       7607, 5709, 8024, 6042, 4691, 3633, 1936, 5318,  511, 3217, 3933,
       6823,  510, 3722, 3828, 5632, 5344, 8024, 3291, 3924,  749, 1126,
        102])>, <tf.Tensor: shape=(100,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
   

In [32]:
def map_func(input_ids, masks, labels):
  return {'input_ids': input_ids, 'attention_mask': masks}, labels

In [33]:

# we can use the dataset map function to apply this format
dataset = dataset.map(map_func)

In [34]:
print (dataset.take(1))
print ()
for i in dataset.take(1):
  print (i)

<TakeDataset shapes: ({input_ids: (100,), attention_mask: (100,)}, (8,)), types: ({input_ids: tf.int64, attention_mask: tf.int64}, tf.int64)>

({'input_ids': <tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 101,  517, 1426, 6963, 6602,  518,  756, 8038,  100, 2787, 5966,
       4170, 3855, 8024, 2157, 1072, 4514, 5670,  511,  100, 1546, 1426,
       1069,  711, 4197,  511, 3217, 3952,  722, 4670, 8024, 6205, 3959,
       3313, 5543, 6814,  738,  511, 2346, 6977, 2259, 8024,  750,  680,
       5854, 3198, 4266, 6770, 6983, 1298, 6958, 8024, 2697, 6878, 2768,
       3625,  511, 1352, 3444, 3341, 3198, 8024, 3300,  782,  849,  510,
       3191, 3289, 3425, 3418, 3425, 1383,  511, 3625, 2794, 6768, 5276,
       7607, 5709, 8024, 6042, 4691, 3633, 1936, 5318,  511, 3217, 3933,
       6823,  510, 3722, 3828, 5632, 5344, 8024, 3291, 3924,  749, 1126,
        102])>, 'attention_mask': <tf.Tensor: shape=(100,), dtype=int64, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [35]:
DS_LEN = len(list(dataset))
DS_LEN

6107

In [36]:
SPLIT = .8

# take or skip the specified number of batches to split by factor
test = dataset.skip(round(DS_LEN * SPLIT)).batch(32)
trainevalu = dataset.take(round(DS_LEN * SPLIT))

DS_LEN2 = len(list(trainevalu))

train = trainevalu.take(round(DS_LEN2 * SPLIT)).shuffle(1000).batch(32)
evalu = trainevalu.skip(round(DS_LEN2 * SPLIT)).shuffle(1000).batch(32)

# del dataset

In [37]:
print (f"test data: {len(test)}")
print (f"train data: {len(train)}, train evalu data: {len(evalu)}")

test data: 39
train data: 123, train evalu data: 31


## Build Model

In [44]:
from transformers import BertConfig
from transformers import TFBertForSequenceClassification

In [45]:
bertConfig = BertConfig.from_pretrained('bert-base-chinese'
                                        , output_hidden_states=True
                                        , num_lables=6
                                        , max_length=MAX_TEXT_LENGTH
                                        )

In [46]:
tranformersPreTrainedModelName = 'bert-base-chinese'
bert = TFBertForSequenceClassification.from_pretrained(tranformersPreTrainedModelName, config=bertConfig)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [47]:
bert.summary()


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  102267648 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 102,269,186
Trainable params: 102,269,186
Non-trainable params: 0
_________________________________________________________________


In [48]:
# build 2 input layers to Bert Model where name needs to match the input values in the dataset
input_ids = tf.keras.Input(shape=(MAX_TEXT_LENGTH,), name = 'input_ids', dtype='int32')
mask = tf.keras.Input(shape=(MAX_TEXT_LENGTH,), name = 'attention_mask', dtype='int32')

embedings = bert.layers[0](input_ids, attention_mask=mask)[0]

#Original Author: Ferry Djaja
#https://djajafer.medium.com/multi-class-text-classification-with-keras-and-lstm-4c5525bef592
X = tf.keras.layers.Dropout(0.5)(embedings)
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(768))(X)
y = tf.keras.layers.Dense(8, activation='sigmoid', name='outputs')(X)

model = tf.keras.Model(inputs=[input_ids,mask], outputs=y)

# Freeze the Bert model by freezing this layer to make training less of an overkill in terms of required performance
model.layers[2].trainable = False
#bert.layers[1].trainable = False
#bert.layers[2].trainable = False

In [58]:
model.summary()


Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 100)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 100)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 102267648   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dropout_38 (Dropout)            (None, 100, 768)     0           bert[0][13]           

In [106]:
# hidden_size = output_layer.shape[-1].value
# model.layers[5].get_weights()
bert.get_pooled_output()

AttributeError: 'TFBertForSequenceClassification' object has no attribute 'get_pooled_output'

In [132]:
# loss=tf.keras.losses.BinaryCrossentropy()
# output_layer = model.get_pooled_output()

hidden_size = 768
#output_layer.shape[-1].value

output_weights = tf.compat.v1.get_variable(
    "output_weights", [8, 8],initializer=tf.compat.v1.truncated_normal_initializer(stddev=0.02))

output_bias = tf.compat.v1.get_variable(
    "output_bias", [8], initializer=tf.compat.v1.zeros_initializer())
logits = tf.matmul(model.layers[5].get_weights()[0], output_weights)
logits = tf.nn.bias_add(logits, output_bias)
# logits.shape
new_labels_list = labels[:1536]
labels_truncated = tf.cast(new_labels_list, tf.float32)
per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_truncated,logits=logits)
loss = tf.reduce_mean(per_example_loss)
optimizer = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'])

In [133]:
from timeit import default_timer as timer
from datetime import timedelta

start = timer()

history = model.fit(train
                    , validation_data=evalu
                    , epochs=5)

end = timer()
print(timedelta(seconds=end-start))

Epoch 1/5


ValueError: in user code:

    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:748 train_step
        loss = self.compiled_loss(
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:187 __call__
        self.build(y_pred)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:140 build
        self._losses = nest.map_structure(self._get_loss_object, self._losses)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/util/nest.py:635 map_structure
        structure[0], [func(*x) for x in entries],
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/util/nest.py:635 <listcomp>
        structure[0], [func(*x) for x in entries],
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:263 _get_loss_object
        loss = losses_mod.get(loss)
    /home/lindayang16/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1901 get
        raise ValueError(

    ValueError: Could not interpret loss function identifier: 0.6931503415107727
