In [2]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras

from sklearn.utils import shuffle

In [3]:
# Locations of the local train/test CSV splits, relative to the working directory.
train_dataset_fp = './data/local_train_set.csv'
test_dataset_fp = './data/local_test_set.csv'

# Read both files (train first, then test) with pandas' default parsing options.
trainset, testset = (
    pd.read_csv(csv_path) for csv_path in (train_dataset_fp, test_dataset_fp)
)

In [4]:
from utility import BertTokenizer, CleanedTextDict

# Build the project tokenizer from the local `utility` module.
# NOTE(review): `max_len` presumably fixes the token-sequence length at 256, which is
# also the input length the model is built with — confirm in utility.BertTokenizer.
tokenizer = BertTokenizer(max_len=256)
# Disabled: textDic = CleanedTextDict(trainset, testset)

In [5]:
# Tokenize the cleaned review text, then derive the three per-row model inputs from
# the token lists. The tokenizer's bound methods are passed to `apply` directly.
trainset['stokens'] = trainset['CleanedText'].apply(tokenizer.GetStokens)
trainset['input_ids'] = trainset['stokens'].apply(tokenizer.GetInput_ids)
trainset['input_masks'] = trainset['stokens'].apply(tokenizer.GetInput_masks)
trainset['input_segments'] = trainset['stokens'].apply(tokenizer.GetInput_segments)

In [6]:
# Preview the first five rows, including the token columns added above.
trainset.head(5)

Unnamed: 0,Product_ID,User_ID,Time_ID,HelpfulnessNumerator,HelpfulnessDenominator,CleanedText,Score,stokens,input_ids,input_masks,input_segments
0,65149,233944,1973,9,14,look small print bottom back packag product ch...,1,"[[CLS], look, small, print, bottom, back, pack...","[101, 2298, 2235, 6140, 3953, 2067, 5308, 8490...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,26218,38234,633,10,10,updat april 2012 i return 6 pouch purebit chic...,1,"[[CLS], up, ##da, ##t, april, 2012, i, return,...","[101, 2039, 2850, 2102, 2258, 2262, 1045, 2709...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,18722,96355,315,0,1,was wonder gum tast differ textur slimi read i...,1,"[[CLS], was, wonder, gum, ta, ##st, differ, te...","[101, 2001, 4687, 16031, 11937, 3367, 11234, 3...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,61569,225352,828,2,5,well reason i go i need give morn caffein habi...,1,"[[CLS], well, reason, i, go, i, need, give, mo...","[101, 2092, 3114, 1045, 2175, 1045, 2342, 2507...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,6393,41023,170,0,0,i bought product fiber prune juic aid son cons...,1,"[[CLS], i, bought, product, fiber, pr, ##une, ...","[101, 1045, 4149, 4031, 11917, 10975, 9816, 18...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
# Build the three model inputs as int32 tensors, in the order the model expects:
# input ids, input masks, input segments.
#
# Each of these columns stores one Python list per row, so the pandas Series has
# object dtype. Handing such a Series straight to tf.convert_to_tensor depends on the
# TensorFlow version and can fail with "Unsupported object type list". Converting to
# a plain list of lists first gives TensorFlow an ordinary rectangular nested sequence.
# NOTE(review): assumes every row's list has the same length (the tokenizer's
# max_len); ragged rows would still raise here — confirm.
X = [
    tf.convert_to_tensor(list(trainset[column]), dtype=tf.int32)
    for column in ('input_ids', 'input_masks', 'input_segments')
]

In [8]:
# Labels: shift Score down by one so classes start at 0, stored as a float32 tensor.
# NOTE(review): presumably Score ranges over 1..5, giving classes 0..4 to match the
# 5-unit softmax output — confirm the value range in the CSV.
Y = tf.convert_to_tensor(trainset['Score'].sub(1), dtype=tf.float32)

In [9]:
from tensorflow.keras.models import Model 
import tensorflow_hub as hub

# Fixed length of every input sequence fed to the model.
max_len = 256

# Three int32 inputs of shape (max_len,), created in this order so Keras auto-names
# them input_1..input_3: ids, mask, segment (as the variable names indicate).
input_id, input_mask, input_segment = (
    keras.layers.Input(shape=(max_len,), dtype=tf.int32) for _ in range(3)
)

# BERT encoder loaded from the local "./bert_layer" directory; trainable=True means
# its weights are updated during training together with the head below.
bert_layer = hub.KerasLayer("./bert_layer", trainable=True)
pooled_output, sequence_output = bert_layer([input_id, input_mask, input_segment])

# Classification head on the pooled output:
# Dense(64, relu) -> Dropout(0.2) -> Dense(5, softmax).
F1 = keras.layers.Dense(units=64, activation='relu')(pooled_output)
F2 = keras.layers.Dropout(rate=0.2)(F1)
F3 = keras.layers.Dense(units=5, activation='softmax')(F2)

model = Model(inputs=[input_id, input_mask, input_segment], outputs=F3)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_1[0][0]                    
                                                                 input_2[0][0]                

In [10]:
# Configure training: integer class labels against a softmax output, so the loss is
# sparse categorical cross-entropy, with accuracy tracked as the metric.
#
# The BERT layer is loaded with trainable=True, so the whole pretrained encoder is
# fine-tuned. The string 'adam' would use Adam's default learning rate (1e-3), far
# above the 2e-5..5e-5 range recommended for BERT fine-tuning and prone to wrecking
# the pretrained weights. A small learning rate is therefore set explicitly.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=keras.optimizers.Adam(learning_rate=2e-5),
    metrics=['accuracy'],
)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
keras_layer_1 (KerasLayer)      [(None, 768), (None, 109482241   input_1[0][0]                    
                                                                 input_2[0][0]                

In [11]:
# Train for 3 epochs; Keras holds out the last 20% of the samples (taken before
# shuffling) as validation data.
# `shuffle` must be a boolean (or the string 'batch'). Any other string, such as
# 'steps_per_epoch', is merely treated as truthy, so the intent is stated explicitly.
history = model.fit(x=X, y=Y, epochs=3, validation_split=0.2, shuffle=True)

Train on 2000 samples, validate on 500 samples
Epoch 1/3
  64/2000 [..............................] - ETA: 45:28 - loss: 1.5812 - accuracy: 0.1562

KeyboardInterrupt: 