In [2]:
#from kaggle.api.kaggle_api_extended import kaggleApi
#import zipfile
#import os

In [9]:
import pandas as pd
import numpy as np
from transformers import BertTokenizer

In [22]:
import tensorflow as tf

In [32]:
from transformers import TFAutoModel

In [4]:
df = pd.read_csv ("train.tsv", sep='\t')
df.head()

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
0,1,1,A series of escapades demonstrating the adage ...,1
1,2,1,A series of escapades demonstrating the adage ...,2
2,3,1,A series,2
3,4,1,A,2
4,5,1,series,2


In [7]:
seq_len = 512
num_samples = len(df)

Xid = np.zeros((num_samples, seq_len))
Xmask = np.zeros((num_samples, seq_len))

In [8]:
Xid.shape

(156060, 512)

In [13]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
for i, phrase in enumerate(df['Phrase']):
    tokens = tokenizer.encode_plus(phrase, max_length = seq_len, truncation = True,
                                  padding = 'max_length', add_special_tokens= True,
                                  return_tensors='tf')
    Xid[i, :] = tokens['input_ids']
    Xmask[i, :] = tokens ['attention_mask']

In [14]:
Xid

array([[  101.,   138.,  1326., ...,     0.,     0.,     0.],
       [  101.,   138.,  1326., ...,     0.,     0.,     0.],
       [  101.,   138.,  1326., ...,     0.,     0.,     0.],
       ...,
       [  101.,   170., 25247., ...,     0.,     0.,     0.],
       [  101.,   170., 25247., ...,     0.,     0.,     0.],
       [  101., 22572., 12148., ...,     0.,     0.,     0.]])

In [15]:
Xmask

array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]])

In [16]:
arr = df['Sentiment'].values
arr

array([1, 2, 2, ..., 3, 2, 2], dtype=int64)

In [17]:
labels = np.zeros((num_samples, arr.max()+1))
labels.shape

(156060, 5)

In [20]:
labels[np.arange(num_samples), arr] = 1

In [21]:
labels

array([[0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0.]])

In [24]:
dataset=tf.data.Dataset.from_tensor_slices((Xid, Xmask, labels))
dataset.take(1)

<TakeDataset shapes: ((512,), (512,), (5,)), types: (tf.float64, tf.float64, tf.float64)>

In [25]:
def map_func(input_ids, mask, labels):
    return {'input_ids': input_ids, 'attention_mask': mask}, labels

In [26]:
dataset = dataset.map(map_func)

In [27]:
dataset.take(1)

<TakeDataset shapes: ({input_ids: (512,), attention_mask: (512,)}, (5,)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [28]:
batch_size = 16
dataset = dataset.shuffle(10000).batch(batch_size, drop_remainder=True)
dataset.take(1)


<TakeDataset shapes: ({input_ids: (16, 512), attention_mask: (16, 512)}, (16, 5)), types: ({input_ids: tf.float64, attention_mask: tf.float64}, tf.float64)>

In [30]:
split = 0.9

size = int ((num_samples / batch_size))
int((num_samples / batch_size) * split)

8778

In [31]:
train_ds = dataset.take(size)
val_ds = dataset.skip(size)

del dataset

In [34]:
bert = TFAutoModel.from_pretrained('bert-base-uncased')
bert.summary()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "tf_bert_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
Total params: 109,482,240
Trainable params: 109,482,240
Non-trainable params: 0
_________________________________________________________________


In [38]:
input_ids = tf.keras.layers.Input(shape=(seq_len, ), name = 'input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(seq_len,), name = 'attention_mask', dtype = 'int32')

embeddings = bert.bert(input_ids, attention_mask=mask)[1]

x = tf.keras.layers.Dense(1024, activation='relu')(embeddings)
y = tf.keras.layers.Dense(arr.max()+1, activation='softmax', name='outputs')(x)

In [41]:
model = tf.keras.Model(inputs=[input_ids, mask], outputs=y)
#model.layers[2].trainable = False
model.summary()

Model: "functional_5"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 512)]        0                                            
__________________________________________________________________________________________________
attention_mask (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   input_ids[0][0]                  
                                                                 attention_mask[0][0]             
__________________________________________________________________________________________________
dense_3 (Dense)                 (None, 1024)         787456      bert[3][1]            

In [42]:
optimizer = tf.keras.optimizers.Adam(lr=1e-5, decay=1e-6)
loss = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy("accuracy")

In [43]:
model.compile(optimizer=optimizer, loss=loss, metrics=[acc])

In [45]:
history = model.fit(
    train_ds,
    validation_data = val_ds,
    epochs = 3)

Epoch 1/3
  17/9753 [..............................] - ETA: 64:47:35 - loss: 1.5342 - accuracy: 0.4044

KeyboardInterrupt: 