In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 4.4 MB 16.9 MB/s 
[K     |████████████████████████████████| 6.6 MB 55.4 MB/s 
[K     |████████████████████████████████| 101 kB 13.4 MB/s 
[K     |████████████████████████████████| 596 kB 57.1 MB/s 
[?25h

In [None]:
!pip install pydot --quiet

In [None]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [None]:
from collections import Counter
import numpy as np
import tensorflow as tf
from tensorflow import keras

import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint 

In [None]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
from transformers import BertTokenizer, TFBertModel

In [None]:
import pandas as pd
import numpy as np
 
# Read the sarcasm corpus from the mounted Drive and discard rows without comment text.
sarcasm_csv_path = '/content/gdrive/My Drive/wzx/train-balanced-sarcasm.csv'
data = pd.read_csv(sarcasm_csv_path).dropna(subset=['comment'])


In [None]:
 # Word count per comment, splitting on single spaces (runs of spaces produce empty tokens that are counted too).
 data['txt_length'] = data['comment'].apply(lambda x: len(x.split(' ')))


In [None]:
data['txt_length'].quantile(0.99)

38.0

In [None]:
def get_train_test(dataset, test_size=0.1, random_state=42):
  """Split a dataset's comments and labels into train and test portions.

  Args:
    dataset: Object exposing ``comment`` (the documents) and ``label``
      (the targets) attributes, e.g. the sarcasm dataframe.
    test_size: Forwarded to ``train_test_split`` as ``test_size``.
    random_state: Seed forwarded to ``train_test_split``. Defaults to 42,
      the value that used to be hard-coded, so existing calls produce the
      same split.

  Returns:
    Whatever ``train_test_split`` returns for the two inputs: train
    documents, test documents, train labels, test labels (in that order).
  """
  documents = dataset.comment
  labels = dataset.label

  return train_test_split(documents, labels, test_size=test_size, random_state=random_state)
  

train_texts, test_texts, train_labels, test_labels = get_train_test(data)

In [None]:
validation_size = int(len(train_texts) * 0.05)

In [None]:
validation_size

45484

In [None]:
# Hold out the last `validation_size` rows of the training split for validation,
# then trim those rows off the training data.
# NOTE: this cell is not idempotent -- `train_texts` / `train_labels` are
# reassigned from themselves, so re-running it shrinks the training split again.
valid_texts = train_texts[-validation_size:]
valid_labels = train_labels[-validation_size:]
train_texts = train_texts[:-validation_size]
train_labels = train_labels[:-validation_size]

In [None]:
# The tokenizer is fed plain Python lists of strings. `list(...)` iterates the
# values directly, so this cell gives the same result as going through
# `.values` but can also be re-run after the names already hold lists.
train_texts = list(train_texts)
valid_texts = list(valid_texts)
test_texts = list(test_texts)



In [None]:
# Convert the label containers to NumPy arrays before they are handed to Keras.
train_labels = np.array(train_labels)
valid_labels = np.array(valid_labels)
test_labels = np.array(test_labels)

In [None]:
len(valid_texts)

45484

In [None]:
len(train_texts)

864211

In [None]:

npvalid_labels = np.asarray(valid_labels)

In [None]:

model_checkpoint = 'bert-base-cased'

In [None]:
bert_tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
bert_model = TFBertModel.from_pretrained(model_checkpoint)

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


We're setting our maximum training record length to 50 tokens (99% of the comments contain at most 38 space-separated words).  BERT models can handle more, and after you've completed the assignment you're welcome to try larger and smaller record sizes.

In [None]:
max_length = 50

In [None]:

# The classifier's Input layers are declared with a fixed width of `max_length`,
# so every split must be padded to exactly that width. `padding=True` only pads
# to the longest sequence in each call, which can come out shorter than
# `max_length` and then no longer matches the model's input shape.
tokenizer_kwargs = dict(truncation=True,
                        padding='max_length',
                        max_length=max_length,
                        return_tensors='tf')

train_encodings = bert_tokenizer(train_texts, **tokenizer_kwargs)
valid_encodings = bert_tokenizer(valid_texts, **tokenizer_kwargs)
test_encodings = bert_tokenizer(test_texts, **tokenizer_kwargs)

In [None]:
def create_sarcasm_model(train_layers=-1,
                         hidden_size=100,
                         dropout=0.3,
                         learning_rate=0.00005):
    """Build and compile a BERT-based binary sarcasm classifier.

    A fresh ``bert-base-cased`` ``TFBertModel`` is loaded on every call. Its
    second output (``bert_out[1]``, used here as the pooled representation)
    feeds a ReLU hidden layer, dropout, and a single sigmoid unit. The model
    is compiled with Adam and binary cross-entropy.

    Relies on the notebook-level ``max_length`` for the input sequence width.

    Args:
        train_layers: Number of top encoder layers to keep trainable. ``-1``
            (the default) leaves every BERT weight as loaded. For any other
            value, every BERT weight whose name does not belong to one of the
            selected top encoder layers is flagged non-trainable.
        hidden_size: Units in the hidden dense layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking, in order, input ids, token type
        ids and attention mask (each of width ``max_length``) and producing
        one sigmoid value per example.
    """
    model_checkpoint = 'bert-base-cased'
    bert_model = TFBertModel.from_pretrained(model_checkpoint)

    if train_layers != -1:
        # Encoder layers show up in weight names as 'layer_._<index>/'. Matching
        # that full segment (rather than a bare '_<index>') avoids accidental
        # hits on auto-numbered model prefixes such as 'tf_bert_model_11/...',
        # which would otherwise leave every weight trainable.
        last_layer = bert_model.config.num_hidden_layers - 1
        retrain_markers = ['layer_._%d/' % (last_layer - offset)
                           for offset in range(train_layers)]

        for w in bert_model.weights:
            if not any(marker in w.name for marker in retrain_markers):
                # Flips the variable's private trainable flag, as the original
                # code did. NOTE(review): presumably this keeps the weight out
                # of gradient updates -- confirm against the installed TF/Keras.
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer_sarcasm')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer_sarcasm')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer_sarcasm')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Second element of the BERT output is used as the sequence-level vector.
    pooled_token = bert_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer_sarcasm')(pooled_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # Single sigmoid unit: pairs with BinaryCrossentropy(from_logits=False) below.
    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer_sarcasm')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                                 metrics=['accuracy'])

    return classification_model

In [None]:
pooled_bert_model = create_sarcasm_model()
pooled_bert_model.summary()

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "model_11"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 attention_mask_layer_sarcasm (  [(None, 50)]        0           []                               
 InputLayer)                                                                                      
                                                                                                  
 input_ids_layer_sarcasm (Input  [(None, 50)]        0           []                               
 Layer)                                                                                           
                                                                                                  
 token_type_ids_layer_sarcasm (  [(None, 50)]        0           []                               
 InputLayer)                                                                               

In [None]:
# Keep only the weights of the epoch with the highest validation accuracy.
checkpoint_path = '/content/gdrive/My Drive/wzx/sarcasm_model/sarcasm_best_weights.h5'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
    save_weights_only=True,
)

# Inputs are passed positionally: input ids, token type ids, attention mask.
train_inputs = [train_encodings.input_ids,
                train_encodings.token_type_ids,
                train_encodings.attention_mask]
valid_inputs = [valid_encodings.input_ids,
                valid_encodings.token_type_ids,
                valid_encodings.attention_mask]

pooled_bert_model_history = pooled_bert_model.fit(
    train_inputs,
    train_labels,
    validation_data=(valid_inputs, npvalid_labels),
    batch_size=128,
    epochs=10,
    callbacks=[model_checkpoint_callback],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Evaluate on the held-out test encodings; `score` holds the loss followed by the compiled accuracy metric.
score = pooled_bert_model.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask], 
                                                  test_labels) 

print('Test loss:', score[0]) 
print('Test accuracy:', np.round(score[1], 5))

In [None]:
# Save the model's current weights to Drive, in a file separate from the best-validation checkpoint.
path = '/content/gdrive/My Drive/wzx/sarcasm_model/sarcasm_weights_all_epoch.h5'


pooled_bert_model.save_weights(path)

# Load Trained Sarcasm Model

In [None]:
def create_sarcasm_model(train_layers=-1,
                         hidden_size=100,
                         dropout=0.3,
                         learning_rate=0.00005):
    """Rebuild the sarcasm classifier architecture so saved weights can be loaded.

    A fresh ``bert-base-cased`` ``TFBertModel`` is loaded on every call. Its
    second output (``bert_out[1]``, used here as the pooled representation)
    feeds a ReLU hidden layer, dropout, and a two-unit output layer. The model
    is compiled with Adam and sparse categorical cross-entropy.

    Relies on the notebook-level ``max_length`` for the input sequence width.

    NOTE(review): this redefinition shadows the earlier ``create_sarcasm_model``
    in this notebook and differs from it (two output units and sparse
    categorical cross-entropy instead of one sigmoid unit and binary
    cross-entropy). Weights saved from the earlier architecture will not fit
    this head -- confirm which weight file this version is meant to load.

    Args:
        train_layers: Number of top encoder layers to keep trainable. ``-1``
            (the default) leaves every BERT weight as loaded. For any other
            value, every BERT weight whose name does not belong to one of the
            selected top encoder layers is flagged non-trainable.
        hidden_size: Units in the hidden dense layer.
        dropout: Dropout rate applied after the hidden layer.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        A compiled ``tf.keras.Model`` taking, in order, input ids, token type
        ids and attention mask (each of width ``max_length``) and producing
        two values per example.
    """
    model_checkpoint = 'bert-base-cased'
    bert_model = TFBertModel.from_pretrained(model_checkpoint)

    if train_layers != -1:
        # Encoder layers show up in weight names as 'layer_._<index>/'. Matching
        # that full segment (rather than a bare '_<index>') avoids accidental
        # hits on auto-numbered model prefixes such as 'tf_bert_model_11/...',
        # which would otherwise leave every weight trainable.
        last_layer = bert_model.config.num_hidden_layers - 1
        retrain_markers = ['layer_._%d/' % (last_layer - offset)
                           for offset in range(train_layers)]

        for w in bert_model.weights:
            if not any(marker in w.name for marker in retrain_markers):
                # Flips the variable's private trainable flag, as the original
                # code did. NOTE(review): presumably this keeps the weight out
                # of gradient updates -- confirm against the installed TF/Keras.
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int64, name='attention_mask_layer')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Second element of the BERT output is used as the sequence-level vector.
    pooled_token = bert_out[1]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(pooled_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # NOTE(review): a two-unit head with sparse categorical cross-entropy is
    # conventionally paired with softmax; sigmoid is kept so that previously
    # saved weights keep producing the same outputs.
    classification = tf.keras.layers.Dense(2, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask],
                                          outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                                 metrics=['accuracy'])

    return classification_model

In [None]:
# Load trained model

# NOTE(review): this path points directly under .../wzx/, whereas the ModelCheckpoint
# callback earlier in this notebook writes sarcasm_best_weights.h5 under
# .../wzx/sarcasm_model/ -- confirm which file is intended.
model_save_name = 'sarcasm_best_weights.h5'
path = F"/content/gdrive/My Drive/wzx/{model_save_name}" 

# Rebuild the architecture, then restore the saved weights into it.
sarcasm_model = create_sarcasm_model()
sarcasm_model.load_weights(path)




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Test loaded model
# `sarcasm_model` is the model rebuilt and restored from disk in the previous cell.
# (`loaded_model` is not assigned anywhere in the visible notebook and raises
# NameError on a fresh kernel.)
score = sarcasm_model.evaluate([test_encodings.input_ids, test_encodings.token_type_ids, test_encodings.attention_mask],
                               test_labels)

print('Test loss:', score[0])
print('Test accuracy:', np.round(score[1], 5))


Test loss: 0.4811343848705292
Test accuracy: 0.77043
