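## Requirements
- tensorflow (latest)
- tensorflow-hub (imported as `tensorflow_hub`)
- tensorflow-text (imported as `tensorflow_text`)
- tf-models-official (provides `official.nlp.optimization`)
- nltk
- scikit-learn (imported as `sklearn`)
- numpy
- pandas
- seaborn
- matplotlib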

In [1]:
import os
import shutil
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

from official.nlp import optimization  # to create AdamW optimizer

import warnings
import matplotlib.pyplot as plt
import seaborn as sns
# Limit TensorFlow's Python logger to ERROR-level messages.
tf.get_logger().setLevel('ERROR')

In [2]:
# Read the GEN corpus. `main_data` keeps the raw frame as loaded; `data` is the
# working frame with the `id` column removed and labels encoded as integers.
main_data = pd.read_csv("GEN-sarc-notsarc.csv")

# Text label -> integer target.
classes = {"notsarc": 0, "sarc": 1}

# `drop` returns a new frame, so the raw frame is left untouched without
# needing an explicit copy followed by an in-place mutation.
data = main_data.drop(columns=['id'])

# `Series.map` turns every label that is missing from `classes` into NaN, and
# the later int32 cast would silently convert that NaN into a meaningless
# integer target. Stop here with a clear message instead.
mapped_labels = data["class"].map(classes)
unknown_labels = data.loc[mapped_labels.isna(), "class"]
if not unknown_labels.empty:
    raise ValueError(
        "Unexpected class labels in GEN-sarc-notsarc.csv: "
        f"{sorted(unknown_labels.astype(str).unique())}"
    )

data["class"] = mapped_labels
data

Unnamed: 0,class,text
0,0,"If that's true, then Freedom of Speech is doom..."
1,0,Neener neener - is it time to go in from the p...
2,0,"Just like the plastic gun fear, the armour pie..."
3,0,So geology is a religion because we weren't he...
4,0,Well done Monty. Mark that up as your first ev...
...,...,...
6515,1,depends on when the baby bird died. run alon...
6516,1,"ok, sheesh, to clarify, women who arent aborti..."
6517,1,so.. eh?? hows this sound? will it fly w...
6518,1,"I think we should put to a vote, the right of ..."


In [3]:
# Split the working frame into model inputs (raw text) and integer targets,
# both materialised as NumPy arrays for the split and tf.data steps below.
X = np.array(data['text'])
y = np.asarray(data['class']).astype('int32')

In [4]:
# Hold out 20% of the samples as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Report the size and label balance of the train and test splits.
print(f'Train data len:{len(X_train)}')
print(f'Class distribution\n{pd.Series(y_train).value_counts()}')
print(f'Test data len:{len(X_test)}')
print(f'Class distribution\n{pd.Series(y_test).value_counts()}')

Train data len:5216
Class distribution
1    2616
0    2600
dtype: int64
Test data len:1304
Class distribution
0    660
1    644
dtype: int64


In [6]:
# Carve a 20% validation set out of the training portion, using the same fixed seed.
# NOTE: this reassigns X_train / y_train from themselves, so executing this cell
# again without first re-running the train/test split shrinks the training set further.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train, y_train,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Report the size and label balance of each of the three splits, in the order
# train -> validation -> test.
for split_name, split_texts, split_targets in (
    ('Train', X_train, y_train),
    ('Valid', X_valid, y_valid),
    ('Test', X_test, y_test),
):
    print(f'{split_name} data len:{len(split_texts)}')
    print(f'Class distribution\n{pd.Series(split_targets).value_counts()}')

Train data len:4172
Class distribution
0    2087
1    2085
dtype: int64
Valid data len:1044
Class distribution
1    531
0    513
dtype: int64
Test data len:1304
Class distribution
0    660
1    644
dtype: int64


In [8]:
# TF Hub handle of the text preprocessing model fed with raw strings.
tfhub_handle_preprocess = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'
# TF Hub handle of the encoder that consumes the preprocessing output.
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1'

In [9]:
# Standalone copy of the preprocessing layer. NOTE(review): no later cell in this
# notebook reads `bert_preprocess_model`; build_classifier_model() creates its own layer.
bert_preprocess_model = hub.KerasLayer(handle=tfhub_handle_preprocess)

In [10]:
# Standalone copy of the encoder layer. NOTE(review): no later cell in this notebook
# reads `bert_model`; build_classifier_model() loads the encoder again for training.
bert_model = hub.KerasLayer(handle=tfhub_handle_encoder)

In [11]:
def build_classifier_model(preprocess_handle=None, encoder_handle=None, dropout_rate=0.1):
    """Build an end-to-end binary text classifier on top of a TF Hub encoder.

    The graph is: scalar string input -> TF Hub preprocessing layer ->
    trainable TF Hub encoder -> dropout on the encoder's ``pooled_output`` ->
    one dense unit. The dense unit has no activation, so the model emits a
    raw logit; apply a sigmoid to obtain a probability.

    Args:
        preprocess_handle: TF Hub handle of the preprocessing model. When
            ``None`` (default) the notebook-level ``tfhub_handle_preprocess``
            is used.
        encoder_handle: TF Hub handle of the encoder. When ``None`` (default)
            the notebook-level ``tfhub_handle_encoder`` is used.
        dropout_rate: Dropout rate applied to the pooled encoder output
            before the classification unit. Defaults to 0.1.

    Returns:
        An uncompiled ``tf.keras.Model`` mapping the string input named
        ``'text'`` to the output of the ``'classifier'`` dense unit.
    """
    # Resolve the defaults at call time rather than in the signature, so the
    # function always sees the current value of the notebook-level handles.
    if preprocess_handle is None:
        preprocess_handle = tfhub_handle_preprocess
    if encoder_handle is None:
        encoder_handle = tfhub_handle_encoder

    text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')

    preprocessing_layer = hub.KerasLayer(preprocess_handle, name='preprocessing')
    encoder_inputs = preprocessing_layer(text_input)

    # trainable=True: the encoder weights are fine-tuned together with the head.
    encoder = hub.KerasLayer(encoder_handle, trainable=True, name='BERT_encoder')
    outputs = encoder(encoder_inputs)

    net = outputs['pooled_output']
    net = tf.keras.layers.Dropout(dropout_rate)(net)
    net = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
    return tf.keras.Model(text_input, net)

In [12]:
# Instantiate the classifier; it is compiled in a later cell.
classifier_model = build_classifier_model()

In [13]:
# The classifier's final dense unit has no activation, i.e. it emits logits,
# so the loss is told to apply the sigmoid itself.
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

# BinaryAccuracy compares the raw model output against `threshold` and applies
# no sigmoid. For logits, probability 0.5 corresponds to logit 0, so the
# default threshold of 0.5 would place the decision boundary at p ~= 0.62 and
# disagree with the sigmoid-then-round predictions used for the reports.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [14]:
# Number of examples per batch, shared by all three splits.
BATCH_SIZE = 32

# Training data is reshuffled every epoch (the default behaviour of `shuffle`)
# so batches differ between epochs. The buffer spans the whole split for a
# uniform shuffle, and the seed makes the order repeatable. Shuffling happens
# before batching so that individual examples, not whole batches, are permuted.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train), seed=42)
    .batch(BATCH_SIZE)
)

# Validation and test data keep their order so predictions stay aligned with labels.
valid_ds = tf.data.Dataset.from_tensor_slices((X_valid, y_valid)).batch(BATCH_SIZE)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(BATCH_SIZE)

In [15]:
# Display the dataset's repr to check its element shapes and dtypes.
train_ds

<BatchDataset shapes: ((None,), (None,)), types: (tf.string, tf.int32)>

In [16]:
# Training length and initial learning rate.
init_lr = 3e-5
epochs = 3

# Number of batches in one pass over the training dataset.
steps_per_epoch = tf.data.experimental.cardinality(train_ds).numpy()
# Total optimizer steps, of which the first 10% are passed as warm-up steps.
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

optimizer = optimization.create_optimizer(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)

In [17]:
# Wire the optimizer, loss and metric defined in the earlier cells into the model.
classifier_model.compile(
    loss=loss,
    metrics=metrics,
    optimizer=optimizer,
)

In [None]:
# Fine-tune on the training split, validating after every epoch; the returned
# History object is kept in `history`.
print(f'Training model with {tfhub_handle_encoder}')
history = classifier_model.fit(
    train_ds,
    epochs=epochs,
    validation_data=valid_ds,
)

Training model with https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1
Epoch 1/3

In [None]:
# Class ids to include in the report; later report cells reuse `labels`.
labels = [0, 1]

# The model outputs logits: apply a sigmoid and round to get hard 0/1 predictions.
y_pred = classifier_model.predict(X_test)
y_pred_classes = tf.round(tf.nn.sigmoid(y_pred))
print(classification_report(y_test, y_pred_classes, labels=labels))

In [None]:
# Store the scalar test loss under its own name: unpacking into `loss` would
# overwrite the BinaryCrossentropy object that the compile cell passes to the
# model, so re-running that cell later in the same kernel would fail.
# `accuracy` keeps its name because the model-saving cell reads it.
test_loss, accuracy = classifier_model.evaluate(test_ds)

print(f'Loss: {test_loss}')
print(f'Accuracy: {accuracy}')

In [None]:
# Export the fine-tuned model (optimizer state included); the directory name
# carries the accuracy measured by the preceding evaluation cell.
model_name = f'BERT_TalkingHeads_Sarcasm_GEN_{accuracy}'
classifier_model.save(model_name, include_optimizer=True)

In [None]:
# Read the HYP corpus, used below as an additional evaluation set.
# `drop` returns a new frame, so no in-place mutation is needed.
test_data = pd.read_csv("HYP-sarc-notsarc.csv").drop(columns=['id'])

# Text label -> integer target.
classes = {"notsarc": 0, "sarc": 1}

# `Series.map` turns every label that is missing from `classes` into NaN, and
# the later int32 cast would silently convert that NaN into a meaningless
# integer target. Stop here with a clear message instead.
hyp_mapped_labels = test_data["class"].map(classes)
hyp_unknown_labels = test_data.loc[hyp_mapped_labels.isna(), "class"]
if not hyp_unknown_labels.empty:
    raise ValueError(
        "Unexpected class labels in HYP-sarc-notsarc.csv: "
        f"{sorted(hyp_unknown_labels.astype(str).unique())}"
    )

test_data["class"] = hyp_mapped_labels
test_data

In [None]:
# Inputs (raw text) and integer targets of the HYP evaluation set as NumPy arrays.
test_X = np.array(test_data['text'])
test_y = np.asarray(test_data['class']).astype('int32')

In [None]:
# Batched (text, label) dataset for evaluating the model on the HYP corpus.
test_tf = (
    tf.data.Dataset
    .from_tensor_slices((test_X, test_y))
    .batch(32)
)

In [None]:
# Logits -> sigmoid -> rounded 0/1 predictions for the HYP corpus.
# `labels` is the [0, 1] list defined in the first classification-report cell.
pred_y = classifier_model.predict(test_X)
pred_y_classes = tf.round(tf.nn.sigmoid(pred_y))
print(classification_report(test_y, pred_y_classes, labels=labels))

In [None]:
# Store the scalar HYP loss under its own name: unpacking into `loss` would
# overwrite the BinaryCrossentropy object that the compile cell passes to the
# model, so re-running that cell later in the same kernel would fail.
hyp_loss, accuracy = classifier_model.evaluate(test_tf)

print(f'Loss: {hyp_loss}')
print(f'Accuracy: {accuracy}')

In [None]:
# Read the RQ corpus, used below as a further evaluation set.
# `drop` returns a new frame, so no in-place mutation is needed.
test_data2 = pd.read_csv("RQ-sarc-notsarc.csv").drop(columns=['id'])

# Text label -> integer target.
classes = {"notsarc": 0, "sarc": 1}

# `Series.map` turns every label that is missing from `classes` into NaN, and
# the later int32 cast would silently convert that NaN into a meaningless
# integer target. Stop here with a clear message instead.
rq_mapped_labels = test_data2["class"].map(classes)
rq_unknown_labels = test_data2.loc[rq_mapped_labels.isna(), "class"]
if not rq_unknown_labels.empty:
    raise ValueError(
        "Unexpected class labels in RQ-sarc-notsarc.csv: "
        f"{sorted(rq_unknown_labels.astype(str).unique())}"
    )

test_data2["class"] = rq_mapped_labels
test_data2

In [None]:
# Inputs (raw text) and integer targets of the RQ evaluation set as NumPy arrays.
test2_X = np.array(test_data2['text'])
test2_y = np.asarray(test_data2['class']).astype('int32')

In [None]:
# Logits -> sigmoid -> rounded 0/1 predictions for the RQ corpus.
# `labels` is the [0, 1] list defined in the first classification-report cell.
pred_y2 = classifier_model.predict(test2_X)
pred_y2_classes = tf.round(tf.nn.sigmoid(pred_y2))
print(classification_report(test2_y, pred_y2_classes, labels=labels))

In [None]:
# Batched (text, label) dataset for evaluating the model on the RQ corpus.
test2_tf = tf.data.Dataset.from_tensor_slices((test2_X, test2_y)).batch(32)

# Store the scalar RQ loss under its own name: unpacking into `loss` would
# overwrite the BinaryCrossentropy object that the compile cell passes to the
# model, so re-running that cell later in the same kernel would fail.
rq_loss, accuracy = classifier_model.evaluate(test2_tf)

print(f'Loss: {rq_loss}')
print(f'Accuracy: {accuracy}')